Example #1
import os
import sys
import json
import string
import datetime
import traceback
from addmoduleid import add_module_id
from check_schema_tracking_log import check_schema, schema2dict
from load_course_sql import find_course_sql_dir
from path import path
from edx2course_axis import date_parse
import bqutil
import gsutil

sfn = 'schema_forum.json'

mypath = os.path.dirname(os.path.realpath(__file__))
SCHEMA = json.loads(open('%s/schemas/%s' % (mypath, sfn)).read())['forum']
SCHEMA_DICT = schema2dict(SCHEMA)


def do_rephrase(data, do_schema_check=True, linecnt=0):

    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if 'parent_id' in data:
        data['parent_id'] = data['parent_id']['$oid']

    def fix_date(dstr):
        # best-guess completion of this helper (it is cut off in this excerpt):
        # forum date fields arrive as epoch milliseconds or as parseable date strings
        if dstr:
            try:
                dtime = int(dstr)
                if dtime:
                    return str(datetime.datetime.utcfromtimestamp(dtime / 1000.0))
            except ValueError:
                dt = date_parse(dstr)
                return str(dt) if dt else None
        return None
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute,
                     use_dataset_latest):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched
    '''
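    # Pipeline (as implemented below): (1) compare the video_axis and course_axis
    # table timestamps and force a recompute if the course axis is newer; (2) if
    # needed, rebuild video_axis and fetch video durations via the YouTube API;
    # (3) upload the axis file to GCS and load it into BigQuery; (4) compute the
    # per-day and overall video stats tables.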

    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)

    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    the_schema = json.loads(open(SCHEMA_FILE).read())[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % (
            dataset, TABLE_VIDEO_AXIS)
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % (str(err),
                                                         TABLE_VIDEO_AXIS)
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (
            ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (
            va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(
            course_id,
            use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS)

    # Lastly, create video stats
    createVideoStats_day(course_id,
                         force_recompute=force_recompute,
                         use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id,
                     force_recompute=force_recompute,
                     use_dataset_latest=use_dataset_latest)
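
# Illustrative call (not from the source): a minimal sketch of how make_video_stats might be
# invoked from a driver script.  The course_id, basedir and datedir values are hypothetical;
# API_KEY is assumed to be set in edx2bigquery_config.py, as the assert above describes.
#
#   make_video_stats(course_id="MITx/6.00.2x_3/1T2015",
#                    api_key=edx2bigquery_config.API_KEY,
#                    basedir="data-sql",
#                    datedir="2015-06-01",
#                    force_recompute=False,
#                    use_dataset_latest=True)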
def process_file(course_id,
                 basedir=None,
                 datedir=None,
                 use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath

    the_dict_schema = schema2dict(
        json.loads(open(SCHEMA_FILE).read())['user_info_combo'])

    uic = defaultdict(
        dict
    )  # dict with key = user_id, and val = dict to be written out as JSON line

    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key] == 'NULL':
                continue
            if key == 'course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
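                # e.g. 'course-v1:MITx+6.00.2x_3+1T2015' becomes 'MITx/6.00.2x_3/1T2015'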
                src[key] = src[key].split(':', 1)[1].replace('+', '/')
            dest[prefix + key] = src[key]

    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode == 'r' and not os.path.exists(fn):
            newfn = convert_sql(
                fn)  # try converting from *.sql file, if that exists
            if not newfn:
                return None  # failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)

    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()

    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion of tab separated values to comma separated values.
        Returns False otherwise.
        '''
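        # Resolution order (derived from the checks below):
        #   <fnroot>.csv        -> returned as-is
        #   <fnroot>.csv.gz     -> returned as-is
        #   <fnroot>.sql[.gz]   -> converted via tsv2csv into <fnroot>.csv.gz
        #   otherwise           -> False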
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot +
                                                             ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
        uic[uid]['edxinstructordash_Grade'] = None
        uic[uid]['edxinstructordash_Grade_timestamp'] = None

    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = [
            'name', 'language', 'location', 'meta', 'courseware', 'gender',
            'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
            'allow_certificate', 'country', 'city'
        ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles

    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = [
            'course_id',
            'created',
            'is_active',
            'mode',
        ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from profiles.csv" % nenrollments

    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded,
                                                                      mongodir)

        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = [
            'name', 'language', 'location', 'meta', 'courseware', 'gender',
            'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
            'allow_certificate', 'country', 'city'
        ]
        nadd_profiles = 0

        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata,
                              uic[uid],
                              fields,
                              prefix="profile_",
                              skip_empty=True)
                fix_unicode(uic[uid], [
                    'profile_name', 'profile_mailing_address', 'profile_goals',
                    'profile_location', 'profile_language'
                ])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (
            nadd_profiles, mongodir)

        # if datedir is specified, then do not add entries from mongodb where the enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir
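            # e.g. datedir='2015-06-01' gives cutoff='2015-06-01 00:00:00'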

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = [
            'course_id',
            'created',
            'is_active',
            'mode',
        ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (
                        uic[uid].get('y1_anomalous')
                        == 1):  # remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata,
                                  uic[uid],
                                  fields,
                                  prefix="enrollment_",
                                  skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (
            nadd_enrollment, mongodir)

        print "     from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (
            nadded - n_removed_after_cutoff, nadded, nadd_profiles,
            nadd_enrollment, n_removed_after_cutoff, cutoff)
        sys.stdout.flush()

    # See if instructor grade reports are present for this course; if so, merge in that data
    edxinstructordash = cdir.dirname() / 'from_edxinstructordash'
    if edxinstructordash.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from_edxinstructordash" % edxinstructordash
        sys.stdout.flush()

        grade_report_fn = (edxinstructordash / 'grade_report.csv')
        fp = openfile(grade_report_fn, add_dir=False)
        if fp is None:
            print "--> Skipping grade_report.csv, file does not exist in dir from_edxinstructordash"
        else:
            nadded = 0
            for line in csv.DictReader(fp):
                uid = int(line['Student ID'])
                fields = ['Grade', 'Grade_timestamp']
                #['course_id','Student ID','Email','Username','Grade' ]
                #'Enrollment Track',' Verification Status','Certificate Eligible','Certificate Delivered','Certificate Type' ]
                copy_elements(line, uic[uid], fields, prefix="edxinstructordash_")
                nadded += 1
            fp.close()
            print "  %d grades loaded from %s/grade_report.csv" % (
                nadded, edxinstructordash)
            sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = [
                'download_url',
                'grade',
                'course_id',
                'key',
                'distinction',
                'status',
                'verify_uuid',
                'download_uuid',
                'name',
                'created_date',
                'modified_date',
                'error_reason',
                'mode',
            ]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid

    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()

    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id'
                in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (
        nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")

    # sort by userid
    uidset = uic.keys()
    uidset.sort()

    # write out result, checking schema along the way

    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'),
                          fieldnames=fieldnames)
    ocsv.writeheader()

    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id'
                                                     not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id',
                                 data.get('certificate_course_id', ''))
        if not row_course_id == course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (
                row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise

    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
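
# Illustrative call (not from the source): a minimal sketch, assuming the course SQL dump
# files (users.csv, profiles.csv, enrollment.csv, ...) sit under a hypothetical
# "data-sql/<course_dir>/<datedir>/" directory located by find_course_sql_dir.
#
#   process_file("MITx/6.00.2x_3/1T2015",
#                basedir="data-sql",
#                datedir="2015-06-01",
#                use_dataset_latest=False)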
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath
    
    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['user_info_combo'])
    
    uic = defaultdict(dict)		# dict with key = user_id, and val = dict to be written out as JSON line
    
    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key]=='NULL':
                continue
            if key=='course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':',1)[1].replace('+','/')
            dest[prefix + key] = src[key]
    
    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode=='r' and not os.path.exists(fn):
            newfn = convert_sql(fn)		# try converting from *.sql file, if that exists
            if not newfn:
                return None			# failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)
    
    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()
    
    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion of tab separated values to comma separated values.
        Returns False otherwise.
        '''
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
    
    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = ['name', 'language', 'location', 'meta', 'courseware', 
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 
                  'allow_certificate', 'country', 'city']
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles
    
    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from profiles.csv" % nenrollments
    
    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded, mongodir)
                
        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = ['name', 'language', 'location', 'meta', 'courseware', 
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 
                  'allow_certificate', 'country', 'city']
        nadd_profiles = 0
        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata, uic[uid], fields, prefix="profile_", skip_empty=True)
                fix_unicode(uic[uid], ['profile_name', 'profile_mailing_address', 'profile_goals', 'profile_location', 'profile_language'])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (nadd_profiles, mongodir)
                
        # if datedir is specified, then do not add entries from mongodb where the enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (uic[uid].get('y1_anomalous')==1):	# remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata, uic[uid], fields, prefix="enrollment_", skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (nadd_enrollment, mongodir)

        print "     from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (nadded - n_removed_after_cutoff,
                                                                                                                         nadded, nadd_profiles, nadd_enrollment,
                                                                                                                         n_removed_after_cutoff,
                                                                                                                         cutoff)
        sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = ['download_url', 'grade', 'course_id', 'key', 'distinction', 'status', 
                      'verify_uuid', 'download_uuid', 'name', 'created_date', 'modified_date', 'error_reason', 'mode',]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid
    
    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()
    
    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id' in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")
    
    # sort by userid
    uidset = uic.keys()
    uidset.sort()
    
    # write out result, checking schema along the way
    
    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'), fieldnames=fieldnames)
    ocsv.writeheader()
    
    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id' not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id', data.get('certificate_course_id', ''))
        if not row_course_id==course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise
    
    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched
    '''

    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir)
    
    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % ( mypath, SCHEMA_VIDEO_AXIS )
    the_schema = json.loads(open(SCHEMA_FILE).read())[ SCHEMA_VIDEO_AXIS_NAME ]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS )
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS )
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % ( str(err), TABLE_VIDEO_AXIS )
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS )
        ca_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute )

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS )

    # Lastly, create video stats
    createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
    createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

    # also create person_course_video_watched
    createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery
    
    cid = course_id
    caset = list of course axis data in dict format
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''
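    # What follows (as implemented below): write <datadir>/course_axis.json line by line
    # (schema-checked), upload it and the xbundle XML to the course's GCS folder, load the
    # JSON into the BigQuery "course_axis" table, and append log_msg to that table's description.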

    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json' 
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__')))
    fp = open(cafn, 'w')
    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not type(data)==dict:
            try:
                ca['data'] = json.loads(data)	# make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])	# datetime to string
        if  ca['due'] is not None:
            ca['due'] = str(ca['due'])	# datetime to string
        if (ca['data'] is None) or (ca['data']==''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca)+'\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)	# create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]		# truncate, keeping safely under BigQuery's 16384-character description limit
    
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)
Example #8
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s role data in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_roles.json' % mypath
    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['staff']) 
    
    role_data = defaultdict(dict)		# dict with key = user_id, and val = dict to be written out as JSON line
    
    def convertLongToWide( longData, unique_id, column_header, values, columns ):
        import numpy as np
        print "    Currently %d roles assigned... Dropping duplicates" % len(longData)
        sys.stdout.flush()
    
        ld2 = longData.drop_duplicates(subset=[unique_id, column_header])#, keep='last')
    
        print "    %d unique roles assigned" % len(ld2)
        print "    Processing role data with these known roles: %s" % columns
        sys.stdout.flush()
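        # Reshape long -> wide (illustrative, hypothetical values): rows like
        #   (uid='17__MITx/6.002x/2013_Spring', role='staff', value=1)
        #   (uid='17__MITx/6.002x/2013_Spring', role='instructor', value=1)
        # collapse to one row per uid with one column per role; any role listed in
        # `columns` that never occurs in the data is added below as a NaN column.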
        wideData = ld2.pivot( index=unique_id, columns=column_header, values=values )
        wideData = pd.DataFrame( wideData , index=wideData.index ).reset_index()
        
        # Ensuring all specified columns exist
        cols = wideData.columns.tolist()
        for c in columns:
            if c not in cols:
                wideData[ c ] = pd.Series([np.nan])
        return wideData
    
    def createUniqueId( longData ):
        import numpy as np
        if pd.notnull(longData['user_id']) and pd.notnull(longData['course_id']):
            uid = str(longData['user_id']) + str('__') + str(longData['course_id'])
        else:
            uid = 'NULL'
        return uid
    
    def splitUniqueId( wideData ):
        import numpy as np
        wideData['user_id'] = pd.Series([np.nan])
        wideData['course_id'] = pd.Series([np.nan])
        wideData[['user_id', 'course_id']] = wideData['uid'].apply( lambda x: pd.Series(x.split('__') ) )
        wideData['course_id'] = wideData['course_id'].apply( forceTransparentCourseId )
        return wideData
    
    def forceTransparentCourseId( course_id ):
        
        #if key=='course_id' and src[key].startswith('course-v1:'):
        if 'course-v1' in course_id:
            # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
            #src[key] = src[key].split(':',1)[1].replace('+','/')
            return course_id.split(':',1)[1].replace('+','/')
        else:
            return course_id
    
    def openfile(fn_in, mode='r', add_dir=True, pandas_df=False):
        if pandas_df:
            return pd.read_csv( gzip.GzipFile( cdir / fn_in ), sep=',' )
        else:
            if add_dir:
                fn = cdir / fn_in
            else:
                fn = fn_in
            if fn.endswith('.gz'):
                return gzip.GzipFile(fn, mode)
            return open(fn, mode)

    def createRoleVar( wideData, all_roles ):

        if wideData[ all_roles ].notnull().values.any():
            return str("Staff")
        else:
            return str("Student")

    def cleanRoles( longData, unique_id, column_header, values, columns, allfields ):
        
        longData[ values ] = int(1)
        longData['uid'] = longData.apply(createUniqueId, axis=1 )
        wideData = convertLongToWide( longData, unique_id=unique_id, column_header=column_header, values=values, columns=columns )
        wideData = splitUniqueId( wideData )[ allfields ]    
        return wideData
    
    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key]=='NULL':
                continue
            if key=='course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':',1)[1].replace('+','/')

            # Ensure integer for floats, or null
            if pd.isnull(src[key]):
                copyKey = None
            elif type(src[key]) == float:
                copyKey = int(float(src[key]))
            else:
                copyKey = src[key]

            dest[prefix + key] = copyKey

    #roledata = pd.read_csv( cdir / ROLE_COURSE_ACCESS, sep=',')
    #rolediscdata = pd.read_csv( cdir / ROLE_FORUM_ACCESS, sep=',')
    roledata = openfile( fn_in=ROLE_COURSE_ACCESS, pandas_df=True)
    rolediscdata = openfile( fn_in=ROLE_FORUM_ACCESS, pandas_df=True)
    
    # Process Course Access Roles
    known_roles_course = OrderedDict([
                          ('beta_testers', 'roles_isBetaTester'), 
                          ('instructor', 'roles_isInstructor'), 
                          ('staff', 'roles_isStaff'),
                          ('ccx_coach', 'roles_isCCX'),
                          ('finance_admin', 'roles_isFinance'),
                          ('library_user', 'roles_isLibrary'),
                          ('sales_admin', 'roles_isSales')
                          ])
    base_fields = ['user_id', 'course_id']
    fields = base_fields + known_roles_course.keys()
    print "  Cleaning %s" % ROLE_COURSE_ACCESS
    wide_roledata = cleanRoles( roledata, 'uid', 'role', 'value', known_roles_course.keys(), fields )
    
    # Process Forum Discussion Roles
    known_roles_disc = OrderedDict([
                          ('Administrator','forumRoles_isAdmin'),
                          ('CommunityTA','forumRoles_isCommunityTA'),
                          ('Moderator', 'forumRoles_isModerator'), 
                          ('Student', 'forumRoles_isStudent')
                           ])
    base_fields = ['user_id', 'course_id']
    extra_fields = ['roles'] # To be used to create custom mapping based on existing course and discussion forum roles
    fields = base_fields + known_roles_disc.keys()
    print "  Cleaning %s" % ROLE_FORUM_ACCESS
    wide_rolediscdata = cleanRoles( rolediscdata, 'uid', 'name', 'value', known_roles_disc.keys(), fields )
    
    # Compile
    fields_all = base_fields + known_roles_course.keys() + known_roles_disc.keys() + extra_fields
    rename_dict = dict(known_roles_course, **known_roles_disc)
    wideData = pd.merge( wide_roledata, wide_rolediscdata, how='outer', on=["user_id", "course_id"], suffixes=['', '_disc'] )

    # Create Roles var
    all_roles = known_roles_disc.keys() + known_roles_course.keys()
    all_roles.remove('Student')
    wideData['roles'] = wideData.apply( createRoleVar, args=[all_roles], axis=1 )

    # Rename columns
    wideData = wideData[ fields_all ]
    wideData.rename( columns=rename_dict, inplace=True )

    finalFields = wideData.columns.tolist()

    # Write out
    fieldnames = the_dict_schema.keys()
    ofp = openfile('roles.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('roles.csv', 'w'), fieldnames=finalFields)
    ocsv.writeheader()
    
    wideData_dict = wideData.to_dict(orient='records')
    
    for line in wideData_dict:
        uid = int(line['user_id'])
        copy_elements(line, role_data[uid], finalFields, skip_empty=False )
        role_data[uid]['user_id'] = uid
        data = role_data[uid]   
    
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise
    
    print "Done with make_roles for %s" % course_id
    sys.stdout.flush()
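
# Example of one output line written to roles.json.gz (hypothetical values; the columns are
# fields_all after renaming with rename_dict, and unassigned role columns come out as null):
#   {"user_id": 17, "course_id": "MITx/6.002x/2013_Spring", "roles_isStaff": 1,
#    "forumRoles_isStudent": 1, "roles": "Staff", ...}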