def getYoutubeDurations(dataset, bq_table_input, api_key, outputfilename, schema, force_recompute):
    '''
    Add YouTube durations to the video axis file, looking them up by YouTube ID, then write the result to the specified local path in preparation for Google Storage / BigQuery upload
    '''
    
    fp = openfile(outputfilename, 'w')
    linecnt = 0
    for row_dict in bq_table_input:
        
        linecnt += 1
        verified_row = OrderedDict()
        
        # Initial pass-through of keys in current row:
        # only include keys defined in the schema
        for key in row_dict:
            if key in schema:
                verified_row[key] = row_dict[key]
            
        # Recompute Video Length durations
        if force_recompute:
            verified_row[VIDEO_LENGTH] = findVideoLength( dataset=dataset, youtube_id=verified_row[VIDEO_ID], api_key=api_key )
        
        # Ensure schema type
        check_schema(linecnt, verified_row, the_ds=schema, coerce=True)
        
        try:
            fp.write(json.dumps(verified_row)+'\n')
        except Exception as err:
            print "Failed to write line %s!  Error=%s, data=%s" % (linecnt, str(err), dataset)
    
    fp.close()
def getYoutubeDurations(dataset, bq_table_input, api_key, outputfilename,
                        schema, force_recompute):
    '''
    Add YouTube durations to the video axis file, looking them up by YouTube ID, then write the result to the specified local path in preparation for Google Storage / BigQuery upload
    '''

    fp = openfile(outputfilename, 'w')
    linecnt = 0
    for row_dict in bq_table_input:

        linecnt += 1
        verified_row = OrderedDict()

        # Initial pass-through of keys in current row:
        # only include keys defined in the schema
        for key in row_dict:
            if key in schema:
                verified_row[key] = row_dict[key]

        # Recompute Video Length durations
        if force_recompute:
            verified_row[VIDEO_LENGTH] = findVideoLength(
                dataset=dataset,
                youtube_id=verified_row[VIDEO_ID],
                api_key=api_key)

        # Ensure schema type
        check_schema(linecnt, verified_row, the_ds=schema, coerce=True)

        try:
            fp.write(json.dumps(verified_row) + '\n')
        except Exception as err:
            print "Failed to write line %s!  Error=%s, data=%s" % (
                linecnt, str(err), dataset)

    fp.close()
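
# Illustrative sketch (not from the original module): the core pattern above is
# to keep only schema-defined keys per row and then write newline-delimited
# JSON, which is the format the BigQuery load step expects.  The schema, rows,
# and output filename below are made-up stand-ins.
import json
from collections import OrderedDict

toy_schema = {'video_id': 'STRING', 'video_length': 'INTEGER'}
toy_rows = [{'video_id': 'abc123', 'video_length': 300, 'extra': 'dropped'}]

with open('video_axis_sample.json', 'w') as fp:
    for row in toy_rows:
        verified = OrderedDict((k, v) for k, v in row.items() if k in toy_schema)
        fp.write(json.dumps(verified) + '\n')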
def do_rephrase(data, do_schema_check=True, linecnt=0):

    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if 'parent_id' in data:
        data['parent_id'] = data['parent_id']['$oid']

    def fix_date(dstr):
        if dstr:
            try:
                dtime = int(dstr)
                if dtime:
                    try:
                        dt = datetime.datetime.utcfromtimestamp(dtime / 1000.0)
                    except Exception as err:
                        print "oops, failed to convert in utcfromtimestamp dtime=%s, dstr=%s" % (
                            dtime, dstr)
                        raise
                    return str(dt)
            except Exception as err:
                try:
                    dt = date_parse(dstr[:16])
                    return str(dt)
                except Exception as err:
                    return dstr
        return None

    def do_fix_date(field, rec):
        if field in rec:
            rec[field] = fix_date(rec[field]['$date'])

    do_fix_date('time', data.get('endorsement', {}) or {})

    if 'updated_at' in data:
        data['updated_at'] = fix_date(data['updated_at']['$date'])

    if 'created_at' in data:
        data['created_at'] = fix_date(data['created_at']['$date'])

    if 'last_activity_at' in data:
        data['last_activity_at'] = fix_date(data['last_activity_at']['$date'])

    if 'comment_thread_id' in data:
        data['comment_thread_id'] = data['comment_thread_id']['$oid']

    if ('endorsement' in data) and ((data['endorsement'] == 'null') or
                                    (not data['endorsement']) or
                                    (data['endorsement'] is None)):
        data.pop('endorsement')

    if 'parent_ids' in data:
        data['parent_ids'] = ' '.join([x['$oid'] for x in data['parent_ids']])

    def mkstring(key, rec):
        if key in rec:
            rec[key] = str(rec[key])

    mkstring('historical_abuse_flaggers', data)
    mkstring('abuse_flaggers', data)
    mkstring('at_position_list', data)
    mkstring('tags_array', data)

    mkstring('up', data.get('votes', {}))
    mkstring('down', data.get('votes', {}))

    # check for any funny keys, recursively
    funny_key_sections = []

    def check_for_funny_keys(entry, name='toplevel'):
        for key, val in entry.iteritems():
            if key.startswith('i4x-') or key.startswith('xblock.'):
                sys.stderr.write(
                    "[rephrase] oops, funny key at %s in entry: %s, data=%s\n"
                    % (name, entry, ''))
                funny_key_sections.append(name)
                return True
            if len(key) > 25:
                sys.stderr.write(
                    "[rephrase] suspicious key at %s in entry: %s, data=%s\n" %
                    (name, entry, ''))

            if key[0] in '0123456789':
                sys.stderr.write(
                    "[rephrase] oops, funny key at %s in entry: %s, data=%s\n"
                    % (name, entry, ''))
                funny_key_sections.append(name)
                return True

            if '-' in key or '.' in key:
                # bad key name!  rename it, changing "-" to "_"
                newkey = key.replace('-', '_').replace('.', '__')
                sys.stderr.write(
                    "[rephrase] oops, bad keyname at %s in entry: %s newkey=%s\n"
                    % (name, entry, newkey))
                entry[newkey] = val
                entry.pop(key)
                key = newkey
            if type(val) == dict:
                ret = check_for_funny_keys(val, name + '/' + key)
                if ret is True:
                    sys.stderr.write(
                        "        coercing section %s to become a string\n" %
                        (name + "/" + key))
                    entry[key] = json.dumps(val)
        return False

    check_for_funny_keys(data)

    if 'context' in data:
        data.pop('context')  # 25aug15: remove key

    try:
        check_schema(linecnt, data, the_ds=SCHEMA_DICT, coerce=True)
    except Exception as err:
        sys.stderr.write('[%d] oops, err=%s, failed in check_schema %s\n' %
                         (linecnt, str(err), json.dumps(data, indent=4)))
        sys.stderr.write(traceback.format_exc())
        return
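
# Illustrative sketch (not from the original module): how the fix_date helper
# above treats the two date shapes found in the mongo exports -- an
# epoch-milliseconds integer and an ISO-style string.  The values are made up,
# and date_parse is assumed to be dateutil's parser, as used above.
import datetime
from dateutil.parser import parse as date_parse

print str(datetime.datetime.utcfromtimestamp(1439164800000 / 1000.0))
# -> 2015-08-10 00:00:00
print str(date_parse('2015-08-10T12:34:56.789Z'[:16]))
# -> 2015-08-10 12:34:00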
def process_file(course_id,
                 basedir=None,
                 datedir=None,
                 use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath

    the_dict_schema = schema2dict(
        json.loads(open(SCHEMA_FILE).read())['user_info_combo'])

    uic = defaultdict(
        dict
    )  # dict with key = user_id, and val = dict to be written out as JSON line

    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key] == 'NULL':
                continue
            if key == 'course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':', 1)[1].replace('+', '/')
            dest[prefix + key] = src[key]

    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode == 'r' and not os.path.exists(fn):
            newfn = convert_sql(
                fn)  # try converting from *.sql file, if that exists
            if not newfn:
                return None  # failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)

    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()

    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion of tab separated values to comma separated values.
        Returns False otherwise.
        '''
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot +
                                                             ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
        uic[uid]['edxinstructordash_Grade'] = None
        uic[uid]['edxinstructordash_Grade_timestamp'] = None

    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = [
            'name', 'language', 'location', 'meta', 'courseware', 'gender',
            'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
            'allow_certificate', 'country', 'city'
        ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles

    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = [
            'course_id',
            'created',
            'is_active',
            'mode',
        ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from profiles.csv" % nenrollments

    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded,
                                                                      mongodir)

        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = [
            'name', 'language', 'location', 'meta', 'courseware', 'gender',
            'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
            'allow_certificate', 'country', 'city'
        ]
        nadd_profiles = 0

        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata,
                              uic[uid],
                              fields,
                              prefix="profile_",
                              skip_empty=True)
                fix_unicode(uic[uid], [
                    'profile_name', 'profile_mailing_address', 'profile_goals',
                    'profile_location', 'profile_language'
                ])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (
            nadd_profiles, mongodir)

        # if datedir is specified, then do not add entries from mongodb where the enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = [
            'course_id',
            'created',
            'is_active',
            'mode',
        ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (
                        uic[uid].get('y1_anomalous')
                        == 1):  # remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata,
                                  uic[uid],
                                  fields,
                                  prefix="enrollment_",
                                  skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (
            nadd_enrollment, mongodir)

        print "     from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (
            nadded - n_removed_after_cutoff, nadded, nadd_profiles,
            nadd_enrollment, n_removed_after_cutoff, cutoff)
        sys.stdout.flush()

    # See if instructor grade reports are present for this course; if so, merge in that data
    edxinstructordash = cdir.dirname() / 'from_edxinstructordash'
    if edxinstructordash.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from_edxinstructordash" % edxinstructordash
        sys.stdout.flush()

        grade_report_fn = (edxinstructordash / 'grade_report.csv')
        fp = openfile(grade_report_fn, add_dir=False)
        if fp is None:
            print "--> Skipping grade_report.csv, file does not exist in dir from_edxinstructordash"
        else:
            nadded = 0
            for line in csv.DictReader(fp):
                uid = int(line['Student ID'])
                fields = ['Grade', 'Grade_timestamp']
                #['course_id','Student ID','Email','Username','Grade' ]
                #'Enrollment Track',' Verification Status','Certificate Eligible','Certificate Delivered','Certificate Type' ]
                copy_elements(line, uic[uid], fields, prefix="edxinstructordash_")
                nadded += 1
            fp.close()
            print "  %d grades loaded from %s/grade_report.csv" % (
                nadded, edxinstructordash)
            sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = [
                'download_url',
                'grade',
                'course_id',
                'key',
                'distinction',
                'status',
                'verify_uuid',
                'download_uuid',
                'name',
                'created_date',
                'modified_date',
                'error_reason',
                'mode',
            ]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid

    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()

    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id'
                in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (
        nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")

    # sort by userid
    uidset = uic.keys()
    uidset.sort()

    # write out result, checking schema along the way

    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'),
                          fieldnames=fieldnames)
    ocsv.writeheader()

    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id'
                                                     not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id',
                                 data.get('certificate_course_id', ''))
        if not row_course_id == course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (
                row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise

    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
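
# Illustrative sketch (not from the original module): the "opaque keys" special
# case in copy_elements rewrites a course-v1 style course_id back to the legacy
# org/course/run form.  The example value mirrors the comment in the code above.
cid = 'course-v1:MITx+6.00.2x_3+1T2015'
print cid.split(':', 1)[1].replace('+', '/')
# -> MITx/6.00.2x_3/1T2015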
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath
    
    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['user_info_combo'])
    
    uic = defaultdict(dict)		# dict with key = user_id, and val = dict to be written out as JSON line
    
    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key]=='NULL':
                continue
            if key=='course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':',1)[1].replace('+','/')
            dest[prefix + key] = src[key]
    
    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode=='r' and not os.path.exists(fn):
            newfn = convert_sql(fn)		# try converting from *.sql file, if that exists
            if not newfn:
                return None			# failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)
    
    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()
    
    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion of tab separated values to comma separated values.
        Returns False otherwise.
        '''
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
    
    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = ['name', 'language', 'location', 'meta', 'courseware', 
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 
                  'allow_certificate', 'country', 'city']
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles
    
    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from profiles.csv" % nenrollments
    
    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded, mongodir)
                
        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = ['name', 'language', 'location', 'meta', 'courseware', 
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 
                  'allow_certificate', 'country', 'city']
        nadd_profiles = 0
        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata, uic[uid], fields, prefix="profile_", skip_empty=True)
                fix_unicode(uic[uid], ['profile_name', 'profile_mailing_address', 'profile_goals', 'profile_location', 'profile_language'])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (nadd_profiles, mongodir)
                
        # if datedir is specified, then do not add entries from mongodb where the enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (uic[uid].get('y1_anomalous')==1):	# remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata, uic[uid], fields, prefix="enrollment_", skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (nadd_enrollment, mongodir)

        print "     from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (nadded - n_removed_after_cutoff,
                                                                                                                         nadded, nadd_profiles, nadd_enrollment,
                                                                                                                         n_removed_after_cutoff,
                                                                                                                         cutoff)
        sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = ['download_url', 'grade', 'course_id', 'key', 'distinction', 'status', 
                      'verify_uuid', 'download_uuid', 'name', 'created_date', 'modified_date', 'error_reason', 'mode',]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid
    
    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()
    
    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id' in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")
    
    # sort by userid
    uidset = uic.keys()
    uidset.sort()
    
    # write out result, checking schema along the way
    
    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'), fieldnames=fieldnames)
    ocsv.writeheader()
    
    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id' not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id', data.get('certificate_course_id', ''))
        if not row_course_id==course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise
    
    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
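
# Illustrative sketch (not from the original module): the tsv2csv helper above
# is a line-by-line tab-to-comma rewrite through the csv module.  A standalone
# equivalent (file names here are hypothetical) looks like this:
import csv

with open('users.sql') as fin, open('users.csv', 'wb') as fout:
    writer = csv.writer(fout)
    for line in fin:
        writer.writerow(line.rstrip('\n').split('\t'))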
def do_rephrase(data, do_schema_check=True, linecnt=0):

    # add course_id?
    if 'course_id' not in data:
        cid = data.get('context',{}).get('course_id','')
        if cid:
            data['course_id'] = cid
        
    # add module_id?
    if 'module_id' not in data:
        add_module_id(data)

    # ensure event is dict when possible
    if not 'event_js' in data:
        event = data.get('event', '')
        try:
            if not type(event)==dict:
                event = json.loads(event)
            event_js = True
        except Exception as err:
            event_js = False
            
        data['event'] = event
        data['event_js'] = event_js

    #----------------------------------------
    # "event" is used for too many kinds of data, with colliding types for fields.
    # thus, in general, we can't store it as a fixed schema field.
    #
    # so we turn "event" into a JSON string.
    #
    # for specific kinds of events ("event_type"), we keep a parsed version of
    # "event", e.g. for problem_* event types.
    #
    # store that parsed version as "event_struct"
        
    event = None
    if 'event' in data:
        event = data['event']
        data['event'] = json.dumps(data['event'])

    # now the real rephrasing

    event_type = data.get('event_type', None)

    #----------------------------------------
    # types to keep

    KNOWN_TYPES = ['play_video', 'seq_goto', 'seq_next', 'seq_prev', 
                   'seek_video', 'load_video', 
                   'save_problem_success',
                   'save_problem_fail',
                   'reset_problem_success',
                   'reset_problem_fail',
                   'show_answer',
                   'edx.course.enrollment.activated',
                   'edx.course.enrollment.deactivated',
                   'edx.course.enrollment.mode_changed',
                   'edx.course.enrollment.upgrade.succeeded',
                   'speed_change_video',
                   'problem_check', 
                   'problem_save', 
                   'problem_reset'
                   ]

    if (type(event)==dict and (('problem_' in event_type)
                              or event_type in KNOWN_TYPES)
        and not ('video_embedded' in event_type
                 or 'harvardx.button' in event_type
                 or 'harvardx.' in event_type
                 )):
        data['event_struct'] = event
    elif type(event)==dict:	# default to always including GET and POST when available
        data['event_struct'] = {'GET': json.dumps(event.get('GET')), 'POST': json.dumps(event.get('POST'))}
        data['event_struct']['query'] = event.get('query')
    else:
        if 'event_struct' in data:
            data.pop('event_struct')

    #----------------------------------------
    # special cases

    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if type(event)==dict and 'POST' in event:
        post_str = json.dumps(event['POST'])
        event['POST'] = post_str

    if type(event)==dict and 'GET' in event:
        get_str = json.dumps(event['GET'])
        event['GET'] = get_str

    if event_type in ['problem_check', 'problem_save', 'problem_reset'] and data['event_source']=='browser':
        if type(event) in [str, unicode]:
            event = {'data': json.dumps(event)}

    if type(event) in [str, unicode]:
        #if event and data['event_js']:
        #    sys.stderr.write('unexpected STRING event: ' + json.dumps(data, indent=4) + '\n')
        event = {'data': json.dumps(event)}

    if type(event) in [list]:
        event = {'data': json.dumps(event)}

    def make_str(key):
        if event is not None and 'state' in event:
            state = event['state']
            if key in state:
                state[key] = json.dumps(state[key])

    make_str('input_state')
    make_str('correct_map')
    make_str('student_answers')

    def make_str0(key):
        ev = event or {}
        if key in ev:
            ev[key] = json.dumps(ev[key])

    make_str0('correct_map')
    make_str0('answers')
    make_str0('submission')
    make_str0('old_state')
    make_str0('new_state')
    make_str0('permutation')
    make_str0('options_selected')
    make_str0('corrections')

    def make_str2(key):
        context = data.get('context', {})
        if key in context:
            context[key] = json.dumps(context[key])

    make_str2('course_user_tags')

    def move_unknown_fields_from_context_to_context_agent(keys):	# needed to handle new fields from mobile client
        context = data.get('context', {})
        agent = {'oldagent': context.get('agent', "")}
        for key in keys:
            if '.' in key:
                (prefix, subkey) = key.split('.',1)
                if prefix in context:
                    subcontext = context[prefix]
                    if subkey in subcontext:
                        agent[key] = subcontext[subkey]
                        subcontext.pop(subkey)
            else:
                if key in context:
                    agent[key] = context[key]
                    context.pop(key)
        context['agent'] = json.dumps(agent)

    # 31-Jan-15: handle new "module.usage_key" field in context, e.g.:
    #
    #    "module": {
    #        "display_name": "Radiation Exposure", 
    #        "usage_key": "i4x://MITx/6.00.1x_5/problem/ps03:ps03-Radiation-Exposure"
    #    }, 
    # 28-May-16: context.asides

    mobile_api_context_fields = ['application', 'client', 'received_at', 'component', "open_in_browser_url", 
                                 "module.usage_key",
                                 "module.original_usage_version",
                                 "module.original_usage_key",
                                 "asides",
                             ]
    move_unknown_fields_from_context_to_context_agent(mobile_api_context_fields)

    #----------------------------------------
    # new fields which are not in schema get moved as JSON strings to the pre-existing "mongoid" field, 
    # which is unused except in very old records
    # do this, for example, for the "referer" and "accept_language" fields

    def move_fields_to_mongoid(field_paths):
        '''
        field_path is a list of lists which gives the recursive dict keys to traverse to get to the field to move.
        Move that field, with the path intact, into the mongoid field.
        '''
        mongoid = data.get('mongoid')
        if not type(mongoid)==dict:
            mongoid = {'old_mongoid' : mongoid}

        def move_field_value(ddict, vdict, fp):
            '''recursively traverse dict to get and move value for specified field path'''
            key = fp[0]
            if len(fp)==1:		# base case
                if key in ddict:
                    fval = ddict.get(key)
                    vdict[key] = fval	# copy to new values dict
                    ddict.pop(key)		# remove key from current path within data dict
                    return fval
                return None
            
            if not key in vdict:
                vdict[key] = {}

            return move_field_value(ddict.get(key, {}), vdict[key], fp[1:])
        
        vdict = mongoid
        for field_path in field_paths:
            move_field_value(data, vdict, field_path)
            
        data['mongoid'] = json.dumps(vdict)

    # 16-Mar-15: remove event_struct.requested_skip_interval

    move_fields_to_mongoid([ ['referer'],
                             ['accept_language'],
                             ['event_struct', 'requested_skip_interval'],
                             ['event_struct', 'submitted_answer'],
                             ['event_struct', 'num_attempts'],
                             ['event_struct', 'task_id'],	# 05oct15
                             ['event_struct', 'content'],	# 11jan16
                             ['nonInteraction'], 	# 24aug15
                             ['label'],	 		# 24aug15
                             ['event_struct', 'widget_placement'],	# 08may16
                             ['event_struct', 'tab_count'],	# 08may16
                             ['event_struct', 'current_tab'],	# 08may16
                             ['event_struct', 'target_tab'],	# 08may16
                         ])

    #----------------------------------------
    # general checks

    def fix_dash(key):
        ev = event or {}
        if key in ev:
            newkey = key.replace('-', '_')
            ev[newkey] = ev[key]
            ev.pop(key)

    fix_dash('child-id')

    def check_empty(data, *keys):
        # print "--> keys=%s, data=%s" % (str(keys), data)
        key = keys[0]
        if type(data)==dict and key in data:
            if len(keys)==1:
                if data[key] in ["", u'']:
                    # print "---> popped %s" % key
                    data.pop(key)
            else:
                check_empty(data[key], *keys[1:])

    check_empty(data, 'context', "user_id")

    data.pop('event_js')	# leftover from mongo import script

    #-----------------------------------------
    # check for null values in speed_change_video
    # Error encountered parsing LAC data from Oct. 2013
    # Requires that we also be able to convert the value to a float

    def string_is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    if data['event_type']=='speed_change_video':
        if 'event_struct' in data and 'new_speed' in data['event_struct']:
            # First check if string is float
            if string_is_float(data['event_struct']['new_speed']):
                # Second check if value is null
                if isnan(float(data['event_struct']['new_speed'])):
                    data['event_struct'].pop('new_speed')


    # check for any funny keys, recursively
    funny_key_sections = []
    def check_for_funny_keys(entry, name='toplevel'):
        for key, val in entry.iteritems():
            if key.startswith('i4x-') or key.startswith('xblock.'):
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s, data=%s\n" % (name, entry, ''))
                funny_key_sections.append(name)
                return True
            if len(key)>25:
                sys.stderr.write("[rephrase] suspicious key at %s in entry: %s, data=%s\n" % (name, entry, ''))

            if key[0] in '0123456789':
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s, data=%s\n" % (name, entry, ''))
                funny_key_sections.append(name)
                return True
                
            if '-' in key or '.' in key:
                # bad key name!  rename it, changing "-" to "_"
                newkey = key.replace('-','_').replace('.','__')
                sys.stderr.write("[rephrase] oops, bad keyname at %s in entry: %s newkey=%s\n" % (name, entry, newkey))
                entry[newkey] = val
                entry.pop(key)
                key = newkey
            if type(val)==dict:
                ret = check_for_funny_keys(val, name + '/' + key)
                if ret is True:
                    sys.stderr.write("        coercing section %s to become a string\n" % (name+"/"+key) )
                    entry[key] = json.dumps(val)
        return False

    check_for_funny_keys(data)

    try:
        check_schema(linecnt, data, coerce=True)
    except Exception as err:
        sys.stderr.write('[%d] oops, err=%s, failed in check_schema %s\n' % (linecnt, str(err), json.dumps(data, indent=4)))
        sys.stderr.write(traceback.format_exc())
        return
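
# Illustrative sketch (not from the original module): the central rephrasing
# move above is to serialize the raw "event" field to a JSON string (so it fits
# one fixed BigQuery column) while keeping a parsed copy under "event_struct"
# for the known event types.  The record below is a made-up minimal
# tracking-log line.
import json

rec = {'event_type': 'play_video', 'event': {'id': 'vid1', 'currentTime': 0}}
parsed_event = rec['event']
rec['event'] = json.dumps(parsed_event)   # stringified for the fixed schema
rec['event_struct'] = parsed_event        # parsed copy kept for known types
print type(rec['event']), type(rec['event_struct'])
# -> <type 'str'> <type 'dict'>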
def do_rephrase(data, do_schema_check=True, linecnt=0):

    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if 'parent_id' in data:
        data['parent_id'] = data['parent_id']['$oid']

    def fix_date(dstr):
        if dstr:
            try:
                dtime = int(dstr)
                if dtime:
                    try:
                        dt = datetime.datetime.utcfromtimestamp(dtime/1000.0)
                    except Exception as err:
                        print "oops, failed to convert in utcfromtimestamp dtime=%s, dstr=%s" % (dtime, dstr)
                        raise
                    return str(dt)
            except Exception as err:
                try:
                    dt = date_parse(dstr[:16])
                    return str(dt)
                except Exception as err:
                    return dstr
        return None

    def do_fix_date(field, rec):
        if field in rec:
            rec[field] = fix_date(rec[field]['$date'])

    do_fix_date('time', data.get('endorsement',{}) or {})

    if 'updated_at' in data:
        data['updated_at'] = fix_date(data['updated_at']['$date'])

    if 'created_at' in data:
        data['created_at'] = fix_date(data['created_at']['$date'])

    if 'last_activity_at' in data:
        data['last_activity_at'] = fix_date(data['last_activity_at']['$date'])

    if 'comment_thread_id' in data:
        data['comment_thread_id'] = data['comment_thread_id']['$oid']

    if ('endorsement' in data) and ((data['endorsement']=='null') or (not data['endorsement']) or (data['endorsement'] is None)):
        data.pop('endorsement')

    if 'parent_ids' in data:
        data['parent_ids'] = ' '.join([x['$oid'] for x in data['parent_ids']])

    def mkstring(key, rec):
        if key in rec:
            rec[key] = str(rec[key])

    mkstring('historical_abuse_flaggers', data)
    mkstring('abuse_flaggers', data)
    mkstring('at_position_list', data)
    mkstring('tags_array', data)

    mkstring('up', data.get('votes', {}))
    mkstring('down', data.get('votes', {}))

    # check for any funny keys, recursively
    funny_key_sections = []
    def check_for_funny_keys(entry, name='toplevel'):
        for key, val in entry.iteritems():
            if key.startswith('i4x-') or key.startswith('xblock.'):
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s, data=%s\n" % (name, entry, ''))
                funny_key_sections.append(name)
                return True
            if len(key)>25:
                sys.stderr.write("[rephrase] suspicious key at %s in entry: %s, data=%s\n" % (name, entry, ''))

            if key[0] in '0123456789':
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s, data=%s\n" % (name, entry, ''))
                funny_key_sections.append(name)
                return True
                
            if '-' in key or '.' in key:
                # bad key name!  rename it, changing "-" to "_"
                newkey = key.replace('-','_').replace('.','__')
                sys.stderr.write("[rephrase] oops, bad keyname at %s in entry: %s newkey=%s\n" % (name, entry, newkey))
                entry[newkey] = val
                entry.pop(key)
                key = newkey
            if type(val)==dict:
                ret = check_for_funny_keys(val, name + '/' + key)
                if ret is True:
                    sys.stderr.write("        coercing section %s to become a string\n" % (name+"/"+key) )
                    entry[key] = json.dumps(val)
        return False

    check_for_funny_keys(data)

    if 'context' in data:
        data.pop('context')	# 25aug15: remove key

    try:
        check_schema(linecnt, data, the_ds=SCHEMA_DICT, coerce=True)
    except Exception as err:
        sys.stderr.write('[%d] oops, err=%s, failed in check_schema %s\n' % (linecnt, str(err), json.dumps(data, indent=4)))
        sys.stderr.write(traceback.format_exc())
        return
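
# Illustrative sketch (not from the original module): what the key-renaming
# branch of check_for_funny_keys does.  BigQuery column names cannot contain
# "-" or ".", so such keys are rewritten in place; the entry below is made up.
entry = {'child-id': 'i4x://org/course/problem/p1', 'input_1.2_1': 'x'}
for key in list(entry):
    if '-' in key or '.' in key:
        entry[key.replace('-', '_').replace('.', '__')] = entry.pop(key)
print sorted(entry)
# -> ['child_id', 'input_1__2_1']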
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery
    
    cid = course_id
    caset = list of course axis data in dict format
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''

    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json' 
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__')))
    fp = open(cafn, 'w')
    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not type(data)==dict:
            try:
                ca['data'] = json.loads(data)	# make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])	# datetime to string
        if  ca['due'] is not None:
            ca['due'] = str(ca['due'])	# datetime to string
        if (ca['data'] is None) or (ca['data']==''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca)+'\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)	# create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]		# truncate, to stay under the 16384-character description limit
    
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)
def do_rephrase(data, do_schema_check=True, linecnt=0):
    """
    Modify the provided data dictionary in place to rephrase
    certain pieces of data for easy loading to BigQuery

    :TODO: Move the inner functions outside this function.

    :type: dict
    :param data: A tracking log record from the edX nightly data files
    :type: bool
    :param do_schema_check: Whether or not the provided record should be checked
    against the target schema
    :type: int
    :param linecnt: Some line count value
    :rtype: None
    :return: Nothing is returned since the data parameter is modified in place
    """
    # add course_id?
    if 'course_id' not in data:
        cid = data.get('context', {}).get('course_id', '')
        if cid:
            data['course_id'] = cid
    # add module_id?
    if 'module_id' not in data:
        add_module_id(data)

    if not "event" in data:
        data['event'] = ""

    # ensure event is dict when possible
    if not 'event_js' in data:
        event = data.get('event')
        try:
            if not isinstance(event, dict):
                event = json.loads(event)
            event_js = True
        except Exception as err:
            # note - do not erase event even if it can't be loaded as JSON: see how it becomes JSONified below
            event_js = False
        data['event'] = event
        data['event_js'] = event_js

    #----------------------------------------
    # "event" is used for too many kinds of data, with colliding types for fields.
    # thus, in general, we can't store it as a fixed schema field.
    #
    # so we turn "event" into a JSON string.
    #
    # for specific kinds of events ("event_type"), we keep a parsed version of
    # "event", e.g. for problem_* event types.
    #
    # store that parsed version as "event_struct"
    event = None
    if 'event' in data:
        event = data['event']
        data['event'] = json.dumps(data['event'])

    # now the real rephrasing

    event_type = data.get('event_type', '')

    #----------------------------------------
    # types to keep

    KNOWN_TYPES = [
        'play_video', 'seq_goto', 'seq_next', 'seq_prev', 'seek_video',
        'load_video', 'save_problem_success', 'save_problem_fail',
        'reset_problem_success', 'reset_problem_fail', 'show_answer',
        'edx.course.enrollment.activated', 'edx.course.enrollment.deactivated',
        'edx.course.enrollment.mode_changed',
        'edx.course.enrollment.upgrade.succeeded', 'speed_change_video',
        'problem_check', 'problem_save', 'problem_reset'
    ]
    if isinstance(event, dict):
        outs = ('video_embedded', 'harvardx.button', 'harvardx.')
        out_conds = not any(k in event_type for k in outs)
        in_conds = 'problem_' in event_type or event_type in KNOWN_TYPES
        if in_conds and out_conds:
            data['event_struct'] = event
        else:
            data['event_struct'] = {
                'GET': json.dumps(event.get('GET')),
                'POST': json.dumps(event.get('POST')),
                'query': event.get('query'),
            }
    else:
        if 'event_struct' in data:
            data.pop('event_struct')

    #----------------------------------------
    # special cases

    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if isinstance(event, dict) and 'POST' in event:
        post_str = json.dumps(event['POST'])
        event['POST'] = post_str

    if isinstance(event, dict) and 'GET' in event:
        get_str = json.dumps(event['GET'])
        event['GET'] = get_str

    if event_type in ['problem_check', 'problem_save', 'problem_reset'
                      ] and data['event_source'] == 'browser':
        if isinstance(event, (str, unicode)):
            event = {'data': json.dumps(event)}

    if isinstance(event, (str, unicode)):
        #if event and data['event_js']:
        #    sys.stderr.write('unexpected STRING event: ' + json.dumps(data, indent=4) + '\n')
        event = {'data': json.dumps(event)}

    if isinstance(event, (list, )):
        event = {'data': json.dumps(event)}

    def make_str(key):
        if event is not None and 'state' in event:
            state = event['state']
            if key in state:
                state[key] = json.dumps(state[key])

    make_str('input_state')
    make_str('correct_map')
    make_str('student_answers')

    def make_str0(key):
        ev = event or {}
        if key in ev:
            ev[key] = json.dumps(ev[key])

    make_str0('correct_map')
    make_str0('answers')
    make_str0('submission')
    make_str0('old_state')
    make_str0('new_state')
    make_str0('permutation')
    make_str0('options_selected')
    make_str0('corrections')

    def make_str2(key):
        context = data.get('context', {})
        if key in context:
            context[key] = json.dumps(context[key])

    make_str2('course_user_tags')

    def move_unknown_fields_from_context_to_context_agent(
            keys):  # needed to handle new fields from mobile client
        context = data.get('context', {})
        agent = {'oldagent': context.get('agent', "")}
        for key in keys:
            if '.' in key:
                (prefix, subkey) = key.split('.', 1)
                if prefix in context:
                    subcontext = context[prefix]
                    if subkey in subcontext:
                        agent[key] = subcontext[subkey]
                        subcontext.pop(subkey)
            else:
                if key in context:
                    agent[key] = context[key]
                    context.pop(key)
        context['agent'] = json.dumps(agent)

    # 31-Jan-15: handle new "module.usage_key" field in context, e.g.:
    #
    #    "module": {
    #        "display_name": "Radiation Exposure",
    #        "usage_key": "i4x://MITx/6.00.1x_5/problem/ps03:ps03-Radiation-Exposure"
    #    },
    # 28-May-16: context.asides

    mobile_api_context_fields = [
        'application',
        'client',
        'received_at',
        'component',
        "open_in_browser_url",
        "module.usage_key",
        "module.original_usage_version",
        "module.original_usage_key",
        "asides",
    ]
    move_unknown_fields_from_context_to_context_agent(
        mobile_api_context_fields)

    #----------------------------------------
    # new fields which are not in schema get moved as JSON strings to the pre-existing "mongoid" field,
    # which is unused except in very old records
    # do this, for example, for the "referer" and "accept_language" fields

    def move_fields_to_mongoid(field_paths):
        '''
        field_path is a list of lists which gives the recursive dict keys to traverse to get to the field to move.
        Move that field, with the path intact, into the mongoid field.
        '''
        mongoid = data.get('mongoid')
        if not isinstance(mongoid, dict):
            mongoid = {'old_mongoid': mongoid}

        def move_field_value(ddict, vdict, fp):
            '''recursively traverse dict to get and move value for specified field path'''
            key = fp[0]
            if len(fp) == 1:  # base case
                if key in ddict:
                    fval = ddict.get(key)
                    vdict[key] = fval  # copy to new values dict
                    ddict.pop(
                        key)  # remove key from current path within data dict
                    return fval
                return None
            if key not in vdict:
                vdict[key] = {}
            return move_field_value(ddict.get(key, {}), vdict[key], fp[1:])

        vdict = mongoid
        for field_path in field_paths:
            move_field_value(data, vdict, field_path)
        data['mongoid'] = json.dumps(vdict)

    # 16-Mar-15: remove event_struct.requested_skip_interval

    move_fields_to_mongoid([
        ['referer'],
        ['accept_language'],
        ['event_struct', 'requested_skip_interval'],
        ['event_struct', 'submitted_answer'],
        ['event_struct', 'num_attempts'],
        ['event_struct', 'task_id'],  # 05oct15
        ['event_struct', 'content'],  # 11jan16
        ['nonInteraction'],  # 24aug15
        ['label'],  # 24aug15
        ['event_struct', 'widget_placement'],  # 08may16
        ['event_struct', 'tab_count'],  # 08may16
        ['event_struct', 'current_tab'],  # 08may16
        ['event_struct', 'target_tab'],  # 08may16
        ['event_struct', 'state', 'has_saved_answers'],  # 06dec2016
        ['context', 'label'],  # 24aug15
        ['roles'],  # 06sep2017 rp
        ['environment'],  # 06sep2017 rp
        ['minion_id'],  # 06sep2017 rp
        ['event_struct', 'duration'],  # 22nov2017 ic
        ['event_struct', 'play_medium']
    ])

    #----------------------------------------
    # general checks

    def fix_dash(key):
        ev = event or {}
        if key in ev:
            newkey = key.replace('-', '_')
            ev[newkey] = ev[key]
            ev.pop(key)

    fix_dash('child-id')

    def check_empty(data, *keys):
        # print "--> keys=%s, data=%s" % (str(keys), data)
        key = keys[0]
        if isinstance(data, dict) and key in data:
            if len(keys) == 1:
                if data[key] in ["", u'']:
                    # print "---> popped %s" % key
                    data.pop(key)
            else:
                check_empty(data[key], *keys[1:])

    check_empty(data, 'context', "user_id")

    data.pop('event_js')  # leftover from mongo import script

    #-----------------------------------------
    # check for null values in speed_change_video
    # Error encountered parsing LAC data from Oct. 2013
    # Requires that we also be able to convert the value to a float

    def string_is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False
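    # Worked example of the case being guarded against: a hypothetical record with
    # event_struct['new_speed'] == 'NaN' passes string_is_float (float('NaN') parses),
    # isnan() is then True, and the field is dropped instead of tripping up the
    # schema/type coercion below.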

    if data.get('event_type') == 'speed_change_video':
        if 'event_struct' in data and 'new_speed' in data['event_struct']:
            # First make sure the string actually parses as a float
            if string_is_float(data['event_struct']['new_speed']):
                # Then check whether the value is NaN (how the bad "null" values appear here)
                if isnan(float(data['event_struct']['new_speed'])):
                    data['event_struct'].pop('new_speed')

    # check for any funny keys, recursively
    funny_key_sections = []

    def check_for_funny_keys(entry, name='toplevel'):
        # iterate over a snapshot of the items, since keys may be renamed or removed below
        for key, val in entry.items():
            if key.startswith('i4x-') or key.startswith('xblock.'):
                sys.stderr.write(
                    "[rephrase] oops, funny key at %s in entry: %s, data=%s\n"
                    % (name, entry, ''))
                funny_key_sections.append(name)
                return True
            if len(key) > 25:
                sys.stderr.write(
                    "[rephrase] suspicious key at %s in entry: %s, data=%s\n" %
                    (name, entry, ''))

            if key[0].isdigit():
                sys.stderr.write(
                    "[rephrase] oops, funny key at %s in entry: %s, data=%s\n"
                    % (name, entry, ''))
                funny_key_sections.append(name)
                return True

            if '-' in key or '.' in key:
                # bad key name!  rename it, changing "-" to "_" and "." to "__"
                newkey = key.replace('-', '_').replace('.', '__')
                sys.stderr.write(
                    "[rephrase] oops, bad keyname at %s in entry: %s newkey=%s\n"
                    % (name, entry, newkey))
                entry[newkey] = val
                entry.pop(key)
                key = newkey
            if isinstance(val, dict):
                ret = check_for_funny_keys(val, name + '/' + key)
                if ret is True:
                    sys.stderr.write(
                        "        coercing section %s to become a string\n" %
                        (name + "/" + key))
                    entry[key] = json.dumps(val)
        return False
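    # Illustrative behaviour (hypothetical keys): a key like 'correct-map.v1' is renamed to
    # 'correct_map__v1'; a key starting with 'i4x-', 'xblock.' or a digit flags the dict
    # containing it, and the recursive caller coerces that whole sub-dict to a JSON string.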

    check_for_funny_keys(data)

    try:
        check_schema(linecnt, data, coerce=True)
    except Exception as err:
        sys.stderr.write('[%d] oops, err=%s, failed in check_schema %s\n' %
                         (linecnt, str(err), json.dumps(data, indent=4)))
        sys.stderr.write(traceback.format_exc())
        return
def do_rephrase(data, do_schema_check=True, linecnt=0):

    # add course_id?
    if "course_id" not in data:
        cid = data.get("context", {}).get("course_id", "")
        if cid:
            data["course_id"] = cid

    # add module_id?
    if "module_id" not in data:
        add_module_id(data)

    # ensure event is dict when possible
    if not "event_js" in data:
        try:
            event = data["event"]
            if not isinstance(event, dict):
                event = json.loads(event)
            event_js = True
        except Exception as err:
            event_js = False

        data["event"] = event
        data["event_js"] = event_js

    # ----------------------------------------
    # "event" is used for too many kinds of data, with colliding types for fields.
    # thus, in general, we can't store it as a fixed schema field.
    #
    # so we turn "event" into a JSON string.
    #
    # for specific kinds of events ("event_type"), we keep a parsed version of
    # "event", e.g. for problem_* event types.
    #
    # store that parsed version as "event_struct"
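    # Sketch of the effect (hypothetical problem_check record): an incoming
    #   {"event_type": "problem_check", "event": {"success": "correct"}, ...}
    # leaves this function with data["event"] as the JSON string '{"success": "correct"}'
    # while data["event_struct"] keeps the parsed dict, since problem_check is a known type.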

    event = None
    if "event" in data:
        event = data["event"]
        data["event"] = json.dumps(data["event"])

    # now the real rephrasing

    event_type = data.get("event_type", None)

    # ----------------------------------------
    # types to keep

    KNOWN_TYPES = [
        "play_video",
        "seq_goto",
        "seq_next",
        "seq_prev",
        "seek_video",
        "load_video",
        "save_problem_success",
        "save_problem_fail",
        "reset_problem_success",
        "reset_problem_fail",
        "show_answer",
        "edx.course.enrollment.activated",
        "edx.course.enrollment.deactivated",
        "edx.course.enrollment.mode_changed",
        "edx.course.enrollment.upgrade.succeeded",
        "speed_change_video",
        "problem_check",
        "problem_save",
        "problem_reset",
    ]

    if (
        type(event) == dict
        and (("problem_" in event_type) or event_type in KNOWN_TYPES)
        and not ("video_embedded" in event_type or "harvardx.button" in event_type or "harvardx." in event_type)
    ):
        data["event_struct"] = event
    elif type(event) == dict:  # default to always including GET and POST when available
        data["event_struct"] = {"GET": json.dumps(event.get("GET")), "POST": json.dumps(event.get("POST"))}
        data["event_struct"]["query"] = event.get("query")
    else:
        if "event_struct" in data:
            data.pop("event_struct")

    # ----------------------------------------
    # special cases

    if "_id" in data:
        data["mongoid"] = data["_id"]["$oid"]
        data.pop("_id")

    if type(event) == dict and "POST" in event:
        post_str = json.dumps(event["POST"])
        event["POST"] = post_str

    if type(event) == dict and "GET" in event:
        get_str = json.dumps(event["GET"])
        event["GET"] = get_str

    if event_type in ["problem_check", "problem_save", "problem_reset"] and data["event_source"] == "browser":
        if isinstance(event, (str, unicode)):
            event = {"data": json.dumps(event)}

    if isinstance(event, (str, unicode)):
        # if event and data['event_js']:
        #    sys.stderr.write('unexpected STRING event: ' + json.dumps(data, indent=4) + '\n')
        event = {"data": json.dumps(event)}

    if isinstance(event, list):
        event = {"data": json.dumps(event)}

    def make_str(key):
        if event is not None and "state" in event:
            state = event["state"]
            if key in state:
                state[key] = json.dumps(state[key])

    make_str("input_state")
    make_str("correct_map")
    make_str("student_answers")

    def make_str0(key):
        ev = event or {}
        if key in ev:
            ev[key] = json.dumps(ev[key])

    make_str0("correct_map")
    make_str0("answers")
    make_str0("submission")
    make_str0("old_state")
    make_str0("new_state")
    make_str0("permutation")
    make_str0("options_selected")
    make_str0("corrections")

    def make_str2(key):
        context = data.get("context", {})
        if key in context:
            context[key] = json.dumps(context[key])

    make_str2("course_user_tags")

    def move_unknown_fields_from_context_to_context_agent(keys):  # needed to handle new fields from mobile client
        context = data.get("context", {})
        agent = {"oldagent": context.get("agent", "")}
        for key in keys:
            if "." in key:
                (prefix, subkey) = key.split(".", 1)
                if prefix in context:
                    subcontext = context[prefix]
                    if subkey in subcontext:
                        agent[key] = subcontext[subkey]
                        subcontext.pop(subkey)
            else:
                if key in context:
                    agent[key] = context[key]
                    context.pop(key)
        context["agent"] = json.dumps(agent)

    # 31-Jan-15: handle new "module.usage_key" field in context, e.g.:
    #
    #    "module": {
    #        "display_name": "Radiation Exposure",
    #        "usage_key": "i4x://MITx/6.00.1x_5/problem/ps03:ps03-Radiation-Exposure"
    #    },

    mobile_api_context_fields = [
        "application",
        "client",
        "received_at",
        "component",
        "open_in_browser_url",
        "module.usage_key",
        "module.original_usage_version",
        "module.original_usage_key",
    ]
    move_unknown_fields_from_context_to_context_agent(mobile_api_context_fields)

    # ----------------------------------------
    # new fields which are not in schema get moved as JSON strings to the pre-existing "mongoid" field,
    # which is unused except in very old records
    # do this, for example, for the "referer" and "accept_language" fields

    def move_fields_to_mongoid(field_paths):
        """
        field_path is a list of lists which gives the recursive dict keys to traverse to get to the field to move.
        Move that field, with the path intact, into the mongoid field.
        """
        mongoid = data.get("mongoid")
        if not isinstance(mongoid, dict):
            mongoid = {"old_mongoid": mongoid}

        def move_field_value(ddict, vdict, fp):
            """recursively traverse dict to get and move value for specified field path"""
            key = fp[0]
            if len(fp) == 1:  # base case
                if key in ddict:
                    fval = ddict.get(key)
                    vdict[key] = fval  # copy to new values dict
                    ddict.pop(key)  # remove key from current path within data dict
                    return fval
                return None

            if key not in vdict:
                vdict[key] = {}

            return move_field_value(ddict.get(key, {}), vdict[key], fp[1:])

        vdict = mongoid
        for field_path in field_paths:
            move_field_value(data, vdict, field_path)

        data["mongoid"] = json.dumps(vdict)

    # 16-Mar-15: remove event_struct.requested_skip_interval

    move_fields_to_mongoid(
        [
            ["referer"],
            ["accept_language"],
            ["event_struct", "requested_skip_interval"],
            ["event_struct", "submitted_answer"],
            ["event_struct", "num_attempts"],
            ["event_struct", "task_id"],  # 05oct15
            ["nonInteraction"],  # 24aug15
            ["label"],  # 24aug15
        ]
    )

    # ----------------------------------------
    # general checks

    def fix_dash(key):
        ev = event or {}
        if key in ev:
            newkey = key.replace("-", "_")
            ev[newkey] = ev[key]
            ev.pop(key)

    fix_dash("child-id")

    def check_empty(data, *keys):
        # print "--> keys=%s, data=%s" % (str(keys), data)
        key = keys[0]
        if type(data) == dict and key in data:
            if len(keys) == 1:
                if data[key] in ["", u""]:
                    # print "---> popped %s" % key
                    data.pop(key)
            else:
                check_empty(data[key], *keys[1:])

    check_empty(data, "context", "user_id")

    data.pop("event_js")  # leftover from mongo import script

    # -----------------------------------------
    # check for null values in speed_change_video
    # Error encountered parsing LAC data from Oct. 2013
    # Requires that we also be able to convert the value to a float

    def string_is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    if data["event_type"] == "speed_change_video":
        if "event_struct" in data and "new_speed" in data["event_struct"]:
            # First make sure the string actually parses as a float
            if string_is_float(data["event_struct"]["new_speed"]):
                # Then check whether the value is NaN (how the bad "null" values appear here)
                if isnan(float(data["event_struct"]["new_speed"])):
                    data["event_struct"].pop("new_speed")

    # check for any funny keys, recursively
    funny_key_sections = []

    def check_for_funny_keys(entry, name="toplevel"):
        # iterate over a snapshot of the items, since keys may be renamed or removed below
        for key, val in entry.items():
            if key.startswith("i4x-") or key.startswith("xblock."):
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s, data=%s\n" % (name, entry, ""))
                funny_key_sections.append(name)
                return True
            if len(key) > 25:
                sys.stderr.write("[rephrase] suspicious key at %s in entry: %s, data=%s\n" % (name, entry, ""))

            if key[0] in "0123456789":
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s, data=%s\n" % (name, entry, ""))
                funny_key_sections.append(name)
                return True

            if "-" in key or "." in key:
                # bad key name!  rename it, changing "-" to "_" and "." to "__"
                newkey = key.replace("-", "_").replace(".", "__")
                sys.stderr.write("[rephrase] oops, bad keyname at %s in entry: %s newkey=%s\n" % (name, entry, newkey))
                entry[newkey] = val
                entry.pop(key)
                key = newkey
            if type(val) == dict:
                ret = check_for_funny_keys(val, name + "/" + key)
                if ret is True:
                    sys.stderr.write("        coercing section %s to become a string\n" % (name + "/" + key))
                    entry[key] = json.dumps(val)
        return False

    check_for_funny_keys(data)

    try:
        check_schema(linecnt, data, coerce=True)
    except Exception as err:
        sys.stderr.write(
            "[%d] oops, err=%s, failed in check_schema %s\n" % (linecnt, str(err), json.dumps(data, indent=4))
        )
        sys.stderr.write(traceback.format_exc())
        return