def process_course(course_id, basedir, datedir, use_dataset_latest, verbose=False):
    sdir = load_course_sql.find_course_sql_dir(course_id, 
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=use_dataset_latest,
                                               )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose
    fn = sdir / 'course.xml.tar.gz'
    if not os.path.exists(fn):
        fn = sdir / 'course-prod-analytics.xml.tar.gz'
        if not os.path.exists(fn):
            fn = sdir / 'course-prod-edge-analytics.xml.tar.gz'
            if not os.path.exists(fn):
                print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
                sys.stdout.flush()
                return

    # TODO: only create new axis if the table is missing, or the course axis is not already created

    try:
        edx2course_axis.process_xml_tar_gz_file(fn,
                                                use_dataset_latest=use_dataset_latest,
                                                force_course_id=course_id)
    except Exception as err:
        print err
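
# Illustration (not part of the source): the nested fallback over candidate tarball
# names above can be written as a loop, which is the pattern the later process_course
# variants in this collection use.  A minimal, self-contained sketch:
import os

def _first_existing_file(sdir, candidates):
    """Return the first candidate filename (joined to sdir) that exists, else None."""
    for name in candidates:
        fn = os.path.join(str(sdir), name)
        if os.path.exists(fn):
            return fn
    return None

# e.g. _first_existing_file('/data/COURSE', ['course.xml.tar.gz',
#                                            'course-prod-analytics.xml.tar.gz',
#                                            'course-prod-edge-analytics.xml.tar.gz'])
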
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage,
    create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(cid,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=use_dataset_latest,
                                               )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
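
# Illustration (assumption about the layout): the schema files referenced above appear
# to be JSON documents keyed by table name; a minimal loader mirroring the
# '%s/schemas/schema_%s.json' pattern used in this function.
import json
import os

def _load_table_schema(table, schema_dir):
    """Read <schema_dir>/schema_<table>.json and return the schema stored under <table>."""
    schema_path = os.path.join(schema_dir, 'schema_%s.json' % table)
    with open(schema_path) as fp:
        return json.load(fp)[table]
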
def get_stats_module_usage(
    course_id,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
):
    '''
    Get data from the stats_module_usage table, if it doesn't already exist as a local file.
    Compute it if necessary.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    sql = """   SELECT 
                    module_type, module_id, count(*) as ncount 
                FROM [{dataset}.studentmodule] 
                group by module_id, module_type
                order by module_id
          """.format(dataset=dataset)

    table = 'stats_module_usage'
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / (table + ".csv")

    data = {}
    if csvfn.exists():
        # read file into data structure
        for k in list(csv.DictReader(open(csvfn))):
            midfrag = tuple(k['module_id'].split('/')[-2:])
            data[midfrag] = k
    else:
        # download if it is already computed, or recompute if needed
        bqdat = bqutil.get_bq_table(dataset, table, sql=sql)
        if bqdat is None:
            bqdat = {'data': []}

        fields = ["module_type", "module_id", "ncount"]
        fp = open(csvfn, 'w')
        cdw = csv.DictWriter(fp, fieldnames=fields)
        cdw.writeheader()
        for k in bqdat['data']:
            midfrag = tuple(k['module_id'].split('/')[-2:])
            data[midfrag] = k
            try:
                k['module_id'] = k['module_id'].encode('utf8')
                cdw.writerow(k)
            except Exception as err:
                print "Error writing row %s, err=%s" % (k, str(err))
        fp.close()

    print "[analyze_content] got %d lines of studentmodule usage data" % len(
        data)
    return data
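
# Illustration (not from the source): usage rows are keyed by the last two path
# components of module_id, so an id such as 'i4x://MITx/6.002x/problem/ps1_q1'
# (format assumed for the example) maps to the key ('problem', 'ps1_q1').
def _module_id_fragment(module_id):
    return tuple(module_id.split('/')[-2:])
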
Example #4
def do_user_part_csv(course_id, basedir=None, datedir=None, 
                  use_dataset_latest=False,
                  verbose=False,
                  pin_date=None):
    sdir = find_course_sql_dir(course_id, 
                               basedir=basedir,
                               datedir=datedir,
                               use_dataset_latest=(use_dataset_latest and not pin_date),
    )
    # upload to google storage
    dfn = sdir / "user_api_usercoursetag.csv.gz"

    if not os.path.exists(dfn):
        print("[load_user_part] Missing %s, skipping" % dfn)
        return

    # reformat True / False to 1/0 for "value" field
    if verbose:
        print("[load_user_part] extracting user partition data from %s" % dfn)
        sys.stdout.flush()

    cdr = csv.DictReader(gzip.GzipFile(dfn))
    fields = cdr.fieldnames
    if verbose:
        print("fieldnames = %s" % fields)
    fixed_data = []
    bmap = {'true': 1, 'false': 0}
    for row in cdr:
        vstr = row['value'].lower()
        row['value'] = bmap.get(vstr, vstr)
        fixed_data.append(row)

    ofnb = 'user_partitions.csv.gz'
    odfn = sdir / ofnb
    with gzip.GzipFile(odfn, 'w') as ofp:
        cdw = csv.DictWriter(ofp, fieldnames=fields)
        cdw.writeheader()
        cdw.writerows(fixed_data)
    if verbose:
        print("[load_user_part] Wrote %d rows of user partition data to %s" % (len(fixed_data), odfn))
        sys.stdout.flush()

    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(odfn, gsdir / ofnb, verbose=False)
    
    mypath = os.path.dirname(os.path.realpath(__file__))
    schema = json.loads(open('%s/schemas/schema_user_partitions.json' % mypath).read())['user_partitions']

    # import into BigQuery
    table = "user_partitions"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
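
# Illustration of the value normalization above: boolean-looking strings become 1/0,
# anything else is passed through lowercased (a sketch, not the module's API).
def _normalize_partition_value(value):
    bmap = {'true': 1, 'false': 0}
    vstr = value.lower()
    return bmap.get(vstr, vstr)

# _normalize_partition_value('True')    -> 1
# _normalize_partition_value('False')   -> 0
# _normalize_partition_value('Group_3') -> 'group_3'
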
def extractResearchData(self, course_id, tablename, the_dataset=None, rdp=None, rdp_format='csv',
                        output_bucket=None, basedir='', datedir='', do_gzip=True):
    '''
    Extract research data tables to Google Storage and archive them onto the server.
    '''
    # Archive location
    if course_id is not None:   # Individual Course Research Data Products
        self.gsp = gsutil.gs_path_from_course_id(course_id=course_id, gsbucket=output_bucket, use_dataset_latest=True)
        gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp])
    else:
        print "ERROR! A course_id must be specified.  Aborting."
        return

    try:
        # Copy to Google Storage
        msg = "[researchData]: Copying Research Data table %s to %s" % (tablename, gsfilename)
        print msg
        # gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)  # temp
        gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename)  # temp
        ret = bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
        msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(gsfilename)
        print msg
        sys.stdout.flush()

    except Exception as err:
        print str(err)
        if 'BQ Error creating table' in str(err):
            msg = "[researchData]: Retrying... by sharding."
            print msg
            sys.stdout.flush()
            gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)
            print gsfilename
            sys.stdout.flush()
            ret = bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
            msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(gsfilename)
            print msg
            sys.stdout.flush()

    # Copy from Google Storage to the Secure Data Warehouse for archiving
    archiveLocation = find_course_sql_dir(course_id=course_id, basedir=basedir, datedir=datedir, use_dataset_latest=True)
    # time.sleep(CFG.TIME_TO_WAIT_30s)  # delay needed to let GS finish uploading the file (size dependent, and may not be enough time)
    msg = "[researchData]: Archiving Research Data table %s from %s to %s" % (tablename, gsfilename, archiveLocation)
    print msg
    sys.stdout.flush()
    gsutil.upload_file_to_gs(src=gsfilename, dst=archiveLocation, verbose=True)
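
# Sketch (assumption): the extract-then-retry-by-sharding logic above, expressed as a
# standalone helper.  `extract` stands in for bqutil.extract_table_to_gs; only the error
# text checked by the original code triggers the sharded retry (other errors are
# re-raised here, unlike the original which just prints them).
def _extract_with_shard_fallback(extract, the_dataset, tablename, gs_prefix, rdp_format='csv'):
    gsfilename = "%s/%s.csv.gz" % (gs_prefix, tablename)
    try:
        return extract(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
    except Exception as err:
        if 'BQ Error creating table' not in str(err):
            raise
        gsfilename = "%s/%s-*.csv.gz" % (gs_prefix, tablename)   # wildcard => sharded export
        return extract(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True)
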
Example #7
def process_course(
    course_id,
    basedir,
    datedir,
    use_dataset_latest,
    use_local_files,
    verbose=False,
    pin_date=None,
    stop_on_error=True,
):
    if pin_date:
        datedir = pin_date
    sdir = load_course_sql.find_course_sql_dir(
        course_id,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=(use_dataset_latest and not pin_date),
    )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose

    fn_to_try = getattr(edx2bigquery_config, 'COURSE_FILES_PREFIX_NAMES', [])

    fn = None
    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break

    if fn is None or not os.path.exists(fn):
        print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (
            course_id, fn)
        sys.stdout.flush()
        return

    # TODO: only create new axis if the table is missing, or the course axis is not already created

    try:
        edx2course_axis.process_xml_tar_gz_file(
            fn,
            use_local_files=use_local_files,
            use_dataset_latest=use_dataset_latest,
            force_course_id=course_id,
        )
    except Exception as err:
        print err
        traceback.print_exc()
        if stop_on_error:
            raise
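
# Illustration (assumption): COURSE_FILES_PREFIX_NAMES is an optional attribute of the
# edx2bigquery_config module; getattr with a default keeps the lookup safe when a
# deployment's config omits it.
import types

_fake_config = types.ModuleType('fake_edx2bigquery_config')   # stand-in for edx2bigquery_config
_candidate_names = getattr(_fake_config, 'COURSE_FILES_PREFIX_NAMES', ['course.xml.tar.gz'])
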
Example #8
def make_gp_table(course_id, basedir=None, datedir=None, 
                  use_dataset_latest=False,
                  verbose=False,
                  pin_date=None):

    if pin_date:
        datedir = pin_date

    sdir = load_course_sql.find_course_sql_dir(course_id, 
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )

    fn_to_try = ['course.xml.tar.gz',
                'course-prod-analytics.xml.tar.gz',
                'course-prod-edge-analytics.xml.tar.gz',
                'course-prod-edx-replica.xml.tar.gz',
            ]
    
    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        msg = "---> oops, cannot get couese content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        raise Exception(msg)

    gpstr, gpfn = read_grading_policy_from_tar_file(fn)
    fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)
    
    ofnb = 'grading_policy.csv'
    ofn = sdir / ofnb
    ofp = open(ofn, 'w')
    cdw = csv.DictWriter(ofp, fieldnames=fields)
    cdw.writeheader()
    cdw.writerows(gptab)
    ofp.close()

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)
    
    # import into BigQuery
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
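
# Sketch (assumption): the write-CSV / upload-to-Google-Storage / load-into-BigQuery
# sequence shared by make_gp_table and do_user_part_csv, factored into one helper.
# `gs_upload` and `bq_load` stand in for gsutil.upload_file_to_gs and
# bqutil.load_data_to_table; nothing here is the module's own API.
import csv

def _publish_rows(rows, fields, local_fn, gsdir, dataset, table, schema, gs_upload, bq_load):
    with open(local_fn, 'w') as ofp:
        cdw = csv.DictWriter(ofp, fieldnames=fields)
        cdw.writeheader()
        cdw.writerows(rows)
    gs_fn = "%s/%s" % (gsdir, local_fn)
    gs_upload(local_fn, gs_fn, verbose=False)
    bq_load(dataset, table, gs_fn, schema, format='csv', skiprows=1)
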
Example #10
def process_course(course_id,
                   basedir,
                   datedir,
                   use_dataset_latest,
                   verbose=False,
                   pin_date=None):
    if pin_date:
        datedir = pin_date
    sdir = load_course_sql.find_course_sql_dir(
        course_id,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=(use_dataset_latest and not pin_date),
    )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose

    fn_to_try = [
        'course.xml.tar.gz',
        'course-prod-analytics.xml.tar.gz',
        'course-prod-edge-analytics.xml.tar.gz',
        'course-prod-edx-replica.xml.tar.gz',
    ]

    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break

    if not os.path.exists(fn):
        print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (
            course_id, fn)
        sys.stdout.flush()
        return

    # TODO: only create new axis if the table is missing, or the course axis is not already created

    try:
        edx2course_axis.process_xml_tar_gz_file(
            fn,
            use_dataset_latest=use_dataset_latest,
            force_course_id=course_id)
    except Exception as err:
        print err
        traceback.print_exc()
def process_courses(clist, basedir, listings):

    data = { y['course_id']: y for y in [ json.loads(x) for x in open(listings) ] }

    for course_id in clist:
        wrap = data[course_id].get('Course Wrap')
        try:
            (month, day, year) = map(int, wrap.split('/'))
        except Exception as err:
            print "Cannot parse date %s for course %s" % (wrap, course_id)
            continue
    
        wrap_dn = "%04d-%02d-%02d" % (year, month, day)
    
        datedir = ""
        use_dataset_latest = True
        course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest, verbose=False)
    
        sql_dir = course_dir.dirname()
    
        dirs_available = list(glob.glob( sql_dir / '20*' / "course*.tar.gz" ))
        dirs_available.sort()
        # print "course %s, wrap %s, sql_dir=%s, dirs_available=%s" % (course_id, wrap_dn, sql_dir, dirs_available)
    
        chosen_dir = None
        just_chose = False
        for dn in dirs_available:
            ddate = dn.rsplit('/', 2)[1]
            if ddate < wrap_dn:
                chosen_dir = dn
                just_chose = True
            elif just_chose:
                chosen_dir = dn	# shift to the date just after wrap
                just_chose = False
    
        if not chosen_dir and dirs_available:
            chosen_dir = dirs_available[0]
    
        # print "course %s, wrap %s, suggestd pin dir=%s" % (course_id, wrap_dn, chosen_dir)
        
        if chosen_dir:
            print "    '%s': '%s'," % (course_id, chosen_dir.rsplit('/',2)[1])
        else:
            print "--> course %s, wrap %s, suggestd pin dir NOT FOUND" % (course_id, wrap_dn)
def process_course(course_id, basedir, datedir, use_dataset_latest, verbose=False, pin_date=None, stop_on_error=True):
    if pin_date:
        datedir = pin_date
    sdir = load_course_sql.find_course_sql_dir(course_id, 
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose

    fn_to_try = ['course.xml.tar.gz',
                'course-prod-analytics.xml.tar.gz',
                'course-prod-edge-analytics.xml.tar.gz',
                'course-prod-edx-replica.xml.tar.gz',
            ]

    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break

    if not os.path.exists(fn):
        print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        sys.stdout.flush()
        return

    # TODO: only create new axis if the table is missing, or the course axis is not already created

    try:
        edx2course_axis.process_xml_tar_gz_file(fn,
                                                use_dataset_latest=use_dataset_latest,
                                                force_course_id=course_id)
    except Exception as err:
        print err
        traceback.print_exc()
        if stop_on_error:
            raise
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath
    
    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['user_info_combo'])
    
    uic = defaultdict(dict)		# dict with key = user_id, and val = dict to be written out as JSON line
    
    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key]=='NULL':
                continue
            if key=='course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':',1)[1].replace('+','/')
            dest[prefix + key] = src[key]
    
    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode=='r' and not os.path.exists(fn):
            newfn = convert_sql(fn)		# try converting from *.sql file, if that exists
            if not newfn:
                return None			# failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)
    
    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()
    
    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion of tab separated values to comma separated values.
        Returns False otherwise.
        '''
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
    
    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = ['name', 'language', 'location', 'meta', 'courseware', 
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 
                  'allow_certificate', 'country', 'city']
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles
    
    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from profiles.csv" % nenrollments
    
    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded, mongodir)
                
        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = ['name', 'language', 'location', 'meta', 'courseware', 
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 
                  'allow_certificate', 'country', 'city']
        nadd_profiles = 0
        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata, uic[uid], fields, prefix="profile_", skip_empty=True)
                fix_unicode(uic[uid], ['profile_name', 'profile_mailing_address', 'profile_goals', 'profile_location', 'profile_language'])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (nadd_profiles, mongodir)
                
        # if datedir is specified, then do not add entries from mongodb where the enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (uic[uid].get('y1_anomalous')==1):	# remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata, uic[uid], fields, prefix="enrollment_", skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (nadd_enrollment, mongodir)

        print "     from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (nadded - n_removed_after_cutoff,
                                                                                                                         nadded, nadd_profiles, nadd_enrollment,
                                                                                                                         n_removed_after_cutoff,
                                                                                                                         cutoff)
        sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = ['download_url', 'grade', 'course_id', 'key', 'distinction', 'status', 
                      'verify_uuid', 'download_uuid', 'name', 'created_date', 'modified_date', 'error_reason', 'mode',]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid
    
    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()
    
    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id' in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")
    
    # sort by userid
    uidset = uic.keys()
    uidset.sort()
    
    # write out result, checking schema along the way
    
    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'), fieldnames=fieldnames)
    ocsv.writeheader()
    
    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id' not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id', data.get('certificate_course_id', ''))
        if not row_course_id==course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise
    
    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
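
# Illustration of the opaque-key handling inside copy_elements above: course ids in the
# newer 'course-v1:' format are rewritten to the older transparent form, e.g.
# 'course-v1:MITx+6.00.2x_3+1T2015' -> 'MITx/6.00.2x_3/1T2015'.
def _transparent_course_id(cid):
    if cid.startswith('course-v1:'):
        return cid.split(':', 1)[1].replace('+', '/')
    return cid
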
def fix_missing_grades(
        course_id,
        getGrades=None,
        download_only=None,
        course_id_type='opaque',  # Assume newer courses with format 'course-v1:HarvardX+course+term'
        org_list=None,  # org list is passed in order to properly grab data from edX instructor dashboard
        basedir=None,
        datedir=None,
        use_dataset_latest=False):

    lfp = find_course_sql_dir(course_id=course_id,
                              basedir=basedir)  # , org_list=org_list)
    print lfp
    gradereport_dir = lfp + '/from_edxinstructordash'
    print gradereport_dir

    if not gradereport_dir.exists():
        os.mkdir(gradereport_dir)

    # Download grade report for single course
    def do_grade_report_export(ofn_dir, ofn, course_id):

        # Properly convert to opaque, if needed
        # Use Case: course_list contains transparent course ids,
        # but need to pass opaque course id for edX instructor dashboard
        # course_id_type can be set using --course-id-type opaque, for example, on the command line
        # Important to properly pass the original course id, as it exists in the edX platform
        course_id = parseCourseIdField(course=course_id,
                                       org_list=org_list,
                                       course_id_type=course_id_type)
        print 'Processing course %s' % course_id
        sys.stdout.flush()

        try:
            # Make Grade Report Request
            getGrades.make_grade_report_request(course_id)

            # Check current set of grade reports
            grade_reports_dict = getGrades.get_grade_reports(
                course_id, ofn_dir)

            if download_only:
                print 'Download grade report only specified'
                sys.stdout.flush()
                getGrades.get_latest_grade_report(grade_reports_dict, ofn,
                                                  ofn_dir)

            else:
                # Keep checking until the Grade Report is Ready
                grade_report_ready = False
                while not grade_report_ready:
                    current_time = datetime.datetime.now()
                    print "Checking grade report status at %s" % current_time
                    sys.stdout.flush()
                    time.sleep(TIME_TO_WAIT)
                    if grade_report_download_ready(course_id):
                        grade_reports_dict = getGrades.get_grade_reports(
                            course_id, ofn_dir)
                        getGrades.get_latest_grade_report(
                            grade_reports_dict, ofn, ofn_dir)
                        grade_report_ready = True

        except Exception as err:
            print str(err)
            grade_report_downloaded = False

            # Try one last time
            try:
                # If failure, then just try grabbing the latest grade report, just in case
                grade_reports_dict = getGrades.get_grade_reports(
                    course_id, ofn_dir)
                getGrades.get_latest_grade_report(grade_reports_dict, ofn,
                                                  ofn_dir)
                grade_report_downloaded = True
            except Exception as err:
                print 'Failure on second attempt %s' % str(err)
                sys.stdout.flush()
                raise

            if grade_report_downloaded:
                print 'Success on second attempt'
                sys.stdout.flush()
            else:
                print 'Failure on second attempt %s' % str(err)
                sys.stdout.flush()
                raise

    def grade_report_download_ready(course_id):

        getGrades.set_course_id(course_id)
        if getGrades.list_instructor_tasks()['tasks']:
            return False
        else:
            return True

    # Do single Course
    do_grade_report_export(gradereport_dir, "grade_report.csv", course_id)
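
# Sketch (assumption): the poll-until-ready loop used above, generalized.  `is_ready`
# stands in for grade_report_download_ready and `interval` for TIME_TO_WAIT; an optional
# cap on attempts is added here so the sketch cannot spin forever.
import time

def _wait_until_ready(is_ready, interval, max_tries=None):
    tries = 0
    while not is_ready():
        tries += 1
        if max_tries is not None and tries >= max_tries:
            return False
        time.sleep(interval)
    return True
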
def analyze_course_content(course_id, 
                           listings_file=None,
                           basedir="X-Year-2-data-sql", 
                           datedir="2013-09-21", 
                           use_dataset_latest=False,
                           do_upload=False,
                           courses=None,
                           verbose=True,
                           ):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out the staff and un-shown content. 
    Do the exclusion based on count of module appearing in the studentmodule table, based on 
    stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split('/',1)[0]	# extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False,
                                      format='csv', skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses])
        sql = "select * from {course_tables}".format(course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)


        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max([ x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset, latest_person_course)
        
        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'})

        # rates for registrants before and during course
        
        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if nbw:
                cmci['certified_of_nregistered_by_wrap_pct'] = ncert / nbw * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course",
                       'msbc_explored': 'explored_median_stats_by_course',
                       'msbc_certified': 'certified_median_stats_by_course',
                       'msbc_verified': 'verified_median_stats_by_course',
                       }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        def make_key(key):
            key = key.strip()
            key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(','').replace(')','').replace('-', '_').replace(',', '')
            return key

        listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", 
                                       "Course Number", "Short Title", "Andrew's Short Titles", "Title", 
                                       "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id",
                                       "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade",
                                       '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)"
                                       ])
        listings_keys.reverse()
        
        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 
                          'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 
                          'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 
                          'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 
                          'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 
                          'option', 'lti', 'annotationresponse', 
                          'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 
                          'options', 'comment_prompt', 'conditional', 
                          'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 
                          'customtag', 'transcript', 
                          'split_test', 'word_cloud', 
                          'openended', 'openendedparam', 'answer_display', 'code', 
                          'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 
                          'textannotation', 'imageannotation', 'videosequence', 
                          'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion']

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith('listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [ json.loads(x) for x in open(listings_file) ]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read())
        schema_dict = { x['name'] : x for x in the_schema }

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in entry.keys():    # iterate over a copy of the keys, since entries may be popped below
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False)

        return

    
    print "-"*60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    for k in csv.DictReader(open(lfn)):
        if k['course_id']==course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)
    
    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id)
        return

    print "[analyze_content] For %s using %s" % (course_id, xbfn)
    
    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()
    
    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong',
              'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata',
              'grading_policy', 'br', 'center',  'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 
              'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title']

    def walk_tree(elem):
        if  type(elem.tag)==str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    print "    -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'), 
                                                                   midfrag, 
                                                                   mudata.get(midfrag, {}).get('ncount'))
                continue
            walk_tree(k)

    walk_tree(xml)
    print counts

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse', 
                 # 'discussion', 
                 'choiceresponse', 'word_cloud', 
                 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti']
    quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse',
                  'imageresponse']

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count
    
    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k
    
    ccd[course_id] = {'course_id': course_id,
                      'nweeks': nweeks,
                      'nqual_per_week': nqual_per_week,
                      'nquant_per_week': nquant_per_week,
                      'total_assessments_per_week' : total_per_week,
                      }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks']
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update({ ('listings_%s' % key) : value for key, value in data.items() })	# from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    for key in sorted(counts):	# store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] =  str(value) 	# from content counts

    cmfields.update({ ('nexcluded_sub_20_%s' % key) : str(value) for key, value in nexcluded.items() })	# from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file
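    # e.g. a (hypothetical) CMINFO_OVERRIDES csv might contain:
    #   course_id,key,value
    #   MITx/6.00.2x_3/1T2015,course_length_days,90
    # and each such key replaces the automatically computed value for that course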

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id']==course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'], ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    
    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})
        
    fp.close()

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table)

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
Example #16
def analyze_course_content(
    course_id,
    listings_file=None,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
    do_upload=False,
    courses=None,
    verbose=True,
    pin_date=None,
):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out the staff and un-shown content. 
    Do the exclusion based on count of module appearing in the studentmodule table, based on 
    stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split(
                '/', 1)[0]  # extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (
                tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset,
                                      tableid,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False,
                                      format='csv',
                                      skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([
            ('[%s.course_metainfo]' % bqutil.course_id2dataset(x))
            for x in courses
        ])
        sql = "select * from {course_tables}".format(
            course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(
                dataset,
                table,
                sql=sql,
                newer_than=datetime.datetime(2015, 1, 16, 3, 0),
            )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)

        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max(
            [x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset,
                                                          latest_person_course)

        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset,
                                         'nregistered_by_wrap',
                                         sql=sql,
                                         key={'name': 'course_id'})

        # rates for registrants before and during course

        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset,
                                          'stats_cert_rates_by_registration',
                                          sql=sql,
                                          newer_than=datetime.datetime(
                                              2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(
                entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci[
                    'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, certified, and verified

        msbc_tables = {
            'msbc_viewed': "viewed_median_stats_by_course",
            'msbc_explored': 'explored_median_stats_by_course',
            'msbc_certified': 'certified_median_stats_by_course',
            'msbc_verified': 'verified_median_stats_by_course',
        }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        listings_keys = map(make_key, [
            "Institution", "Semester", "New or Rerun",
            "Andrew Recodes New/Rerun", "Course Number", "Short Title",
            "Andrew's Short Titles", "Title", "Instructors",
            "Registration Open", "Course Launch", "Course Wrap", "course_id",
            "Empirical Course Wrap", "Andrew's Order", "certifies",
            "MinPassGrade", '4-way Category by name',
            "4-way (CS, STEM, HSocSciGov, HumHistRel)"
        ])
        listings_keys.reverse()

        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = [
            'discussion', 'problem', 'optionresponse', 'checkboxgroup',
            'optioninput', 'choiceresponse', 'video', 'choicegroup',
            'vertical', 'choice', 'sequential', 'multiplechoiceresponse',
            'numericalresponse', 'chapter', 'solution', 'img',
            'formulaequationinput', 'responseparam', 'selfassessment', 'track',
            'task', 'rubric', 'stringresponse', 'combinedopenended',
            'description', 'textline', 'prompt', 'category', 'option', 'lti',
            'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt',
            'comment', 'annotationinput', 'image', 'options', 'comment_prompt',
            'conditional', 'answer', 'poll_question', 'section', 'wrapper',
            'map', 'area', 'customtag', 'transcript', 'split_test',
            'word_cloud', 'openended', 'openendedparam', 'answer_display',
            'code', 'drag_and_drop_input', 'customresponse', 'draggable',
            'mentoring', 'textannotation', 'imageannotation', 'videosequence',
            'feedbackprompt', 'assessments', 'openassessment', 'assessment',
            'explanation', 'criterion'
        ]

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith(
                    'listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (
                    course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [json.loads(x) for x in open(listings_file)]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_combined_course_summary_stats.json' %
                 mypath).read())
        schema_dict = {x['name']: x for x in the_schema}

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in entry.keys():    # iterate over a copy of the keys, since entries may be popped below
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset,
                                      css_table,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False)

        return

    print "-" * 60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    if listings_file.endswith('.json'):
        data_feed = map(json.loads, open(lfn))
    else:
        data_feed = csv.DictReader(open(lfn))
    for k in data_feed:
        if not 'course_id' in k:
            print "Strange course listings row, no course_id in %s" % k
            raise Exception("Missing course_id")
        if k['course_id'] == course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (
            course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)
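    # e.g. a listings date like "9/21/2013" (month/day/year) parses to datetime.datetime(2013, 9, 21)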

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    if pin_date:
        datedir = pin_date
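    # when pin_date is given, it overrides datedir so a specific dated SQL directory is used,
    # and the use_dataset_latest lookup in find_course_sql_dir below is disabled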
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest and not pin_date)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)

    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (
            xbfn, course_id)

        if use_dataset_latest:
            # try looking in earlier directories for xbundle file
            import glob
            spath = course_dir / ("../*/xbundle_%s.xml" % cfn)
            files = list(glob.glob(spath))
            if files:
                xbfn = path(files[-1])
            if not xbfn.exists():
                print "   --> also cannot find any %s ; aborting!" % spath
            else:
                print "   --> Found and using instead: %s " % xbfn
        if not xbfn.exists():
            raise Exception("[analyze_content] missing xbundle file %s" % xbfn)

    # if there is an xbundle*.fixed file, use that instead of the normal one
    if os.path.exists(str(xbfn) + ".fixed"):
        xbfn = path(str(xbfn) + ".fixed")

    print "[analyze_content] For %s using %s" % (course_id, xbfn)

    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir,
                                    use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()

    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = [
        'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1',
        'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike',
        'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's',
        'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki',
        'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4',
        'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p',
        'P', 'TABLE', 'TD', 'small', 'text', 'title'
    ]

    problem_stats = defaultdict(int)

    def does_problem_have_random_script(problem):
        '''
        return 1 if problem has a script with "random." in it
        else return 0
        '''
        for elem in problem.findall('.//script'):
            if elem.text and ('random.' in elem.text):
                return 1
        return 0

    # walk through xbundle
    def walk_tree(elem, policy=None):
        '''
        Walk XML tree recursively.
        elem = current element
        policy = dict of attributes for children to inherit, with fields like due, graded, showanswer
        '''
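        # For instance, a (hypothetical) <chapter showanswer="past_due"> containing a <problem> would be
        # walked with policy={'showanswer': 'past_due'}, so that nested problem gets counted under
        # problem_stats['n_showanswer_past_due'] below.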
        policy = policy or {}
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        if elem.tag in [
                "sequential", "problem", "problemset", "course", "chapter"
        ]:  # very old courses may use inheritance from course & chapter
            keys = ["due", "graded", "format", "showanswer", "start"]
            for k in keys:  # copy inheritable attributes, if they are specified
                val = elem.get(k)
                if val:
                    policy[k] = val
        if elem.tag == "problem":  # accumulate statistics about problems: how many have show_answer = [past_due, closed] ?  have random. in script?
            problem_stats['n_capa_problems'] += 1
            if policy.get('showanswer'):
                problem_stats["n_showanswer_%s" %
                              policy.get('showanswer')] += 1
            else:
                problem_stats[
                    'n_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)
                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118
                # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed.
            problem_stats[
                'n_random_script'] += does_problem_have_random_script(elem)

            if policy.get('graded') == 'true' or policy.get(
                    'graded') == 'True':
                problem_stats['n_capa_problems_graded'] += 1
                problem_stats[
                    'n_graded_random_script'] += does_problem_have_random_script(
                        elem)
                if policy.get('showanswer'):
                    problem_stats["n_graded_showanswer_%s" %
                                  policy.get('showanswer')] += 1
                else:
                    problem_stats[
                        'n_graded_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)

        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    try:
                        print "    -> excluding %s (%s), ncount=%s" % (
                            k.get('display_name',
                                  '<no_display_name>').encode('utf8'), midfrag,
                            mudata.get(midfrag, {}).get('ncount'))
                    except Exception as err:
                        print "    -> excluding ", k
                continue
            walk_tree(k, policy.copy())

    walk_tree(xml)
    print "--> Count of individual element tags throughout XML: ", counts

    print "--> problem_stats:", json.dumps(problem_stats, indent=4)

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = [
        'openassessment',
        'optionresponse',
        'multiplechoiceresponse',
        # 'discussion',
        'choiceresponse',
        'word_cloud',
        'combinedopenended',
        'choiceresponse',
        'stringresponse',
        'textannotation',
        'openended',
        'lti'
    ]
    quant_axis = [
        'formularesponse', 'numericalresponse', 'customresponse',
        'symbolicresponse', 'coderesponse', 'imageresponse'
    ]

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count

    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (
        nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k

    ccd[course_id] = {
        'course_id': course_id,
        'nweeks': nweeks,
        'nqual_per_week': nqual_per_week,
        'nquant_per_week': nquant_per_week,
        'total_assessments_per_week': total_per_week,
    }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = [
        'course_id', 'nquant_per_week', 'total_assessments_per_week',
        'nqual_per_week', 'nweeks'
    ]
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *
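    # e.g. rows in the resulting CSV look like (hypothetical values):
    #   course_id,key,value
    #   MITx/6.00.2x_3/1T2015,nweeks,12.0
    #   MITx/6.00.2x_3/1T2015,count_video,350
    #   MITx/6.00.2x_3/1T2015,problem_stat_n_capa_problems,140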

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update(
        {make_key('listings_%s' % key): value
         for key, value in data.items()})  # from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    cmfields['filename_xbundle'] = xbfn
    cmfields['filename_listings'] = lfn

    for key in sorted(
            counts
    ):  # store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] = str(value)  # from content counts

    for key in sorted(problem_stats):  # store problem stats
        value = problem_stats[key]
        cmfields['problem_stat_%s' % key] = str(value)

    cmfields.update({('nexcluded_sub_20_%s' % key): str(value)
                     for key, value in nexcluded.items()
                     })  # from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id'] == course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'],
                                                           ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()

    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})

    fp.close()

    # build and output course_listings_and_metainfo

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    mypath = os.path.dirname(os.path.realpath(__file__))
    clm_table = "course_listing_and_metainfo"
    clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table)
    clm_schema = json.loads(open(clm_schema_file).read())

    clm = {}
    for finfo in clm_schema:
        field = finfo['name']
        clm[field] = cmfields.get(field)
    clm_fnb = clm_table + ".json"
    clm_fn = course_dir / clm_fnb
    open(clm_fn, 'w').write(json.dumps(clm))

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / clm_fnb
    print "--> Course listing + metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, clm_table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(clm_fn, gsfnp)
    bqutil.load_data_to_table(dataset,
                              clm_table,
                              gsfnp,
                              clm_schema,
                              wait=True,
                              verbose=False)

    # output course_metainfo

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, table)
    sys.stdout.flush()

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfnp,
                              the_schema,
                              wait=True,
                              verbose=False,
                              format='csv',
                              skiprows=1)
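
A minimal usage sketch for the function above, assuming it is importable from this module and that a listings CSV (hypothetically named course_listings.csv here) plus a list of course ids are available: per-course runs populate each course_metainfo table, and a final run with do_upload=True builds the combined stats_course_content and course_summary_stats tables.

# hypothetical driver code, not part of the original example
courses = ["MITx/6.00.2x_3/1T2015", "MITx/8.01x/2013_Fall"]      # hypothetical course ids
for cid in courses:
    analyze_course_content(cid,
                           listings_file="course_listings.csv",  # hypothetical listings file
                           basedir="X-Year-2-data-sql",
                           datedir="2013-09-21",
                           do_upload=False)

# second pass: aggregate the per-course course_metainfo tables and upload the combined summary tables
analyze_course_content(courses[0],
                       listings_file="course_listings.csv",
                       do_upload=True,
                       courses=courses)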
Example #17
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"
    ofncsv = "forum.csv.gz"  # To match table name in BQ
    ofncsv_lfp = lfp / ofncsv

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn) and os.path.exists(ofncsv_lfp):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp)
            sys.stdout.flush()
            return

    print "Processing %s -> writing to %s and %s (%s)" % (
        fn, ofn, ofncsv, datetime.datetime.now())
    sys.stdout.flush()

    # Setup CSV header
    ocsv = csv.DictWriter(openfile(ofncsv, 'w'),
                          fieldnames=SCHEMA_DICT.keys(),
                          quoting=csv.QUOTE_NONNUMERIC)
    ocsv.writeheader()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    data = OrderedDict()
    for line in fp:
        cnt += 1
        # Write JSON row
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)

        try:
            #Write CSV row
            data = json.loads(newline)
            ocsv.writerow(data)
        except Exception as err:
            print "Error writing CSV output row %s=%s" % (cnt, data)
            raise

    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GCS file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def upload_grades_persistent_data(cid,
                                  basedir,
                                  datedir,
                                  use_dataset_latest=False,
                                  subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage,
    create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(
        gsutil.gs_path_from_course_id(cid,
                                      use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(
        cid,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=(use_dataset_latest),
    )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(
        open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn,
                                           tempfn,
                                           field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(
        dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def process_file(course_id,
                 basedir=None,
                 datedir=None,
                 use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath

    the_dict_schema = schema2dict(
        json.loads(open(SCHEMA_FILE).read())['user_info_combo'])

    uic = defaultdict(
        dict
    )  # dict with key = user_id, and val = dict to be written out as JSON line

    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key] == 'NULL':
                continue
            if key == 'course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':', 1)[1].replace('+', '/')
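                # e.g. "course-v1:MITx+6.00.2x_3+1T2015" -> "MITx/6.00.2x_3/1T2015"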
            dest[prefix + key] = src[key]

    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode == 'r' and not os.path.exists(fn):
            newfn = convert_sql(
                fn)  # try converting from *.sql file, if that exists
            if not newfn:
                return None  # failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)

    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()

    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion of tab separated values to comma separated values.
        Returns False otherwise.
        '''
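        # e.g. if users.csv(.gz) is missing but users.sql (tab-separated) exists,
        # convert it to users.csv.gz and return that filename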
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot +
                                                             ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
        uic[uid]['edxinstructordash_Grade'] = None
        uic[uid]['edxinstructordash_Grade_timestamp'] = None

    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = [
            'name', 'language', 'location', 'meta', 'courseware', 'gender',
            'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
            'allow_certificate', 'country', 'city'
        ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles

    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = [
            'course_id',
            'created',
            'is_active',
            'mode',
        ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from profiles.csv" % nenrollments

    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded,
                                                                      mongodir)

        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = [
            'name', 'language', 'location', 'meta', 'courseware', 'gender',
            'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
            'allow_certificate', 'country', 'city'
        ]
        nadd_profiles = 0

        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata,
                              uic[uid],
                              fields,
                              prefix="profile_",
                              skip_empty=True)
                fix_unicode(uic[uid], [
                    'profile_name', 'profile_mailing_address', 'profile_goals',
                    'profile_location', 'profile_language'
                ])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (
            nadd_profiles, mongodir)

        # if datedir is specified, then do not add entries from mongodb where the enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = [
            'course_id',
            'created',
            'is_active',
            'mode',
        ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (
                        uic[uid].get('y1_anomalous')
                        == 1):  # remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata,
                                  uic[uid],
                                  fields,
                                  prefix="enrollment_",
                                  skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (
            nadd_enrollment, mongodir)

        print "     from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (
            nadded - n_removed_after_cutoff, nadded, nadd_profiles,
            nadd_enrollment, n_removed_after_cutoff, cutoff)
        sys.stdout.flush()

    # See if instructor grade reports are present for this course; if so, merge in that data
    edxinstructordash = cdir.dirname() / 'from_edxinstructordash'
    if edxinstructordash.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from_edxinstructordash" % edxinstructordash
        sys.stdout.flush()

        grade_report_fn = (edxinstructordash / 'grade_report.csv')
        fp = openfile(grade_report_fn, add_dir=False)
        if fp is None:
            print "--> Skipping grade_report.csv, file does not exist in dir from_edxinstructordash"
        else:
            nadded = 0
            for line in csv.DictReader(fp):
                uid = int(line['Student ID'])
                fields = ['Grade', 'Grade_timestamp']
                #['course_id','Student ID','Email','Username','Grade' ]
                #'Enrollment Track',' Verification Status','Certificate Eligible','Certificate Delivered','Certificate Type' ]
                copy_elements(line, uic[uid], fields, prefix="edxinstructordash_")
                nadded += 1
            fp.close()
            print "  %d grades loaded from %s/grade_report.csv" % (
                nadded, edxinstructordash)
            sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = [
                'download_url',
                'grade',
                'course_id',
                'key',
                'distinction',
                'status',
                'verify_uuid',
                'download_uuid',
                'name',
                'created_date',
                'modified_date',
                'error_reason',
                'mode',
            ]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid

    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()

    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id'
                in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (
        nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")

    # sort by userid
    uidset = uic.keys()
    uidset.sort()

    # write out result, checking schema along the way

    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'),
                          fieldnames=fieldnames)
    ocsv.writeheader()

    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id'
                                                     not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id',
                                 data.get('certificate_course_id', ''))
        if not row_course_id == course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (
                row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise

    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
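# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the merge pattern
# used by make_user_info_combo above.  Each SQL/CSV source is folded into one
# per-user dict keyed on user_id, with a per-source prefix on the field names.
# The _demo_copy_elements helper is a simplified stand-in for the real
# copy_elements, shown only to make the pattern concrete.
from collections import defaultdict

def _demo_copy_elements(src, dest, fields, prefix=""):
    # copy selected fields from src into dest, namespacing them with prefix
    for key in fields:
        if key in src and src[key] not in (None, '', 'NULL'):
            dest[prefix + key] = src[key]

def _demo_user_info_merge():
    uic_demo = defaultdict(dict)    # user_id -> combined record
    profile_row = {'user_id': '42', 'name': 'Ada', 'gender': 'f'}
    enrollment_row = {'user_id': '42', 'mode': 'verified', 'is_active': '1'}
    _demo_copy_elements(profile_row, uic_demo[42], ['name', 'gender'], prefix="profile_")
    _demo_copy_elements(enrollment_row, uic_demo[42], ['mode', 'is_active'], prefix="enrollment_")
    # uic_demo[42] is now:
    # {'profile_name': 'Ada', 'profile_gender': 'f',
    #  'enrollment_mode': 'verified', 'enrollment_is_active': '1'}
    return uic_demo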
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):

    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s role data in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_roles.json' % mypath
    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['staff']) 
    
    role_data = defaultdict(dict)		# dict with key = user_id, and val = dict to be written out as JSON line
    
    def convertLongToWide( longData, unique_id, column_header, values, columns ):
        import numpy as np
        print "    Currently %d roles assigned... Dropping duplicates" % len(longData)
        sys.stdout.flush()
    
        ld2 = longData.drop_duplicates(subset=[unique_id, column_header])#, keep='last')
    
        print "    %d unique roles assigned" % len(ld2)
        print "    Processing role data with these known roles: %s" % columns
        sys.stdout.flush()
        wideData = ld2.pivot( index=unique_id, columns=column_header, values=values )
        wideData = pd.DataFrame( wideData , index=wideData.index ).reset_index()
        
        # Ensuring all specified columns exist
        cols = wideData.columns.tolist()
        for c in columns:
            if c not in cols:
                wideData[ c ] = pd.Series([np.nan])
        return wideData
    
    def createUniqueId( longData ):
        import numpy as np
        if pd.notnull(longData['user_id']) and pd.notnull(longData['course_id']):
            uid = str(longData['user_id']) + str('__') + str(longData['course_id'])
        else:
            uid = 'NULL'
        return uid
    
    def splitUniqueId( wideData ):
        import numpy as np
        wideData['user_id'] = pd.Series([np.nan])
        wideData['course_id'] = pd.Series([np.nan])
        wideData[['user_id', 'course_id']] = wideData['uid'].apply( lambda x: pd.Series(x.split('__') ) )
        wideData['course_id'] = wideData['course_id'].apply( forceTransparentCourseId )
        return wideData
    
    def forceTransparentCourseId( course_id ):
        
        #if key=='course_id' and src[key].startswith('course-v1:'):
        if 'course-v1' in course_id:
            # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
            #src[key] = src[key].split(':',1)[1].replace('+','/')
            return course_id.split(':',1)[1].replace('+','/')
        else:
            return course_id
    
    def openfile(fn_in, mode='r', add_dir=True, pandas_df=False):
        if pandas_df:
            return pd.read_csv( gzip.GzipFile( cdir / fn_in ), sep=',' )
        else:
            if add_dir:
                fn = cdir / fn_in
            else:
                fn = fn_in
            if fn.endswith('.gz'):
                return gzip.GzipFile(fn, mode)
            return open(fn, mode)

    def createRoleVar( wideData, all_roles ):

        if wideData[ all_roles ].notnull().values.any():
            return str("Staff")
        else:
            return str("Student")
           
    def cleanRoles( longData, unique_id, column_header, values, columns, allfields ):
        
        longData[ values ] = int(1)
        longData['uid'] = longData.apply(createUniqueId, axis=1 )
        wideData = convertLongToWide( longData, unique_id=unique_id, column_header=column_header, values=values, columns=columns )
        wideData = splitUniqueId( wideData )[ allfields ]    
        return wideData
    
    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key]=='NULL':
                continue
            if key=='course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':',1)[1].replace('+','/')

            # Ensure integer for floats, or null
            if pd.isnull(src[key]):
                copyKey = None
            elif type(src[key]) == float:
                copyKey = int(float(src[key]))
            else:
                copyKey = src[key]

            dest[prefix + key] = copyKey

    #roledata = pd.read_csv( cdir / ROLE_COURSE_ACCESS, sep=',')
    #rolediscdata = pd.read_csv( cdir / ROLE_FORUM_ACCESS, sep=',')
    roledata = openfile( fn_in=ROLE_COURSE_ACCESS, mode='r', pandas_df=True)
    rolediscdata = openfile( fn_in=ROLE_FORUM_ACCESS, mode='r', pandas_df=True)
    
    # Process Course Access Roles
    known_roles_course = OrderedDict([
                          ('beta_testers', 'roles_isBetaTester'), 
                          ('instructor', 'roles_isInstructor'), 
                          ('staff', 'roles_isStaff'),
                          ('ccx_coach', 'roles_isCCX'),
                          ('finance_admin', 'roles_isFinance'),
                          ('library_user', 'roles_isLibrary'),
                          ('sales_admin', 'roles_isSales')
                          ])
    base_fields = ['user_id', 'course_id']
    fields = base_fields + known_roles_course.keys()
    print "  Cleaning %s" % ROLE_COURSE_ACCESS
    wide_roledata = cleanRoles( roledata, 'uid', 'role', 'value', known_roles_course.keys(), fields )
    
    # Process Forum Discussion Roles
    known_roles_disc = OrderedDict([
                          ('Administrator','forumRoles_isAdmin'),
                          ('CommunityTA','forumRoles_isCommunityTA'),
                          ('Moderator', 'forumRoles_isModerator'), 
                          ('Student', 'forumRoles_isStudent')
                           ])
    base_fields = ['user_id', 'course_id']
    extra_fields = ['roles'] # To be used to create custom mapping based on existing course and discussion forum roles
    fields = base_fields + known_roles_disc.keys()
    print "  Cleaning %s" % ROLE_FORUM_ACCESS
    wide_rolediscdata = cleanRoles( rolediscdata, 'uid', 'name', 'value', known_roles_disc.keys(), fields )
    
    # Compile
    fields_all = base_fields + known_roles_course.keys() + known_roles_disc.keys() + extra_fields
    rename_dict = dict(known_roles_course, **known_roles_disc)
    wideData = pd.merge( wide_roledata, wide_rolediscdata, how='outer', on=["user_id", "course_id"], suffixes=['', '_disc'] )

    # Create Roles var
    all_roles = known_roles_disc.keys() + known_roles_course.keys()
    all_roles.remove('Student')
    wideData['roles'] = wideData.apply( createRoleVar, args=[all_roles], axis=1 )

    # Rename columns
    wideData = wideData[ fields_all ]
    wideData.rename( columns=rename_dict, inplace=True )

    finalFields = wideData.columns.tolist()

    # Write out
    fieldnames = the_dict_schema.keys()
    ofp = openfile('roles.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('roles.csv', 'w'), fieldnames=finalFields)
    ocsv.writeheader()
    
    wideData_dict = wideData.to_dict(orient='records')
    
    for line in wideData_dict:
        uid = int(line['user_id'])
        copy_elements(line, role_data[uid], finalFields, skip_empty=False )
        role_data[uid]['user_id'] = uid
        data = role_data[uid]   
    
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise
    
    print "Done with make_roles for %s" % course_id
    sys.stdout.flush()
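# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the long-to-wide
# pivot performed by convertLongToWide above.  The rows, role names, and the
# _demo_* function name are made up for the demo; only the pivot mechanics
# (drop duplicates, pivot, backfill missing role columns) mirror the code.
import numpy as np
import pandas as pd

def _demo_roles_pivot():
    long_roles = pd.DataFrame([
        {'uid': '1__org/course/term', 'role': 'staff', 'value': 1},
        {'uid': '1__org/course/term', 'role': 'instructor', 'value': 1},
        {'uid': '2__org/course/term', 'role': 'beta_testers', 'value': 1},
    ])
    wide = long_roles.drop_duplicates(subset=['uid', 'role']) \
                     .pivot(index='uid', columns='role', values='value') \
                     .reset_index()
    # ensure every known role column exists, even if unused in this course
    for c in ['beta_testers', 'instructor', 'staff', 'ccx_coach']:
        if c not in wide.columns.tolist():
            wide[c] = np.nan
    return wide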
def rephrase_forum_json_for_course(course_id, gsbucket="gs://x-data", 
                                   basedir="X-Year-2-data-sql", 
                                   datedir=None, 
                                   do_gs_copy=False,
                                   use_dataset_latest=False,
                                   ):
    
    print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest)
    
    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt==0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched
    '''

    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir)
    
    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % ( mypath, SCHEMA_VIDEO_AXIS )
    the_schema = json.loads(open(SCHEMA_FILE).read())[ SCHEMA_VIDEO_AXIS_NAME ]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS )
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS )
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % ( str(err), TABLE_VIDEO_AXIS )
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS )
        ca_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute )

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS )

    # Lastly, create video stats
    createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
    createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

    # also create person_course_video_watched
    createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
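# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the staleness check
# used above to decide whether video_axis must be rebuilt.  The table-info
# dicts are stand-ins for bqutil.get_bq_table_info() results; only the date
# comparison mirrors the code.
def _demo_needs_video_axis_recompute(va_info, ca_info, force_recompute=False):
    # va_info / ca_info: dicts like {'lastModifiedTime': <datetime>}, or None
    if va_info is None:
        return True                 # video_axis never built -> build it
    if force_recompute:
        return True                 # caller explicitly asked for a rebuild
    va_date = va_info.get('lastModifiedTime')
    ca_date = ca_info.get('lastModifiedTime') if ca_info else None
    if ca_date and va_date and ca_date > va_date:
        return True                 # course_axis is newer -> video_axis is stale
    return False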
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute,
                     use_dataset_latest):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched
    '''

    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)

    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    the_schema = json.loads(open(SCHEMA_FILE).read())[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % (
            dataset, TABLE_VIDEO_AXIS)
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % (str(err),
                                                         TABLE_VIDEO_AXIS)
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (
            ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (
            va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(
            course_id,
            use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS)

    # Lastly, create video stats
    createVideoStats_day(course_id,
                         force_recompute=force_recompute,
                         use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id,
                     force_recompute=force_recompute,
                     use_dataset_latest=use_dataset_latest)
def mongo_dump_user_info_files(course_id,
                               basedir=None,
                               datedir=None,
                               dbname=None,
                               use_dataset_latest=False):
    '''
    In early SQL tables from edX, enrollment records were deleted when a learner un-enrolled.
    This would then cause that learner's records to no longer appear in the SQL data
    dump to researchers.  This further caused problems because there would be learners
    who received certificates (and appeared in the certificates tables) but weren't
    in the auth_user or auth_userprofile or enrollment tables.

    One way around this was to incrementally load all the user, userprofile, etc. tables
    from every weekly dump, so that if a learner ever appeared as being registered
    in a course, the learner would stay that way.

    This workaround didn't completely solve the problem, however, for early courses,
    and thus additional data had to be manually requested from edX.

    These issues were all resolved for courses after ~Spring 2014, when un-enrollment
    was changed such that it did not cause deletion of the enrollment record, but
    rather, just a change of the "active" flag within the enrollment record.

    Here, to work around the problem, for a given course_id, we generate users.csv, profiles.csv, 
    enrollment.csv, certificates.csv, from collections 
    stored in mongodb, for a specified course.  These mongodb collections were curated
    and produced for the Hx and MITx Year 1 reports.
    
    Uses mongoexport.
    '''

    basedir = path(basedir or '')
    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)
    mongodir = lfp.dirname() / 'from_mongodb'

    print "[mongo_dump_user_info_files] processing %s, output directory = %s" % (
        course_id, mongodir)
    if not mongodir.exists():
        os.mkdir(mongodir)

    def do_mongo_export(collection, ofn, ckey='course_id'):
        query = '{"%s": "%s"}' % (ckey, course_id)
        cmd = "mongoexport -d %s -c %s -q '%s' | gzip -9 > %s" % (
            dbname, collection, query, ofn)
        print "--> %s" % cmd
        sys.stdout.flush()
        os.system(cmd)

    # make users with javascript join

    js = """conn = new Mongo(); 
          db = conn.getDB('%s');
          var cursor = db.student_courseenrollment.find({'course_id': '%s'});
          while (cursor.hasNext()) { 
              var doc = cursor.next();
              udoc = db.auth_user.find({_id: doc.user_id})[0];
              print(JSON.stringify(udoc));
          }
          var course = '%s';
          var cursor = db.certificates_generatedcertificate.find({'course_id': course});
          while (cursor.hasNext()) {
              var doc = cursor.next();
              usc = db.student_courseenrollment.find({'course_id': course, 'user_id': doc.user_id });
              if (usc.length()==0){
                  udoc = db.auth_user.find({_id: doc.user_id})[0];
                  db.auth_userprofile.update({'user_id' : doc.user_id}, {\$addToSet : {courses: course }});
                  print(JSON.stringify(udoc));
              }
          }
         """ % (dbname, course_id, course_id)

    ofn = mongodir / "users.json.gz"
    cmd = 'echo "%s" | mongo --quiet | tail -n +3 | gzip -9 > %s' % (
        js.replace('\n', ''), ofn)
    print "--> %s" % cmd
    sys.stdout.flush()
    os.system(cmd)

    # profiles and enrollment and certificates

    do_mongo_export("auth_userprofile", mongodir / "profiles.json.gz",
                    'courses')
    do_mongo_export("student_courseenrollment",
                    mongodir / "enrollment.json.gz")
    do_mongo_export("certificates_generatedcertificate",
                    mongodir / "certificates.json.gz")
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def fix_missing_grades( course_id, 
                        getGrades=None,
                        download_only=None,
                        course_id_type='opaque', # Assume newer courses with format 'course-v1:HarvardX+course+term'
                        org_list=None, # org list is passed in order to properly grab data from edX instructor dashboard
                        basedir=None, 
                        datedir=None, 
                        use_dataset_latest=False):

    lfp = find_course_sql_dir(course_id=course_id, basedir=basedir)# , org_list=org_list)
    print lfp
    gradereport_dir = lfp + '/from_edxinstructordash'
    print gradereport_dir

    if not gradereport_dir.exists():
        os.mkdir(gradereport_dir)

    # Download grade report for single course
    def do_grade_report_export( ofn_dir, ofn, course_id):

        # Properly convert to opaque, if needed
        # Use Case: course_list contains transparent course ids, 
        # but need to pass opaque course id for edX instructor dashboard
        # course_id_type can be set using --course-id-type opaque, for example, on the command line
        # Important to properly pass the original course id, as it exists in the edX platform
        course_id = parseCourseIdField( course=course_id, org_list=org_list, course_id_type=course_id_type )
        print 'Processing course %s' % course_id
        sys.stdout.flush()

        try:
            # Make Grade Report Request
            getGrades.make_grade_report_request( course_id )

            # Check current set of grade reports
            grade_reports_dict = getGrades.get_grade_reports( course_id, ofn_dir )

            if download_only:
                print 'Download grade report only specified'
                sys.stdout.flush()
                getGrades.get_latest_grade_report( grade_reports_dict, ofn, ofn_dir )

            else:
                # Keep checking until the Grade Report is Ready
                grade_report_ready = False
                while not grade_report_ready:
                    current_time = datetime.datetime.now()
                    print "Checking grade report status at %s" % current_time
                    sys.stdout.flush()
                    time.sleep( TIME_TO_WAIT )
                    if grade_report_download_ready( course_id ):
                        grade_reports_dict = getGrades.get_grade_reports( course_id, ofn_dir )
                        getGrades.get_latest_grade_report( grade_reports_dict, ofn, ofn_dir )
                        grade_report_ready = True

        except Exception as err:
            print str(err)
            grade_report_downloaded = False

            # Try one last time
            try:
                # If failure, then just try grabbing the latest grade report, just in case
                grade_reports_dict = getGrades.get_grade_reports( course_id, ofn_dir )
                getGrades.get_latest_grade_report( grade_reports_dict, ofn, ofn_dir )
                grade_report_downloaded = True
            except Exception as err:
                print 'Failure on second attempt %s' % str(err)
                sys.stdout.flush()
                raise

            if grade_report_downloaded:
                print 'Success on second attempt'
                sys.stdout.flush()
            else:
                print 'Failure on second attempt %s' % str(err)
                sys.stdout.flush()
                raise


    def grade_report_download_ready( course_id ):
        
        getGrades.set_course_id( course_id)
        if getGrades.list_instructor_tasks()['tasks']:
            return False
        else:
            return True

    # Do single Course
    do_grade_report_export( gradereport_dir,  "grade_report.csv", course_id)
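# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the polling pattern
# used by do_grade_report_export above.  The explicit timeout is an added
# suggestion (the original loops until the instructor task queue is empty);
# is_ready stands in for grade_report_download_ready.
import time

def _demo_wait_for_grade_report(is_ready, poll_seconds=60, max_wait_seconds=3600):
    waited = 0
    while waited < max_wait_seconds:
        if is_ready():
            return True             # report is ready to download
        time.sleep(poll_seconds)
        waited += poll_seconds
    return False                    # timed out; caller decides what to do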
    def extractResearchData(self,
                            course_id,
                            tablename,
                            the_dataset=None,
                            rdp=None,
                            rdp_format='csv',
                            output_bucket=None,
                            basedir='',
                            datedir='',
                            do_gzip=True):
        '''
        Get research data output into tables and archive onto server
        '''

        # Archive location
        if course_id is not None:  # Individual Course Research Data Products

            self.gsp = gsutil.gs_path_from_course_id(course_id=course_id,
                                                     gsbucket=output_bucket,
                                                     use_dataset_latest=True)
            gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp])

        else:
            print "ERROR! Must specify course_id's.  Aborting."
            return

        try:
            # Copy to Google Storage
            msg = "[researchData]: Copying Research Data table %s to %s" % (
                tablename, gsfilename)
            print msg
            #gsfilename  = "%s/%s-*.csv.gz" % ( self.gsp, tablename ) # temp
            gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename)  # temp
            ret = bqutil.extract_table_to_gs(the_dataset,
                                             tablename,
                                             gsfilename,
                                             format=rdp_format,
                                             do_gzip=True,
                                             wait=True)
            msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(
                gsfilename)
            print msg
            sys.stdout.flush()

        except Exception as err:

            print str(err)
            if ('BQ Error creating table' in str(err)):
                msg = "[researchData]: Retrying... by sharding."
                print msg
                sys.stdout.flush()
                gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)
                print gsfilename
                sys.stdout.flush()
                ret = bqutil.extract_table_to_gs(the_dataset,
                                                 tablename,
                                                 gsfilename,
                                                 format=rdp_format,
                                                 do_gzip=True,
                                                 wait=True)
                msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(
                    gsfilename)
                print msg
                sys.stdout.flush()

        # Copy from Google Storage to Secure Data Warehouse for archiving
        archiveLocation = find_course_sql_dir(course_id=course_id,
                                              basedir=basedir,
                                              datedir=datedir,
                                              use_dataset_latest=True)
        #time.sleep( CFG.TIME_TO_WAIT_30s ) # delay needed to allow for GS to upload file fully (This should be size dependent, and may not be enough time)
        msg = "[researchData]: Archiving Research Data table %s from %s to %s" % (
            tablename, gsfilename, archiveLocation)
        print msg
        sys.stdout.flush()
        gsutil.upload_file_to_gs(src=gsfilename,
                                 dst=archiveLocation,
                                 verbose=True)

        pass
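# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original class): the retry-by-sharding
# pattern used in extractResearchData above.  extract_fn stands in for
# bqutil.extract_table_to_gs; all names here are placeholders for the demo.
def _demo_extract_with_shard_fallback(extract_fn, dataset, table, gs_prefix):
    single = "%s/%s.csv.gz" % (gs_prefix, table)
    try:
        return extract_fn(dataset, table, single)
    except Exception as err:
        if 'BQ Error creating table' not in str(err):
            raise
        # the table is too large for a single export file, so use a wildcard
        # destination and let BigQuery shard the output into multiple objects
        sharded = "%s/%s-*.csv.gz" % (gs_prefix, table)
        return extract_fn(dataset, table, sharded)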
def mongo_dump_user_info_files(course_id, basedir=None, datedir=None, dbname=None, use_dataset_latest=False):
    '''
    In early SQL tables from edX, enrollment records were deleted when a learner un-enrolled.
    This would then cause that learner's records to no longer appear in the SQL data
    dump to researchers.  This further caused problems because there would be learners
    who received certificates (and appeared in the certificates tables) but weren't
    in the auth_user or auth_userprofile or enrollment tables.

    One way around this was to incrementally load all the user, userprofile, etc. tables
    from every weekly dump, so that if a learner ever appeared as being registered
    in a course, the learner would stay that way.

    This workaround didn't completely solve the problem, however, for early courses,
    and thus additional data had to be manually requested from edX.

    These issues were all resolved for courses after ~Spring 2014, when un-enrollment
    was changed such that it did not cause deletion of the enrollment record, but
    rather, just a change of the "active" flag within the enrollment record.

    Here, to work around the problem, for a given course_id, we generate users.csv, profiles.csv, 
    enrollment.csv, certificates.csv, from collections 
    stored in mongodb, for a specified course.  These mongodb collections were curated
    and produced for the Hx and MITx Year 1 reports.
    
    Uses mongoexport.
    '''

    basedir = path(basedir or '')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)
    mongodir = lfp.dirname() / 'from_mongodb'

    print "[mongo_dump_user_info_files] processing %s, output directory = %s" % (course_id, mongodir)
    if not mongodir.exists():
        os.mkdir(mongodir)

    def do_mongo_export(collection, ofn, ckey='course_id'):
        query = '{"%s": "%s"}' % (ckey, course_id)
        cmd = "mongoexport -d %s -c %s -q '%s' | gzip -9 > %s" % (dbname, collection, query, ofn)
        print "--> %s" % cmd
        sys.stdout.flush()
        os.system(cmd)

    # make users with javascript join

    js = """conn = new Mongo(); 
          db = conn.getDB('%s');
          var cursor = db.student_courseenrollment.find({'course_id': '%s'});
          while (cursor.hasNext()) { 
              var doc = cursor.next();
              udoc = db.auth_user.find({_id: doc.user_id})[0];
              print(JSON.stringify(udoc));
          }
          var course = '%s';
          var cursor = db.certificates_generatedcertificate.find({'course_id': course});
          while (cursor.hasNext()) {
              var doc = cursor.next();
              usc = db.student_courseenrollment.find({'course_id': course, 'user_id': doc.user_id });
              if (usc.length()==0){
                  udoc = db.auth_user.find({_id: doc.user_id})[0];
                  db.auth_userprofile.update({'user_id' : doc.user_id}, {\$addToSet : {courses: course }});
                  print(JSON.stringify(udoc));
              }
          }
         """ % (dbname, course_id, course_id)

    ofn = mongodir / "users.json.gz"
    cmd = 'echo "%s" | mongo --quiet | tail -n +3 | gzip -9 > %s' % (js.replace('\n',''), ofn)
    print "--> %s" % cmd
    sys.stdout.flush()
    os.system(cmd)

    # profiles and enrollment and certificates

    do_mongo_export("auth_userprofile", mongodir / "profiles.json.gz", 'courses')
    do_mongo_export("student_courseenrollment", mongodir / "enrollment.json.gz")
    do_mongo_export("certificates_generatedcertificate", mongodir / "certificates.json.gz")