def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage,
    create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(cid,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest),
                                               )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
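
# Example invocation (illustrative only; the course id and directory names shown are
# hypothetical examples, not taken from this module):
#
#   upload_grades_persistent_data("MITx/6.002x/2013_Spring", "X-Year-2-data-sql", "2013-09-21",
#                                 use_dataset_latest=True, subsection=False)
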
def upload_grades_persistent_data(cid,
                                  basedir,
                                  datedir,
                                  use_dataset_latest=False,
                                  subsection=False):
    """Upload grades_persistent csv.gz to Google Storage, create the BigQuery table, then insert the data into the table

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(
        gsutil.gs_path_from_course_id(cid,
                                      use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    csvfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, csv_name)
    tempfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, temp_name)

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(
        open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not subsection:
        remove_nulls_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(
        dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
    def write_geoip_table(self):
        '''
        Write out the geoipdat table if nchanged > 0
        '''
        if not self.nchanged:
            return

        ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000)
        print "--> new entries added to geoipdat, writing to %s" % (ofn)
        sys.stdout.flush()

        ofp = codecs.open(ofn, 'w', encoding='utf8')
        for key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val) + '\n')
            except Exception as err:
                print "Error!  %s" % err
                sys.stdout.write(repr(val))
                raise
        ofp.close()

        lock_file(self.gipfn)
        try:
            print "--> renaming %s to %s" % (ofn, self.gipfn)
            sys.stdout.flush()
            os.rename(ofn, self.gipfn)
        except Exception as err:
            print "Error %s in renaming gipfn" % str(err)
        lock_file(self.gipfn, release=True)

        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_extra_geoip.json' %
                 mypath).read())['extra_geoip']

        gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
        print "--> Uploading %s to %s" % (self.gipfn, gsp)
        sys.stdout.flush()
        gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

        print "--> Importing %s to %s" % (gsp, self.giptable)
        sys.stdout.flush()
        try:
            bqutil.create_dataset_if_nonexistent(self.gipdataset)
        except Exception as err:
            print "--> Warning: failed to create dataset %s, err=%s" % (self.gipdataset, err)
        try:
            bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp,
                                      the_schema)
        except Exception as err:
            print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (
                gsp, self.gipdataset, self.giptable, err)
            print "---> Continuing anyway"
            sys.stdout.flush()
def do_course_listings(course_listings_fn):
    dataset = 'courses'
    table = 'listings'
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    gsfn = gsutil.gs_path_from_course_id('courses') / 'listings.csv'
    gsutil.upload_file_to_gs(course_listings_fn, gsfn)

    schema = json.loads(open('%s/schemas/schema_course_listings.json' % mypath).read())['course_listings']
    bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)
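
# Example invocation (illustrative only; the listings filename is a hypothetical example):
#
#   do_course_listings("course_listings.csv")
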
def do_user_part_csv(course_id, basedir=None, datedir=None, 
                  use_dataset_latest=False,
                  verbose=False,
                  pin_date=None):
    sdir = find_course_sql_dir(course_id, 
                               basedir=basedir,
                               datedir=datedir,
                               use_dataset_latest=(use_dataset_latest and not pin_date),
    )
    # upload to google storage
    dfn = sdir / "user_api_usercoursetag.csv.gz"

    if not os.path.exists(dfn):
        print("[load_user_part] Missing %s, skipping" % dfn)
        return

    # reformat True / False to 1/0 for "value" field
    if verbose:
        print("[load_user_part] extracting user partition data from %s" % dfn)
        sys.stdout.flush()

    cdr = csv.DictReader(gzip.GzipFile(dfn))
    fields = cdr.fieldnames
    if verbose:
        print("fieldnames = %s" % fields)
    fixed_data = []
    bmap = {'true': 1, 'false': 0}
    for row in cdr:
        vstr = row['value'].lower()
        row['value'] = bmap.get(vstr, vstr)
        fixed_data.append(row)

    ofnb = 'user_partitions.csv.gz'
    odfn = sdir / ofnb
    with gzip.GzipFile(odfn, 'w') as ofp:
        cdw = csv.DictWriter(ofp, fieldnames=fields)
        cdw.writeheader()
        cdw.writerows(fixed_data)
    if verbose:
        print("[load_user_part] Wrote %d rows of user partition data to %s" % (len(fixed_data), odfn))
        sys.stdout.flush()

    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(odfn, gsdir / ofnb, verbose=False)
    
    mypath = os.path.dirname(os.path.realpath(__file__))
    schema = json.loads(open('%s/schemas/schema_user_partitions.json' % mypath).read())['user_partitions']

    # import into BigQuery
    table = "user_partitions"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
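
# Example invocation (illustrative only; the arguments shown are hypothetical examples):
#
#   do_user_part_csv("MITx/6.002x/2013_Spring", basedir="X-Year-2-data-sql",
#                    datedir="2013-09-21", use_dataset_latest=True, verbose=True)
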
def make_gp_table(course_id, basedir=None, datedir=None, 
                  use_dataset_latest=False,
                  verbose=False,
                  pin_date=None):

    if pin_date:
        datedir = pin_date

    sdir = load_course_sql.find_course_sql_dir(course_id, 
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )

    fn_to_try = ['course.xml.tar.gz',
                'course-prod-analytics.xml.tar.gz',
                'course-prod-edge-analytics.xml.tar.gz',
                'course-prod-edx-replica.xml.tar.gz',
            ]
    
    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        msg = "---> oops, cannot get course content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        raise Exception(msg)

    gpstr, gpfn = read_grading_policy_from_tar_file(fn)
    fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)
    
    ofnb = 'grading_policy.csv'
    ofn = sdir / ofnb
    ofp = open(ofn, 'w')
    cdw = csv.DictWriter(ofp, fieldnames=fields)
    cdw.writeheader()
    cdw.writerows(gptab)
    ofp.close()

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)
    
    # import into BigQuery
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
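
# Example invocation (illustrative only; the arguments shown are hypothetical examples):
#
#   make_gp_table("MITx/6.002x/2013_Spring", basedir="X-Year-2-data-sql",
#                 datedir="2013-09-21", use_dataset_latest=True)
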
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
    extract_subset_tables=True,
):
    '''
    Combine individual person_course tables (from the set of specified course_ids) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same
    fields, concatenating, and uploading back to BigQuery.  This is cheaper than doing a select *, and it also
    uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large
    result sizes).  The result is stored in the course_report_latest dataset (if use_dataset_latest), else
    in course_report_ORG, where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (i.e. "participants") and the subset
    of those who enrolled for IDV are extracted and saved as person_course_viewed and person_course_idv.
    (Those are created using a select *, for efficiency, despite the cost.)
    '''

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not regenerating" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn,
                                                                      firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
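    # BigQuery table name: drop the ".csv" extension and replace dashes with underscores,
    # since dashes are not allowed in BigQuery table names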
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
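
# Example invocation (illustrative only; the course ids and project id are hypothetical examples):
#
#   do_combine(["MITx/6.002x/2013_Spring", "MITx/8.02x/2013_Spring"], "my-gcp-project",
#              use_dataset_latest=True, extract_subset_tables=True)
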
def analyze_course_content(
    course_id,
    listings_file=None,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
    do_upload=False,
    courses=None,
    verbose=True,
    pin_date=None,
):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out the staff and un-shown content. 
    Do the exclusion based on count of module appearing in the studentmodule table, based on 
    stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split(
                '/', 1)[0]  # extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (
                tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset,
                                      tableid,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False,
                                      format='csv',
                                      skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([
            ('[%s.course_metainfo]' % bqutil.course_id2dataset(x))
            for x in courses
        ])
        sql = "select * from {course_tables}".format(
            course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(
                dataset,
                table,
                sql=sql,
                newer_than=datetime.datetime(2015, 1, 16, 3, 0),
            )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)

        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max(
            [x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset,
                                                          latest_person_course)

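        # Note: the listings "Course Wrap" value is assumed to be MM/DD/YYYY, hence the
        # month/day/year order of the regexp extracts in the query below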
        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset,
                                         'nregistered_by_wrap',
                                         sql=sql,
                                         key={'name': 'course_id'})

        # rates for registrants before and during course

        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset,
                                          'stats_cert_rates_by_registration',
                                          sql=sql,
                                          newer_than=datetime.datetime(
                                              2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(
                entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci[
                    'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {
            'msbc_viewed': "viewed_median_stats_by_course",
            'msbc_explored': 'explored_median_stats_by_course',
            'msbc_certified': 'certified_median_stats_by_course',
            'msbc_verified': 'verified_median_stats_by_course',
        }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        listings_keys = map(make_key, [
            "Institution", "Semester", "New or Rerun",
            "Andrew Recodes New/Rerun", "Course Number", "Short Title",
            "Andrew's Short Titles", "Title", "Instructors",
            "Registration Open", "Course Launch", "Course Wrap", "course_id",
            "Empirical Course Wrap", "Andrew's Order", "certifies",
            "MinPassGrade", '4-way Category by name',
            "4-way (CS, STEM, HSocSciGov, HumHistRel)"
        ])
        listings_keys.reverse()

        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = [
            'discussion', 'problem', 'optionresponse', 'checkboxgroup',
            'optioninput', 'choiceresponse', 'video', 'choicegroup',
            'vertical', 'choice', 'sequential', 'multiplechoiceresponse',
            'numericalresponse', 'chapter', 'solution', 'img',
            'formulaequationinput', 'responseparam', 'selfassessment', 'track',
            'task', 'rubric', 'stringresponse', 'combinedopenended',
            'description', 'textline', 'prompt', 'category', 'option', 'lti',
            'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt',
            'comment', 'annotationinput', 'image', 'options', 'comment_prompt',
            'conditional', 'answer', 'poll_question', 'section', 'wrapper',
            'map', 'area', 'customtag', 'transcript', 'split_test',
            'word_cloud', 'openended', 'openendedparam', 'answer_display',
            'code', 'drag_and_drop_input', 'customresponse', 'draggable',
            'mentoring', 'textannotation', 'imageannotation', 'videosequence',
            'feedbackprompt', 'assessments', 'openassessment', 'assessment',
            'explanation', 'criterion'
        ]

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith(
                    'listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (
                    course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [json.loads(x) for x in open(listings_file)]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_combined_course_summary_stats.json' %
                 mypath).read())
        schema_dict = {x['name']: x for x in the_schema}

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in list(entry):  # iterate over a copy, since keys may be removed below
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset,
                                      css_table,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False)

        return

    print "-" * 60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    if listings_file.endswith('.json'):
        data_feed = map(json.loads, open(lfn))
    else:
        data_feed = csv.DictReader(open(lfn))
    for k in data_feed:
        if not 'course_id' in k:
            print "Strange course listings row, no course_id in %s" % k
            raise Exception("Missing course_id")
        if k['course_id'] == course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (
            course_id, lfn)
        return

    def date_parse(field):
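        # listings dates (e.g. "Course Launch", "Course Wrap") are assumed to be MM/DD/YYYY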
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    if pin_date:
        datedir = pin_date
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest and not pin_date)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)

    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (
            xbfn, course_id)

        if use_dataset_latest:
            # try looking in earlier directories for xbundle file
            import glob
            spath = course_dir / ("../*/xbundle_%s.xml" % cfn)
            files = list(glob.glob(spath))
            if files:
                xbfn = path(files[-1])
            if not xbfn.exists():
                print "   --> also cannot find any %s ; aborting!" % spath
            else:
                print "   --> Found and using instead: %s " % xbfn
        if not xbfn.exists():
            raise Exception("[analyze_content] missing xbundle file %s" % xbfn)

    # if there is an xbundle*.fixed file, use that instead of the normal one
    if os.path.exists(str(xbfn) + ".fixed"):
        xbfn = path(str(xbfn) + ".fixed")

    print "[analyze_content] For %s using %s" % (course_id, xbfn)

    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir,
                                    use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()

    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = [
        'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1',
        'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike',
        'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's',
        'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki',
        'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4',
        'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p',
        'P', 'TABLE', 'TD', 'small', 'text', 'title'
    ]

    problem_stats = defaultdict(int)

    def does_problem_have_random_script(problem):
        '''
        return 1 if problem has a script with "random." in it
        else return 0
        '''
        for elem in problem.findall('.//script'):
            if elem.text and ('random.' in elem.text):
                return 1
        return 0

    # walk through xbundle
    def walk_tree(elem, policy=None):
        '''
        Walk XML tree recursively.
        elem = current element
        policy = dict of attributes for children to inherit, with fields like due, graded, showanswer
        '''
        policy = policy or {}
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        if elem.tag in [
                "sequential", "problem", "problemset", "course", "chapter"
        ]:  # very old courses may use inheritance from course & chapter
            keys = ["due", "graded", "format", "showanswer", "start"]
            for k in keys:  # copy inheritable attributes, if they are specified
                val = elem.get(k)
                if val:
                    policy[k] = val
        if elem.tag == "problem":  # accumulate statistics about problems: how many have show_answer = [past_due, closed] ?  have random. in script?
            problem_stats['n_capa_problems'] += 1
            if policy.get('showanswer'):
                problem_stats["n_showanswer_%s" %
                              policy.get('showanswer')] += 1
            else:
                problem_stats[
                    'n_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)
                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118
                # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed.
            problem_stats[
                'n_random_script'] += does_problem_have_random_script(elem)

            if policy.get('graded') == 'true' or policy.get(
                    'graded') == 'True':
                problem_stats['n_capa_problems_graded'] += 1
                problem_stats[
                    'n_graded_random_script'] += does_problem_have_random_script(
                        elem)
                if policy.get('showanswer'):
                    problem_stats["n_graded_showanswer_%s" %
                                  policy.get('showanswer')] += 1
                else:
                    problem_stats[
                        'n_graded_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)

        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    try:
                        print "    -> excluding %s (%s), ncount=%s" % (
                            k.get('display_name',
                                  '<no_display_name>').encode('utf8'), midfrag,
                            mudata.get(midfrag, {}).get('ncount'))
                    except Exception as err:
                        print "    -> excluding ", k
                continue
            walk_tree(k, policy.copy())

    walk_tree(xml)
    print "--> Count of individual element tags throughout XML: ", counts

    print "--> problem_stats:", json.dumps(problem_stats, indent=4)

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = [
        'openassessment',
        'optionresponse',
        'multiplechoiceresponse',
        # 'discussion',
        'choiceresponse',
        'word_cloud',
        'combinedopenended',
        'choiceresponse',
        'stringresponse',
        'textannotation',
        'openended',
        'lti'
    ]
    quant_axis = [
        'formularesponse', 'numericalresponse', 'customresponse',
        'symbolicresponse', 'coderesponse', 'imageresponse'
    ]

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count

    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (
        nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k

    ccd[course_id] = {
        'course_id': course_id,
        'nweeks': nweeks,
        'nqual_per_week': nqual_per_week,
        'nquant_per_week': nquant_per_week,
        'total_assessments_per_week': total_per_week,
    }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = [
        'course_id', 'nquant_per_week', 'total_assessments_per_week',
        'nqual_per_week', 'nweeks'
    ]
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update(
        {make_key('listings_%s' % key): value
         for key, value in data.items()})  # from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    cmfields['filename_xbundle'] = xbfn
    cmfields['filename_listings'] = lfn

    for key in sorted(
            counts
    ):  # store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] = str(value)  # from content counts

    for key in sorted(problem_stats):  # store problem stats
        value = problem_stats[key]
        cmfields['problem_stat_%s' % key] = str(value)

    cmfields.update({('nexcluded_sub_20_%s' % key): str(value)
                     for key, value in nexcluded.items()
                     })  # from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id'] == course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'],
                                                           ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()

    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})

    fp.close()

    # build and output course_listings_and_metainfo

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    mypath = os.path.dirname(os.path.realpath(__file__))
    clm_table = "course_listing_and_metainfo"
    clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table)
    clm_schema = json.loads(open(clm_schema_file).read())

    clm = {}
    for finfo in clm_schema:
        field = finfo['name']
        clm[field] = cmfields.get(field)
    clm_fnb = clm_table + ".json"
    clm_fn = course_dir / clm_fnb
    open(clm_fn, 'w').write(json.dumps(clm))

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / clm_fnb
    print "--> Course listing + metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, clm_table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(clm_fn, gsfnp)
    bqutil.load_data_to_table(dataset,
                              clm_table,
                              gsfnp,
                              clm_schema,
                              wait=True,
                              verbose=False)

    # output course_metainfo

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, table)
    sys.stdout.flush()

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfnp,
                              the_schema,
                              wait=True,
                              verbose=False,
                              format='csv',
                              skiprows=1)
def load_sql_for_course(course_id,
                        gsbucket="gs://x-data",
                        basedir="X-Year-2-data-sql",
                        datedir="2014-09-21",
                        do_gs_copy=False,
                        use_dataset_latest=False):
    '''
    Load SQL files into google cloud storage then import into BigQuery.

    Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "."

    If use_dataset_latest then "_latest" is appended to the dataset name.  
    Thus, the latest SQL dataset can always be put in a consistently named dataset.
    '''
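    # Dataset-naming sketch (illustrative course id, not from the original source):
    #   "MITx/6.002x/2013_Spring"  ->  dataset "MITx__6_002x__2013_Spring"
    #   with use_dataset_latest=True the dataset becomes "MITx__6_002x__2013_Spring_latest"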

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    # convert studentmodule if necessary

    fn_sm = lfp / 'studentmodule.csv.gz'
    if not fn_sm.exists():
        fn_sm = lfp / 'studentmodule.csv'
        if not fn_sm.exists():
            fn_sm = lfp / 'studentmodule.sql.gz'
            if not fn_sm.exists():
                fn_sm = lfp / 'studentmodule.sql'
                if not fn_sm.exists():
                    print "Error!  Missing studentmodule.[sql,csv][.gz]"
            if fn_sm.exists():  # have .sql or .sql.gz version: convert to .csv
                newfn = lfp / 'studentmodule.csv.gz'
                print "--> Converting %s to %s" % (fn_sm, newfn)
                tsv2csv(fn_sm, newfn)
                fn_sm = newfn

    if fn_sm.exists():
        # rephrase studentmodule if it's using opaque keys
        fline = ''
        smfp = openfile(fn_sm)
        fline = smfp.readline()  # skip first line - it's a header
        fline = smfp.readline()
        if 'block-v1:' in fline or 'course-v1' in fline:
            rephrase_studentmodule_opaque_keys(fn_sm)

    def convert_sql(fnroot):
        if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot +
                                                             ".csv.gz"):
            return
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot +
                                                             ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)

    # convert sql files if necessary
    fnset = [
        'users', 'certificates', 'enrollment', "profiles", 'user_id_map',
        'rolecourse', 'roleforum'
    ]
    for fn in fnset:
        convert_sql(lfp / fn)

    local_files = glob.glob(lfp / '*')

    # if using latest date directory, also look for course_image.jpg one level up
    if use_dataset_latest:
        print lfp.dirname()
        ci_files = glob.glob(lfp.dirname() / 'course_image.jpg')
        if ci_files:
            local_files += list(ci_files)
            print "--> local course_image file: %s" % ci_files

    gsdir = gsutil.gs_path_from_course_id(
        course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest)

    local = pytz.timezone("America/New_York")

    if do_gs_copy:
        try:
            fnset = get_gs_file_list(gsdir)
        except Exception as err:
            fnset = []

        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)

            # do some date checking to upload files which have changed, and are newer than that on google cloud storage
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone(pytz.utc)
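            # e.g. (illustrative) a local mtime of 2014-09-21 08:00 America/New_York becomes
            # 2014-09-21 12:00 UTC here, and is compared against the GS object's 'date' below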

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (
                    fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn,
                                     gsdir / fnb,
                                     options=options,
                                     verbose=True)

        for fn in local_files:
            fnb = os.path.basename(fn)
            if fnb == 'course_image.jpg':
                copy_if_newer(fn, fnset, options='-a public-read')
            if not (fnb.endswith('.csv') or fnb.endswith('.json')
                    or fnb.endswith('.csv.gz') or fnb.endswith('.json.gz')
                    or fnb.endswith('.mongo.gz')):
                print "...unknown file type %s, skipping" % fn
                sys.stdout.flush()
                continue
            copy_if_newer(fn, fnset)

    # load into bigquery
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    # load user_info_combo
    uicfn = lfp / 'user_info_combo.json.gz'
    if uicfn.exists():
        uic_schema = json.loads(
            open('%s/schemas/schema_user_info_combo.json' %
                 mypath).read())['user_info_combo']
        bqutil.load_data_to_table(dataset,
                                  'user_info_combo',
                                  gsdir / "user_info_combo.json.gz",
                                  uic_schema,
                                  wait=False)
    else:
        print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn

    # load studentmodule

    if fn_sm.exists():
        schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read())
        cwsm_schema = schemas['courseware_studentmodule']
        bqutil.load_data_to_table(dataset,
                                  'studentmodule',
                                  gsdir / fn_sm.basename(),
                                  cwsm_schema,
                                  format='csv',
                                  wait=False,
                                  skiprows=1)
    else:
        print "--> Not loading studentmodule: file %s not found" % fn_sm
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.
    
    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''
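    # Table-naming sketch (illustrative filename): a file under gs://<bucket>/<course>/DAILY/ whose
    # name contains the date "2014-09-21" is loaded into table "tracklog_20140921", so that
    # queries can use table wildcards over tracklog_* to select date ranges.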

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)
  
    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")
  
    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]
  
    if verbose:
        print "-"*77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-"*77
        sys.stdout.flush()
  
    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size'])<=45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace('-','')	# YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)
  
        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date)
                    skip = False
                    
            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue
  
        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000)
  
    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
def load_sql_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir="2014-09-21", 
                        do_gs_copy=False,
                        use_dataset_latest=False):
    '''
    Load SQL files into google cloud storage then import into BigQuery.

    Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "."

    If use_dataset_latest then "_latest" is appended to the dataset name.  
    Thus, the latest SQL dataset can always be put in a consistently named dataset.
    '''
    
    print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()
                          
    # convert studentmodule if necessary

    fn_sm = lfp / 'studentmodule.csv.gz'
    if not fn_sm.exists():
        fn_sm = lfp / 'studentmodule.csv'
        if not fn_sm.exists():
            fn_sm = lfp / 'studentmodule.sql.gz'
            if not fn_sm.exists():
                fn_sm = lfp / 'studentmodule.sql'
                if not fn_sm.exists():
                    print "Error!  Missing studentmodule.[sql,csv][.gz]"
            if fn_sm.exists():	# have .sql or .sql.gz version: convert to .csv
                newfn = lfp / 'studentmodule.csv.gz'
                print "--> Converting %s to %s" % (fn_sm, newfn)
                tsv2csv(fn_sm, newfn)
                fn_sm = newfn

    if fn_sm.exists():
        # rephrase studentmodule if it's using opaque keys
        fline = ''
        smfp = openfile(fn_sm)
        fline = smfp.readline()	# skip first line - it's a header
        fline = smfp.readline()
        if 'block-v1:' in fline or 'course-v1' in fline:
            rephrase_studentmodule_opaque_keys(fn_sm)

    def convert_sql(fnroot):
        if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot + ".csv.gz"):
            return
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)

    # convert sql files if necessary
    fnset = ['users', 'certificates', 'enrollment', "profiles", 'user_id_map', 'rolecourse', 'roleforum']
    for fn in fnset:
        convert_sql(lfp / fn)

    local_files = glob.glob(lfp / '*')

    # if using latest date directory, also look for course_image.jpg one level up
    if use_dataset_latest:
        print lfp.dirname()
        ci_files = glob.glob(lfp.dirname() / 'course_image.jpg')
        if ci_files:
            local_files += list(ci_files)
            print "--> local course_image file: %s" % ci_files

    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest)

    local = pytz.timezone("America/New_York")

    if do_gs_copy:
        try:
            fnset = get_gs_file_list(gsdir)
        except Exception as err:
            fnset = []
        
        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)
            
            # do some date checking to upload files which have changed, and are newer than that on google cloud storage
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone(pytz.utc)

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True)

        for fn in local_files:
            fnb = os.path.basename(fn)
            if fnb=='course_image.jpg':
                copy_if_newer(fn, fnset, options='-a public-read')
            if not (fnb.endswith('.csv') or fnb.endswith('.json') or fnb.endswith('.csv.gz') 
                    or fnb.endswith('.json.gz') or fnb.endswith('.mongo.gz')):
                print "...unknown file type %s, skipping" % fn
                sys.stdout.flush()
                continue
            copy_if_newer(fn, fnset)

    # load into bigquery
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    # load user_info_combo
    uicfn = lfp / 'user_info_combo.json.gz'
    if uicfn.exists():
        uic_schema = json.loads(open('%s/schemas/schema_user_info_combo.json' % mypath).read())['user_info_combo']
        bqutil.load_data_to_table(dataset, 'user_info_combo', gsdir / "user_info_combo.json.gz", uic_schema, wait=False)
    else:
        print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn
    
    # load studentmodule
                
    if fn_sm.exists():
        schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read())
        cwsm_schema = schemas['courseware_studentmodule']
        bqutil.load_data_to_table(dataset, 'studentmodule', gsdir / fn_sm.basename(), cwsm_schema, format='csv', wait=False, skiprows=1)
    else:
        print "--> Not loading studentmodule: file %s not found" % fn_sm
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0,
               output_project_id=None, output_dataset_id=None, output_bucket=None,
               use_dataset_latest=False,
               extract_subset_tables=True,
               ):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same
    fields, concatenating, and uploading back to bigquery.  This is cheaper than doing a select *, and also
    uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large
    result sizes).  The result is stored in the course_report_latest dataset (if use_dataset_latest), else 
    in course_report_ORG, where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (i.e. "participants") and the subset
    of those who enrolled for IDV are extracted and saved as person_course_viewed and person_course_idv
    (those are created using a select *, for efficiency, despite the cost).
    '''
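    # Sketch of the flow below (illustrative names, assuming org "MITx"): each course's person_course.csv.gz
    # is downloaded to <outdir>/person_course_<course_id with '/' -> '__'>.csv.gz; the files are concatenated
    # keeping only the first header row, producing person_course_MITx_<YYYY-MM-DD-HHMMSS>.csv, which is
    # uploaded to google storage and loaded as table person_course_MITx_<YYYY_MM_DD_HHMMSS> in
    # course_report_MITx (or course_report_latest / output_dataset_id, if specified).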

    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)
        
    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/',1)[0]

    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn, firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-','_')
    
    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)
    
    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0,
               output_project_id=None, output_dataset_id=None, output_bucket=None,
               use_dataset_latest=False,
               ):

    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)
        
    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/',1)[0]

    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-','_')
    
    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)
    
    print "Done"
    sys.stdout.flush()
def upload_grades_persistent_data(cid,
                                  basedir,
                                  datedir,
                                  use_dataset_latest=False,
                                  subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage,
    create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(
        gsutil.gs_path_from_course_id(cid,
                                      use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(
        cid,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=(use_dataset_latest),
    )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(
        open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn,
                                           tempfn,
                                           field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(
        dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
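# Hedged usage sketch (hypothetical course id and directories, not from the original source):
# load the persistent course grades for one course, then its subsection grades.
#
#   upload_grades_persistent_data("MITx/6.002x/2013_Spring",
#                                 basedir="X-Year-2-data-sql",
#                                 datedir="2014-09-21",
#                                 use_dataset_latest=True,
#                                 subsection=False)
#   upload_grades_persistent_data("MITx/6.002x/2013_Spring",
#                                 basedir="X-Year-2-data-sql",
#                                 datedir="2014-09-21",
#                                 use_dataset_latest=True,
#                                 subsection=True)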
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute,
                     use_dataset_latest):
    '''
    Create video stats for videos viewed and videos watched.
    First create a video axis, based on the course axis. Then use the tracking logs to count up videos viewed and videos watched.
    '''
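    # Pipeline sketch (as implemented below): (1) build TABLE_VIDEO_AXIS from the course axis,
    # (2) fill in video durations via the YouTube API using api_key, then (3) compute the
    # per-day and overall videos-viewed / videos-watched stats tables.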

    assert api_key is not None, "[analyze videos]: Public API Key is missing from the configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate a public key, then add it to edx2bigquery_config.py as the API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)

    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    the_schema = json.loads(open(SCHEMA_FILE).read())[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % (
            dataset, TABLE_VIDEO_AXIS)
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % (str(err),
                                                         TABLE_VIDEO_AXIS)
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']  # datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (
            ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (
            va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(
            course_id,
            use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS)

    # Lastly, create video stats
    createVideoStats_day(course_id,
                         force_recompute=force_recompute,
                         use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id,
                     force_recompute=force_recompute,
                     use_dataset_latest=use_dataset_latest)
def analyze_course_content(course_id, 
                           listings_file=None,
                           basedir="X-Year-2-data-sql", 
                           datedir="2013-09-21", 
                           use_dataset_latest=False,
                           do_upload=False,
                           courses=None,
                           verbose=True,
                           ):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out staff-only and un-shown content.
    Do the exclusion based on count of module appearing in the studentmodule table, based on 
    stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''
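    # Output sketch: the per-course metainfo assembled below is written as long-format
    # (course_id, key, value) rows, with keys such as "nweeks", "count_video", and
    # "listings_Course Launch" (key names taken from the code below; values vary per course).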

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split('/',1)[0]	# extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False,
                                      format='csv', skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses])
        sql = "select * from {course_tables}".format(course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)


        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max([ x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset, latest_person_course)
        
        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'})

        # rates for registrants before and during course
        
        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci['certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course",
                       'msbc_explored': 'explored_median_stats_by_course',
                       'msbc_certified': 'certified_median_stats_by_course',
                       'msbc_verified': 'verified_median_stats_by_course',
                       }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        def make_key(key):
            key = key.strip()
            key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(','').replace(')','').replace('-', '_').replace(',', '')
            return key

        listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", 
                                       "Course Number", "Short Title", "Andrew's Short Titles", "Title", 
                                       "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id",
                                       "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade",
                                       '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)"
                                       ])
        listings_keys.reverse()
        
        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 
                          'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 
                          'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 
                          'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 
                          'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 
                          'option', 'lti', 'annotationresponse', 
                          'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 
                          'options', 'comment_prompt', 'conditional', 
                          'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 
                          'customtag', 'transcript', 
                          'split_test', 'word_cloud', 
                          'openended', 'openendedparam', 'answer_display', 'code', 
                          'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 
                          'textannotation', 'imageannotation', 'videosequence', 
                          'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion']

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith('listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [ json.loads(x) for x in open(listings_file) ]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read())
        schema_dict = { x['name'] : x for x in the_schema }

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in entry:
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False)

        return

    
    print "-"*60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    for k in csv.DictReader(open(lfn)):
        if k['course_id']==course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)
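    # e.g. (illustrative) a listings value of "9/5/2013" is month/day/year -> datetime.datetime(2013, 9, 5)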

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)
    
    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id)
        return

    print "[analyze_content] For %s using %s" % (course_id, xbfn)
    
    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()
    
    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong',
              'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata',
              'grading_policy', 'br', 'center',  'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 
              'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title']

    def walk_tree(elem):
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    print "    -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'), 
                                                                   midfrag, 
                                                                   mudata.get(midfrag, {}).get('ncount'))
                continue
            walk_tree(k)

    walk_tree(xml)
    print counts

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse', 
                 # 'discussion', 
                 'choiceresponse', 'word_cloud', 
                 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti']
    quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse',
                  'imageresponse']

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count
    
    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k
    
    ccd[course_id] = {'course_id': course_id,
                      'nweeks': nweeks,
                      'nqual_per_week': nqual_per_week,
                      'nquant_per_week': nquant_per_week,
                      'total_assessments_per_week' : total_per_week,
                      }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks']
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in the course_metainfo table, which has one (course_id, key, value) row per line;
    # keys include nweeks, nqual, nquant, and count_<tag> for each content module type <tag>
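    # e.g. a typical row might be (course_id, 'count_video', '42') -- the values shown are illustrative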

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update({ ('listings_%s' % key) : value for key, value in data.items() })	# from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    for key in sorted(counts):	# store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] =  str(value) 	# from content counts

    cmfields.update({ ('nexcluded_sub_20_%s' % key) : str(value) for key, value in nexcluded.items() })	# from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id']==course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'], ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    
    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})
        
    fp.close()

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table)

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
Example #19
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
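        # Fall back to the gzipped version if the plain file is absent; returns a
        # gzip or regular file object accordingly.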
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"
    ofncsv = "forum.csv.gz"  # To match table name in BQ
    ofncsv_lfp = lfp / ofncsv

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn) and os.path.exists(ofncsv_lfp):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp)
            sys.stdout.flush()
            return

    print "Processing %s -> writing to %s and %s (%s)" % (
        fn, ofn, ofncsv, datetime.datetime.now())
    sys.stdout.flush()

    # Setup CSV header
    ocsv = csv.DictWriter(openfile(ofncsv, 'w'),
                          fieldnames=SCHEMA_DICT.keys(),
                          quoting=csv.QUOTE_NONNUMERIC)
    ocsv.writeheader()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    data = OrderedDict()
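    # data is pre-defined so the error handler below has something bound to print
    # even if json.loads fails (in that case it may show the previous row's data)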
    for line in fp:
        cnt += 1
        # Write JSON row
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)

        try:
            #Write CSV row
            data = json.loads(newline)
            ocsv.writerow(data)
        except Exception as err:
            print "Error writing CSV output row %s=%s" % (cnt, data)
            raise

    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do the upload twice, because GCS file metadata doesn't always seem to make it to BigQuery right away
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to BigQuery.

    cid = course_id
    caset_in = list of course axis entries, each a dict (deep-copied before modification)
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    use_dataset_latest = if True, use the most recent ("latest") dataset
    '''

    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json' 
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__')))
    fp = open(cafn, 'w')
    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not type(data)==dict:
            try:
                ca['data'] = json.loads(data)	# make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])	# datetime to string
        if  ca['due'] is not None:
            ca['due'] = str(ca['due'])	# datetime to string
        if (ca['data'] is None) or (ca['data']==''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca)+'\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:  # always upload (kept as a simple on/off toggle)
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)	# create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]		# stay under the 16384-character table description limit
    
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched
    '''
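    # Pipeline below: (re)build the video axis table if needed, then compute the
    # per-day and overall video stats, plus person_course_video_watched.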

    assert api_key is not None, "[analyze videos]: Public API key is missing from the configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate a public key, then add it to edx2bigquery_config.py as the API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/','__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir)
    
    # get schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % ( mypath, SCHEMA_VIDEO_AXIS )
    the_schema = json.loads(open(SCHEMA_FILE).read())[ SCHEMA_VIDEO_AXIS_NAME ]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS )
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS )
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % ( str(err), TABLE_VIDEO_AXIS )
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS )
        ca_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        pass

    # If the existing video axis is older than the course axis, it is stale -- force a recompute
    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute )

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS )

    # Lastly, create video stats
    createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
    createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

    # also create person_course_video_watched
    createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
Example #22
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do the upload twice, because GCS file metadata doesn't always seem to make it to BigQuery right away
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
):

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
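    # Download each course's person_course.csv.gz from GCS.  Skip the download if
    # nskip > 0 and a local copy exists, or if the local copy is at least as new as
    # the GCS file.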
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
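    # derive the table name from the output filename: drop the ".csv" suffix and
    # replace '-' with '_' to form the BigQuery table name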
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    print "Done"
    sys.stdout.flush()
Example #25
def load_all_daily_logs_for_course(course_id,
                                   gsbucket="gs://x-data",
                                   verbose=True,
                                   wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.
    
    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(
        open('%s/schemas/schema_tracking_log.json' %
             mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)
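    # One tracking-log file per day is expected under .../DAILY/, with the log date
    # (YYYY-MM-DD) somewhere in the filename (see the regex below).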

    fnset = gsutil.get_gs_file_list(gsdir)

    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")

    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]

    if verbose:
        print "-" * 77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-" * 77
        sys.stdout.flush()

    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:  # files this small are effectively empty
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace(
            '-', '')  # YYYYMMDD for compatibility with table wildcards
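        # e.g. the log for 2014-07-27 is loaded into table tracklog_20140727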

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)

        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(
                    dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (
                        tablename, fn, file_date, table_date)
                    skip = False

            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (
                        tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue

        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset,
                                        tablename,
                                        gsfn,
                                        SCHEMA,
                                        wait=wait,
                                        maxbad=1000)

    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()