def extract_logs_mongo2gs(course_id, start="2012-09-01", end="2014-09-24", verbose=False,
                          dbname=DBNAME,
                          collection='tracking_log',
                          tracking_logs_directory="TRACKING_LOGS",
                          ):

    print "extracting logs for course %s" % course_id

    # list of dates to dump
    dates = daterange(d2dt(start), d2dt(end))
    
    if verbose:
        print "Dates to dump:", [x['dstr'] for x in dates]

    # what files already on gs?
    gspath = "%s/DAILY" % gs_path_from_course_id(course_id)
    gsfiles = get_gs_file_list(gspath)

    DIR = tracking_logs_directory
    if not os.path.exists(DIR):
        os.mkdir(DIR)
    DIR += '/' + path_from_course_id(course_id)
    if not os.path.exists(DIR):
        os.mkdir(DIR)

    filebuf = []
    for k in range(len(dates) - 1):

        d = dates[k]
        ofn = '%s/tracklog-%s.json.gz' % (DIR, d['dstr'])
        start = d['start']
        end = d['end']

        ofnb = os.path.basename(ofn)

        if ofnb in gsfiles:
            print "Already have %s, skipping" % ofnb
            sys.stdout.flush()
            continue

        # dump the tracking log for this date using mongoexport, if not already present locally
        if not os.path.exists(ofn):
            # db.tracking_log.find({'course_id': "HarvardX/ER22x/2013_Spring", 
            #                       'time': { '$gte': "2013-08-01T00:00:00.000000", '$lt': "2013-08-02T00:00:00.000000" }}).count()
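            # mongoexport dumps the matching JSON documents; "edx2bigquery
            # rephrase_logs" then rewrites each line so it fits the fixed
            # tracking-log schema used for the BigQuery loads (see
            # schema_tracking_log.json below), and gzip -9 compresses the output.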
            query = '{"course_id": "%s", "time": {"$gte": "%s", "$lt": "%s" }}' % (course_id, start, end)
            cmd = "mongoexport -d %s -c %s -q '%s'| edx2bigquery rephrase_logs | gzip -9 > %s" % (dbname, collection, query, ofn)
            # print cmd
            os.system(cmd)
        
        upload_file_to_gs(ofn, gspath + '/' + ofnb)

        filebuf.append(ofn)

        if len(filebuf)>20:
            ffn = filebuf.pop(0)
            os.unlink(ffn)
            print "...Deleted %s" % ffn
            sys.stdout.flush()
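
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source).  extract_logs_mongo2gs
# relies on helpers d2dt() and daterange(), assumed to expand a pair of
# "YYYY-MM-DD" strings into one-day windows, each a dict with 'dstr', 'start'
# and 'end' keys matching the usage above.  Minimal stand-ins consistent with
# that assumption (names prefixed with example_ to mark them as hypothetical):

import datetime   # may already be imported at module level


def example_d2dt(datestr):
    # "2013-08-01" -> datetime.datetime(2013, 8, 1, 0, 0)
    return datetime.datetime.strptime(datestr, '%Y-%m-%d')


def example_daterange(start_dt, end_dt):
    # build one dict per day; the caller above stops at len(dates) - 1, so the
    # final entry only serves as the exclusive end boundary of the last day
    dates = []
    dt = start_dt
    while dt <= end_dt:
        nxt = dt + datetime.timedelta(days=1)
        dates.append({'dstr': dt.strftime('%Y-%m-%d'),
                      'start': dt.strftime('%Y-%m-%dT00:00:00.000000'),
                      'end': nxt.strftime('%Y-%m-%dT00:00:00.000000')})
        dt = nxt
    return dates

# Hypothetical invocation (the course_id and date range are placeholders):
#
#   extract_logs_mongo2gs("HarvardX/ER22x/2013_Spring",
#                         start="2013-08-01", end="2013-08-08", verbose=True)
# ---------------------------------------------------------------------------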
def process_dir(course_id,
                gspath='gs://x-data',
                logs_directory="TRACKING_LOGS",
                verbose=True):

    cdir = path(logs_directory) / gsutil.path_from_course_id(course_id)

    print "=" * 77
    print "Transferring tracking logs for %s from directory %s (start %s)" % (
        course_id, cdir, datetime.datetime.now())
    print "=" * 77

    if not os.path.exists(cdir):
        print "Oops!  non-existent course tracking logs directory %s" % cdir
        return

    sys.stdout.flush()
    cdir = path(cdir)
    gp = path(gspath + "/" + cdir.basename()) / 'DAILY'
    filelist = gsutil.get_gs_file_list(gp)
    # print filelist
    local_files = glob.glob(cdir / 'tracklog*.gz')
    local_files.sort()

    for fn in local_files:
        fnp = path(fn)
        fnb = fnp.basename()
        statbuf = os.stat(fn)
        mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)

        # Upload only log files which have changed and are newer than the copy
        # already on Google Cloud Storage; convert the local (America/New_York)
        # mtime to UTC so the timestamps are comparable.
        local = pytz.timezone("America/New_York")
        local_dt = local.localize(mt, is_dst=None)
        utc_dt = local_dt.astimezone(pytz.utc)

        if fnb in filelist and filelist[fnb]['date'] > utc_dt:
            if verbose:
                print "%s already exists, skipping" % fn
            continue
        elif fnb in filelist:
            print "%s already exists, but has date=%s and mtime=%s, re-uploading" % (
                fn, filelist[fnb]['date'], mt)
        cmd = 'gsutil cp %s %s' % (fn, gp + '/')
        print cmd
        sys.stdout.flush()
        os.system(cmd)

    print "done with %s (%s)" % (cdir, datetime.datetime.now())
    print "-" * 77
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
):

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    print "Done"
    sys.stdout.flush()
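
# Illustrative sketch (not part of the original source).  The zcat/tail
# pipeline above keeps the header row only from the first downloaded CSV and
# strips it from the rest.  A pure-Python equivalent of that concatenation
# step, under the same assumption that every person_course_*.csv.gz shares a
# single header line, would be:

import gzip


def example_concat_csv_gz(infiles, outfn):
    # write the first file whole, then each later file minus its header line
    with open(outfn, 'w') as out:
        for i, zfn in enumerate(infiles):
            with gzip.open(str(zfn)) as zf:
                for j, line in enumerate(zf):
                    if i > 0 and j == 0:
                        continue    # skip the repeated header row
                    out.write(line)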
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
    extract_subset_tables=True,
):
    '''
    Combine individual person_course tables (from the set of specified course_ids) to create one single
    large person_course table.  Do this by downloading each file, checking that they all have the same
    fields, concatenating, and uploading the result back to BigQuery.  This is cheaper than doing a
    "select *", and it also uncovers person_course files which have the wrong schema (and it works
    around BigQuery's limit on large result sizes).  The result is stored in the course_report_latest
    dataset (if use_dataset_latest), else in course_report_ORG, where ORG is the configured
    organization name.

    If extract_subset_tables is True, then the subset of those who viewed (i.e. "participants") and the
    subset of those who enrolled for IDV are extracted and saved as person_course_viewed and
    person_course_idv.  (Those are created using a "select *", for efficiency, despite the cost.)
    '''

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn,
                                                                      firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
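
# Hypothetical driver (not part of the original source): the course ids,
# project id and bucket below are placeholders, not values from this repo.

def example_combine_person_course():
    do_combine(
        ["HarvardX/CB22x/2013_Spring", "HarvardX/ER22x/2013_Spring"],
        project_id="my-bq-project",              # placeholder project
        output_bucket="gs://my-output-bucket",   # placeholder bucket
        use_dataset_latest=True,
        extract_subset_tables=True,
    )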
def load_sql_for_course(course_id,
                        gsbucket="gs://x-data",
                        basedir="X-Year-2-data-sql",
                        datedir="2014-09-21",
                        do_gs_copy=False,
                        use_dataset_latest=False):
    '''
    Load SQL files into Google Cloud Storage, then import them into BigQuery.

    Datasets are typically named by course_id, with "__" replacing "/" and "_" replacing ".".

    If use_dataset_latest is set, then "_latest" is appended to the dataset name, so the latest
    SQL dataset can always be found under a consistently named dataset.
    '''

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    # convert studentmodule if necessary

    fn_sm = lfp / 'studentmodule.csv.gz'
    if not fn_sm.exists():
        fn_sm = lfp / 'studentmodule.csv'
        if not fn_sm.exists():
            fn_sm = lfp / 'studentmodule.sql.gz'
            if not fn_sm.exists():
                fn_sm = lfp / 'studentmodule.sql'
                if not fn_sm.exists():
                    print "Error!  Missing studentmodule.[sql,csv][.gz]"
            if fn_sm.exists():  # have .sql or .sql.gz version: convert to .csv
                newfn = lfp / 'studentmodule.csv.gz'
                print "--> Converting %s to %s" % (fn_sm, newfn)
                tsv2csv(fn_sm, newfn)
                fn_sm = newfn

    if fn_sm.exists():
        # rephrase studentmodule if it's using opaque keys
        fline = ''
        smfp = openfile(fn_sm)
        fline = smfp.readline()  # skip first line - it's a header
        fline = smfp.readline()
        if 'block-v1:' in fline or 'course-v1' in fline:
            rephrase_studentmodule_opaque_keys(fn_sm)

    def convert_sql(fnroot):
        if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot +
                                                             ".csv.gz"):
            return
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot +
                                                             ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)

    # convert sql files if necessary
    fnset = [
        'users', 'certificates', 'enrollment', 'profiles', 'user_id_map',
        'rolecourse', 'roleforum'
    ]
    for fn in fnset:
        convert_sql(lfp / fn)

    local_files = glob.glob(lfp / '*')

    # if using latest date directory, also look for course_image.jpg one level up
    if use_dataset_latest:
        print lfp.dirname()
        ci_files = glob.glob(lfp.dirname() / 'course_image.jpg')
        if ci_files:
            local_files += list(ci_files)
            print "--> local course_image file: %s" % ci_files

    gsdir = gsutil.gs_path_from_course_id(
        course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest)

    local = pytz.timezone("America/New_York")

    if do_gs_copy:
        try:
            fnset = get_gs_file_list(gsdir)
        except Exception as err:
            fnset = []

        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)

            # upload only files which have changed and are newer than the copy
            # already on Google Cloud Storage (compare mtimes as UTC)
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone(pytz.utc)

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (
                    fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn,
                                     gsdir / fnb,
                                     options=options,
                                     verbose=True)

        for fn in local_files:
            fnb = os.path.basename(fn)
            if fnb == 'course_image.jpg':
                copy_if_newer(fn, fnset, options='-a public-read')
                continue
            if not (fnb.endswith('.csv') or fnb.endswith('.json')
                    or fnb.endswith('.csv.gz') or fnb.endswith('.json.gz')
                    or fnb.endswith('.mongo.gz')):
                print "...unknown file type %s, skipping" % fn
                sys.stdout.flush()
                continue
            copy_if_newer(fn, fnset)

    # load into bigquery
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    # load user_info_combo
    uicfn = lfp / 'user_info_combo.json.gz'
    if uicfn.exists():
        uic_schema = json.loads(
            open('%s/schemas/schema_user_info_combo.json' %
                 mypath).read())['user_info_combo']
        bqutil.load_data_to_table(dataset,
                                  'user_info_combo',
                                  gsdir / "user_info_combo.json.gz",
                                  uic_schema,
                                  wait=False)
    else:
        print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn

    # load studentmodule

    if fn_sm.exists():
        schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read())
        cwsm_schema = schemas['courseware_studentmodule']
        bqutil.load_data_to_table(dataset,
                                  'studentmodule',
                                  gsdir / fn_sm.basename(),
                                  cwsm_schema,
                                  format='csv',
                                  wait=False,
                                  skiprows=1)
    else:
        print "--> Not loading studentmodule: file %s not found" % fn_sm
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for a course from Google Cloud Storage into BigQuery.

    If wait=True then wait for the load jobs to complete.  Waiting is desirable if
    subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)
  
    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")
  
    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]
  
    if verbose:
        print "-"*77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-"*77
        sys.stdout.flush()
  
    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:
            print "Essentially empty file %s (size <= 45 bytes), skipping" % fn
            continue

        m = re.search(r'(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace('-','')	# YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)
  
        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date)
                    skip = False
                    
            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue
  
        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000)
  
    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
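
# Hypothetical driver (not part of the original source): the course_id is a
# placeholder.  Tables named tracklog_YYYYMMDD can later be queried as a group
# with BigQuery's legacy-SQL table wildcards, e.g. a query along the lines of
# TABLE_DATE_RANGE([<logs_dataset>.tracklog_], TIMESTAMP('2014-07-01'),
# TIMESTAMP('2014-07-31')) selects one month of logs.

def example_load_daily_logs():
    # wait=True so downstream jobs (e.g. person_day) can run immediately after
    load_all_daily_logs_for_course("HarvardX/ER22x/2013_Spring",
                                   gsbucket="gs://x-data",
                                   wait=True)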