def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage,
    create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(cid,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest),
                                               )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
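
# Hedged usage sketch (not from the original source): the course id and
# directory names below are hypothetical placeholders showing how the loader
# above is typically driven for both the course-level and subsection-level
# grade tables.
def example_upload_grades_persistent():
    cid = "MITx/6.002x/2013_Spring"  # hypothetical course id
    upload_grades_persistent_data(cid, "X-Year-2-data-sql", "2014-09-21",
                                  use_dataset_latest=True, subsection=False)
    upload_grades_persistent_data(cid, "X-Year-2-data-sql", "2014-09-21",
                                  use_dataset_latest=True, subsection=True)
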
def upload_grades_persistent_data(cid,
                                  basedir,
                                  datedir,
                                  use_dataset_latest=False,
                                  subsection=False):
    """Upload grades_persistent csv.gz to Google Storage, create the BigQuery table, then insert the data into the table

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(
        gsutil.gs_path_from_course_id(cid,
                                      use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    csvfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, csv_name)
    tempfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, temp_name)

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(
        open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not subsection:
        remove_nulls_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(
        dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def do_course_listings(course_listings_fn):
    dataset = 'courses'
    table = 'listings'
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    gsfn = gsutil.gs_path_from_course_id('courses') / 'listings.csv'
    gsutil.upload_file_to_gs(course_listings_fn, gsfn)

    schema = json.loads(open('%s/schemas/schema_course_listings.json' % mypath).read())['course_listings']
    bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)
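
# Sketch of the schema-file layout that do_course_listings (and the other
# loaders here) assume: each schemas/schema_<name>.json file maps the table
# name to a BigQuery field list.  The fields below are illustrative only,
# not the project's actual course_listings schema.
EXAMPLE_LISTINGS_SCHEMA = {
    "course_listings": [
        {"name": "course_id", "type": "STRING", "mode": "NULLABLE"},
        {"name": "title", "type": "STRING", "mode": "NULLABLE"},
        {"name": "course_launch", "type": "STRING", "mode": "NULLABLE"},
    ]
}
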
    def write_geoip_table(self):
        '''
        Write out the geoipdat table if nchanged > 0
        '''
        if not self.nchanged:
            return

        ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000)
        print "--> new entries added to geoipdat, writing to %s" % (ofn)
        sys.stdout.flush()

        ofp = codecs.open(ofn, 'w', encoding='utf8')
        for key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val) + '\n')
            except Exception as err:
                print "Error!  %s" % err
                sys.stdout.write(repr(val))
                raise
        ofp.close()

        lock_file(self.gipfn)
        try:
            print "--> renaming %s to %s" % (ofn, self.gipfn)
            sys.stdout.flush()
            os.rename(ofn, self.gipfn)
        except Exception as err:
            print "Error %s in renaming gipfn" % str(err)
        lock_file(self.gipfn, release=True)

        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_extra_geoip.json' %
                 mypath).read())['extra_geoip']

        gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
        print "--> Uploading %s to %s" % (self.gipfn, gsp)
        sys.stdout.flush()
        gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

        print "--> Importing %s to %s" % (gsp, self.giptable)
        sys.stdout.flush()
        try:
            bqutil.create_dataset_if_nonexistent(self.gipdataset)
        except Exception as err:
            print "--> Warning: failed to create %s, err=%s" % (gsp, err)
        try:
            bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp,
                                      the_schema)
        except Exception as err:
            print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (
                gsp, self.gipdataset, self.giptable, err)
            print "---> Continuing anyway"
            sys.stdout.flush()
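
# write_geoip_table() above relies on a lock_file() helper defined elsewhere
# in the module.  A minimal sketch of what such a helper could look like,
# assuming a simple fcntl-based advisory lock on a sidecar ".lock" file (the
# real helper in edx2bigquery may differ):
import fcntl

_lock_handles = {}

def lock_file_sketch(fn, release=False):
    '''Acquire (release=False) or release (release=True) an exclusive lock for fn.'''
    lockfn = fn + '.lock'
    if release:
        fp = _lock_handles.pop(lockfn, None)
        if fp is not None:
            fcntl.flock(fp, fcntl.LOCK_UN)
            fp.close()
        return
    fp = open(lockfn, 'w')
    fcntl.flock(fp, fcntl.LOCK_EX)
    _lock_handles[lockfn] = fp
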
def load_sql_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir="2014-09-21", 
                        do_gs_copy=False,
                        use_dataset_latest=False):
    '''
    Load SQL files into google cloud storage then import into BigQuery.

    Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "."

    If use_dataset_latest then "_latest" is appended to the dataset name.  
    Thus, the latest SQL dataset can always be put in a consistently named dataset.
    '''
    
    print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()
                          
    # convert studentmodule if necessary

    fn_sm = lfp / 'studentmodule.csv.gz'
    if not fn_sm.exists():
        fn_sm = lfp / 'studentmodule.csv'
        if not fn_sm.exists():
            fn_sm = lfp / 'studentmodule.sql.gz'
            if not fn_sm.exists():
                fn_sm = lfp / 'studentmodule.sql'
                if not fn_sm.exists():
                    print "Error!  Missing studentmodule.[sql,csv][.gz]"
            if fn_sm.exists():	# have .sql or .sql.gz version: convert to .csv
                newfn = lfp / 'studentmodule.csv.gz'
                print "--> Converting %s to %s" % (fn_sm, newfn)
                tsv2csv(fn_sm, newfn)
                fn_sm = newfn

    if fn_sm.exists():
        # rephrase studentmodule if it's using opaque keys
        fline = ''
        smfp = openfile(fn_sm)
        fline = smfp.readline()	# skip first line - it's a header
        fline = smfp.readline()
        if 'block-v1:' in fline or 'course-v1' in fline:
            rephrase_studentmodule_opaque_keys(fn_sm)

    def convert_sql(fnroot):
        if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot + ".csv.gz"):
            return
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)

    # convert sql files if necessary
    fnset = ['users', 'certificates', 'enrollment', "profiles", 'user_id_map', 'rolecourse', 'roleforum']
    for fn in fnset:
        convert_sql(lfp / fn)

    local_files = glob.glob(lfp / '*')

    # if using latest date directory, also look for course_image.jpg one level up
    if use_dataset_latest:
        print lfp.dirname()
        ci_files = glob.glob(lfp.dirname() / 'course_image.jpg')
        if ci_files:
            local_files += list(ci_files)
            print "--> local course_image file: %s" % ci_files

    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest)

    local = pytz.timezone("America/New_York")

    if do_gs_copy:
        try:
            fnset = get_gs_file_list(gsdir)
        except Exception as err:
            fnset = []
        
        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)
            
            # do some date checking to upload files which have changed, and are newer than that on google cloud storage
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone(pytz.utc)

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True)

        for fn in local_files:
            fnb = os.path.basename(fn)
            if fnb=='course_image.jpg':
                copy_if_newer(fn, fnset, options='-a public-read')
            if not (fnb.endswith('.csv') or fnb.endswith('.json') or fnb.endswith('.csv.gz') 
                    or fnb.endswith('.json.gz') or fnb.endswith('.mongo.gz')):
                print "...unknown file type %s, skipping" % fn
                sys.stdout.flush()
                continue
            copy_if_newer(fn, fnset)

    # load into bigquery
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    # load user_info_combo
    uicfn = lfp / 'user_info_combo.json.gz'
    if uicfn.exists():
        uic_schema = json.loads(open('%s/schemas/schema_user_info_combo.json' % mypath).read())['user_info_combo']
        bqutil.load_data_to_table(dataset, 'user_info_combo', gsdir / "user_info_combo.json.gz", uic_schema, wait=False)
    else:
        print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn
    
    # load studentmodule
                
    if fn_sm.exists():
        schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read())
        cwsm_schema = schemas['courseware_studentmodule']
        bqutil.load_data_to_table(dataset, 'studentmodule', gsdir / fn_sm.basename(), cwsm_schema, format='csv', wait=False, skiprows=1)
    else:
        print "--> Not loading studentmodule: file %s not found" % fn_sm
def rephrase_forum_json_for_course(course_id, gsbucket="gs://x-data", 
                                   basedir="X-Year-2-data-sql", 
                                   datedir=None, 
                                   do_gs_copy=False,
                                   use_dataset_latest=False,
                                   ):
    
    print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest)
    
    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt==0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
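
# The conversion above leans on do_rephrase_line(), defined elsewhere in
# edx2bigquery.  As a rough, hypothetical illustration of the kind of
# rewriting involved -- not the project's actual implementation -- a
# rephraser typically flattens mongoexport extended-JSON wrappers such as
# {"$oid": ...} and {"$date": ...} into plain scalar fields, since BigQuery
# column names cannot contain "$" or ".":
import json

def rephrase_line_sketch(line):
    def fix(value):
        if isinstance(value, dict):
            if len(value) == 1 and '$oid' in value:
                return value['$oid']
            if len(value) == 1 and '$date' in value:
                return value['$date']
            return dict((k.replace('$', '').replace('.', '_'), fix(v))
                        for k, v in value.items())
        if isinstance(value, list):
            return [fix(v) for v in value]
        return value
    return json.dumps(fix(json.loads(line))) + '\n'
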
def load_all_daily_logs_for_course(course_id,
                                   gsbucket="gs://x-data",
                                   verbose=True,
                                   wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.
    
    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(
        open('%s/schemas/schema_tracking_log.json' %
             mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)

    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")

    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]

    if verbose:
        print "-" * 77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-" * 77
        sys.stdout.flush()

    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace(
            '-', '')  # YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)

        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(
                    dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (
                        tablename, fn, file_date, table_date)
                    skip = False

            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (
                        tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue

        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset,
                                        tablename,
                                        gsfn,
                                        SCHEMA,
                                        wait=wait,
                                        maxbad=1000)

    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
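
# The tracklog_YYYYMMDD naming above is chosen so that legacy-SQL table
# wildcards can sweep a date range in one query.  A hedged illustration --
# the dataset name and date range are hypothetical placeholders:
EXAMPLE_TRACKLOG_WILDCARD_SQL = """
    SELECT COUNT(*) AS n_events
    FROM TABLE_DATE_RANGE([MITx__6_002x__2013_Spring_logs.tracklog_],
                          TIMESTAMP('2014-09-01'),
                          TIMESTAMP('2014-09-21'))
"""
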
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRECATED - instead of creating one table per day, because there is so little
    total data, create one enrollday_all table (see other function below).

    make enrollday2_* tables for specified course_id
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
	            time, 
                    event_struct.user_id as user_id, 
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "honor")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_honor,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "verified")
                          then -1 
                          else 0 end) as diff_enrollment_verified,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "audit")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "audit")
                          then -1 
                          else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}] 
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    course_dir = course_id.replace('/','__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))
        
    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    # print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
    
        date = table_id[9:]
    
        table_out = 'enrollday2_%s' % date
    
        if (table_out in pcday_tables) and not force_recompute:
            print "%s...already done, skipping" % table_id
            sys.stdout.flush()
            continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out), 
        
        the_sql = SQL.format(course_id=course_id, 
                             dataset=log_dataset,
                             table_id=table_id)
    
        sys.stdout.flush()
    
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def load_local_sql_files_to_bigquery(course_id, verbose, base_dir, date_dir):
    """
    Loads the MySQL files from edx-analytics-exporter into Google Big Query.

    Run the waldofy command before attempting to upload MySQL data into Google Big Query.

    Args:
        course_id: course_id string.
        verbose: Whether or not the function logging should be verbose.
        base_dir: the base directory path containing the course SQL files.
        date_dir: the date directory name (represented as YYYY-MM-DD).
    """
    STUDENTMODULE_TABLE_NAME = 'studentmodule'
    USER_INFO_COMBO_TABLE_NAME = 'user_info_combo'

    dataset_name = bqutil.course_id2dataset(course_id)
    sql_scheme_file_names = getattr(edx2bigquery_config,
                                    'SQL_SCHEME_FILE_NAMES', None)
    sql_files_abs_path = os.path.abspath('{}'.format(
        find_course_sql_dir(course_id, base_dir, date_dir)))

    if not sql_scheme_file_names:
        print(
            'Missing SQL_SCHEME_FILE_NAMES setting, unable to load scheme files.'
        )
        exit()

    for table_name, scheme_data in sql_scheme_file_names.items():
        dict_schema = get_schema_from_file(
            scheme_data.get('scheme_file_name', ''))

        if table_name == STUDENTMODULE_TABLE_NAME:
            scheme = dict_schema.get('courseware_studentmodule', {})
        elif table_name == USER_INFO_COMBO_TABLE_NAME:
            scheme = dict_schema.get('user_info_combo', {})
        else:
            scheme = {}  # avoid NameError for unexpected table names; skipped below

        if not scheme:
            print('Unable to load Google Big Query scheme for: {}'.format(
                table_name))
            continue

        bqutil.create_dataset_if_nonexistent(dataset_name)

        file_name = '{}/{}{}'.format(
            sql_files_abs_path,
            table_name,
            DEFAULT_SQL_FILE_EXTENSION,
        )

        if not os.path.isfile(file_name):
            print(
                'File does not exist: {}, unable to load SQL file to Google Big Query.'
                .format(file_name))
            continue

        if verbose:
            print('Uploading: {} to the table: {}'.format(
                file_name, table_name))

        bqutil.upload_local_data_to_big_query(
            dataset_id=dataset_name,
            table_id=table_name,
            schema=scheme,
            course_id=course_id,
            file_name=file_name,
            source_format=DEFAULT_CSV_SOURCE_FORMAT_NAME,
        )
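
# Sketch of the SQL_SCHEME_FILE_NAMES setting that the loader above expects
# to find in edx2bigquery_config.  The keys are the target table names used
# above; the schema file names are hypothetical placeholders.
SQL_SCHEME_FILE_NAMES_EXAMPLE = {
    'studentmodule': {'scheme_file_name': 'schema_studentmodule.json'},
    'user_info_combo': {'scheme_file_name': 'schema_user_info_combo.json'},
}
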
def CreateForumEvents(course_id,
                      force_recompute=False,
                      use_dataset_latest=False,
                      skip_last_day=False,
                      end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related events, including forum post reads,
    into the date-time ordered table.  Repeated calls to this procedure will append new events to the table.  If no
    new events are found, the existing table is left unchanged.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_FORUM_EVENTS

    # event_type for forums may be like:
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #     \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    # "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    the_sql = """
              SELECT time, 
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                           (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)') # For old-school courses with transparent course ids
                                      else (case when module_id is null # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                               then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)') # read
                                                      else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)') # upvote, pinned, upvoted, unvoted, deleted, followed
                                                             else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end) # comment
                                                                end) end) end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE  (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                      or event_type contains "/discussion/forum"
                      or event_type contains "/discussion/threads"
                      or event_type contains "/discussion/comments"
                      or event_type contains "list-forum-"
                      or event_type contains "list_forum_"
                      or event_type contains "add-forum-"
                      or event_type contains "add_forum_"
                      or event_type contains "remove-forum-"
                      or event_type contains "remove_forum_"
                      or event_type contains "update_forum_"
                     ) 
                    AND username is not null
                    AND event is not null
                    and time > TIMESTAMP("{last_date}")
                    {hash_limit}
              order by time
              """

    try:
        bqutil.create_dataset_if_nonexistent(dataset)
        tinfo = bqutil.get_bq_table_info(dataset, table)
        assert tinfo is not None, "[make_forum_analysis] Creating %s.%s table for %s" % (
            dataset, table, course_id)

        print "[make_forum_analysis] Appending latest data to %s.%s table for %s" % (
            dataset, table, course_id)
        sys.stdout.flush()

    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % (dataset, table)
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(
        the_sql,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        get_date_function=gdf,
        has_hash_limit=True,
        end_date=end_date,
        skip_last_day=skip_last_day)

    print "Done with Forum Events for %s (end %s)" % (course_id,
                                                      datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
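
# Hedged usage sketch (hypothetical course id): incrementally append the
# latest forum events, leaving out the current (possibly incomplete) day.
def example_create_forum_events():
    CreateForumEvents("MITx/6.002x/2013_Spring",
                      use_dataset_latest=True,
                      skip_last_day=True)
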
def process_course_time_on_asset(course_id,
                                 force_recompute=False,
                                 use_dataset_latest=False,
                                 end_date=None,
                                 just_do_totals=False,
                                 limit_query_size=False,
                                 table_max_size_mb=800,
                                 skip_totals=False,
                                 start_date=None,
                                 config_parameter_overrides=None):
    '''
    Create the time_on_asset_daily table, containing module_id, username, date, 
    and time on asset stats.  This table has separate rows for each date, 
    and is ordered in time.  To update it, a new day's logs are
    processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    Compute totals and store in time_on_asset_totals, by summing over all dates, 
    grouped by module_id.

    How are time_on_asset numbers computed?

    See discussion in make_time_on_task.py

    The time_on_asset_daily table has these columns:

    - date: gives day for the data
    - username
    - module_id
    - time_umid5: total time on module (by module_id) in seconds, with a 5-minute timeout
    - time_umid30: total time on module (by module_id) in seconds, with a 30-minute timeout

    '''

    if just_do_totals:
        return process_time_on_task_totals(
            course_id,
            force_recompute=force_recompute,
            use_dataset_latest=use_dataset_latest)

    SQL = """
            SELECT
                    "{course_id}" as course_id,
                    date(time) as date,
                    username,
                    module_id,
                    # time_umid5 = total time on module (by module_id) in seconds
                    # time_mid5 has 5 minute timeout, time_mid30 has 30 min timeout
                    SUM( case when dt_umid < 5*60 then dt_umid end ) as time_umid5,
                    SUM( case when dt_umid < 30*60 then dt_umid end ) as time_umid30,
            FROM (
              SELECT time,
                username,
                module_id,
                (time - last_time)/1.0E6 as dt,         # dt is in seconds
                (time - last_time_umid)/1.0E6 as dt_umid,   # dt for (user, module_id) in seconds
                last_time_umid,
              FROM
                (
                SELECT time,
                    username,
                    last_username,
                    module_id,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    USEC_TO_TIMESTAMP(last_time_umid) as last_time_umid,                    
                FROM (
                  SELECT time,
                    username,
                    module_id,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    lag(time, 1) over (partition by username, module_id order by time) last_time_umid,
                  FROM
                    (SELECT time, 
                      username,
                      (case when REGEXP_MATCH(module_id, r'.*\"\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\}}$')
                            when REGEXP_MATCH(module_id, r'.*\"\]\}}\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\]\}}\}}$')
                           else module_id end) as module_id,	# fix some errors in module_id names
                    FROM {DATASETS}
                  )
         
                    WHERE       
                                     module_id is not null
                                     AND username is not null
                                     AND username != ""
                                     and time > TIMESTAMP("{last_date}")
                                     
                    )
                )
              )
              WHERE module_id is not null
                    AND NOT module_id CONTAINS '"'
              GROUP BY date, module_id, username
              ORDER BY date, module_id, username
          """

    table = 'time_on_asset_daily'
    dataset_name = bqutil.course_id2dataset(course_id)
    bqutil.create_dataset_if_nonexistent(dataset_name)

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(
        SQL,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        end_date=end_date,
        start_date=start_date,
        get_date_function=gdf,
        days_delta=0,
        has_hash_limit=True,
        newer_than=datetime.datetime(2015, 3, 15),  # schema change
        table_max_size_mb=table_max_size_mb,
        limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_asset_totals(
            course_id,
            force_recompute=force_recompute,
            use_dataset_latest=use_dataset_latest)

    return
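
# Hedged usage sketch (hypothetical course id): build or extend the daily
# table in size-limited query chunks; totals are rolled up afterwards unless
# skip_totals is set.
def example_time_on_asset():
    process_course_time_on_asset("MITx/6.002x/2013_Spring",
                                 use_dataset_latest=True,
                                 limit_query_size=True,
                                 table_max_size_mb=800)
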
def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_PROBLEM_EVENTS
    the_sql = """
SELECT  
    context.user_id as user_id, 
    time,
    event_source,
    REGEXP_EXTRACT(
      (CASE when module_id is not null then module_id 
          when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/")
          else REPLACE(event_struct.problem, "i4x://", "")
          end),
      "[^/]+/problem/([^/]+)") as problem_url,
    (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          else event_type
          end) as event_type,
   event_struct.attempts as attempts,
   event_struct.success as success,
   event_struct.grade as grade,          
FROM {DATASETS}
WHERE       
   ( REGEXP_MATCH(event_type, r'problem_\w+') 
     OR event_type = "showanswer"
   )
   AND context.user_id is not null
   and time > TIMESTAMP("{last_date}")
   {hash_limit}
order by user_id, time
    """

    try:
        bqutil.create_dataset_if_nonexistent(dataset)
        tinfo = bqutil.get_bq_table_info(dataset, table )
        assert tinfo is not None, "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id)

        print "[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id)
        sys.stdout.flush()

    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % ( dataset, table )
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                    )

    print "Done with Problem Events for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
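
# Hedged usage sketch (hypothetical course id): append the latest problem
# interaction events, skipping the current (possibly incomplete) day.
def example_extract_problem_events():
    ExtractProblemEvents("MITx/6.002x/2013_Spring",
                         use_dataset_latest=True,
                         skip_last_day=True)
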
def load_sql_for_course(course_id,
                        gsbucket="gs://x-data",
                        basedir="X-Year-2-data-sql",
                        datedir="2014-09-21",
                        do_gs_copy=False,
                        use_dataset_latest=False):
    '''
    Load SQL files into google cloud storage then import into BigQuery.

    Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "."

    If use_dataset_latest then "_latest" is appended to the dataset name.  
    Thus, the latest SQL dataset can always be put in a consistently named dataset.
    '''

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    # convert studentmodule if necessary

    fn_sm = lfp / 'studentmodule.csv.gz'
    if not fn_sm.exists():
        fn_sm = lfp / 'studentmodule.csv'
        if not fn_sm.exists():
            fn_sm = lfp / 'studentmodule.sql.gz'
            if not fn_sm.exists():
                fn_sm = lfp / 'studentmodule.sql'
                if not fn_sm.exists():
                    print "Error!  Missing studentmodule.[sql,csv][.gz]"
            if fn_sm.exists():  # have .sql or .sql.gz version: convert to .csv
                newfn = lfp / 'studentmodule.csv.gz'
                print "--> Converting %s to %s" % (fn_sm, newfn)
                tsv2csv(fn_sm, newfn)
                fn_sm = newfn

    if fn_sm.exists():
        # rephrase studentmodule if it's using opaque keys
        fline = ''
        smfp = openfile(fn_sm)
        fline = smfp.readline()  # skip first line - it's a header
        fline = smfp.readline()
        if 'block-v1:' in fline or 'course-v1' in fline:
            rephrase_studentmodule_opaque_keys(fn_sm)

    def convert_sql(fnroot):
        if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot +
                                                             ".csv.gz"):
            return
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot +
                                                             ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)

    # convert sql files if necesssary
    fnset = [
        'users', 'certificates', 'enrollment', "profiles", 'user_id_map',
        'rolecourse', 'roleforum'
    ]
    for fn in fnset:
        convert_sql(lfp / fn)

    local_files = glob.glob(lfp / '*')

    # if using latest date directory, also look for course_image.jpg one level up
    if use_dataset_latest:
        print lfp.dirname()
        ci_files = glob.glob(lfp.dirname() / 'course_image.jpg')
        if ci_files:
            local_files += list(ci_files)
            print "--> local course_image file: %s" % ci_files

    gsdir = gsutil.gs_path_from_course_id(
        course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest)

    local = pytz.timezone("America/New_York")

    if do_gs_copy:
        try:
            fnset = get_gs_file_list(gsdir)
        except Exception as err:
            fnset = []

        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)

            # do some date checking to upload files which have changed, and are newer than that on google cloud storage
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone(pytz.utc)

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (
                    fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn,
                                     gsdir / fnb,
                                     options=options,
                                     verbose=True)

        for fn in local_files:
            fnb = os.path.basename(fn)
            if fnb == 'course_image.jpg':
                copy_if_newer(fn, fnset, options='-a public-read')
            if not (fnb.endswith('.csv') or fnb.endswith('.json')
                    or fnb.endswith('.csv.gz') or fnb.endswith('.json.gz')
                    or fnb.endswith('.mongo.gz')):
                print "...unknown file type %s, skipping" % fn
                sys.stdout.flush()
                continue
            copy_if_newer(fn, fnset)

    # load into bigquery
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)
    mypath = os.path.dirname(os.path.realpath(__file__))

    # load user_info_combo
    uicfn = lfp / 'user_info_combo.json.gz'
    if uicfn.exists():
        uic_schema = json.loads(
            open('%s/schemas/schema_user_info_combo.json' %
                 mypath).read())['user_info_combo']
        bqutil.load_data_to_table(dataset,
                                  'user_info_combo',
                                  gsdir / "user_info_combo.json.gz",
                                  uic_schema,
                                  wait=False)
    else:
        print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn

    # load studentmodule

    if fn_sm.exists():
        schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read())
        cwsm_schema = schemas['courseware_studentmodule']
        bqutil.load_data_to_table(dataset,
                                  'studentmodule',
                                  gsdir / fn_sm.basename(),
                                  cwsm_schema,
                                  format='csv',
                                  wait=False,
                                  skiprows=1)
    else:
        print "--> Not loading studentmodule: file %s not found" % fn_sm
Example #17
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"
    ofncsv = "forum.csv.gz"  # To match table name in BQ
    ofncsv_lfp = lfp / ofncsv

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn) and os.path.exists(ofncsv_lfp):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp)
            sys.stdout.flush()
            return

    print "Processing %s -> writing to %s and %s (%s)" % (
        fn, ofn, ofncsv, datetime.datetime.now())
    sys.stdout.flush()

    # Setup CSV header
    ocsv = csv.DictWriter(openfile(ofncsv, 'w'),
                          fieldnames=SCHEMA_DICT.keys(),
                          quoting=csv.QUOTE_NONNUMERIC)
    ocsv.writeheader()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    data = OrderedDict()
    for line in fp:
        cnt += 1
        # Write JSON row
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)

        try:
            #Write CSV row
            data = json.loads(newline)
            ocsv.writerow(data)
        except Exception as err:
            print "Error writing CSV output row %s=%s" % (cnt, data)
            raise

    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do the upload twice, because GCS file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
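
A minimal invocation sketch for the function above; the course id is a placeholder and the keyword values simply follow the defaults in the signature shown.

# Hypothetical call: rephrase one course's forum.mongo dump, write the JSON and
# CSV outputs next to the SQL files, and load the 'forum' table into BigQuery.
rephrase_forum_json_for_course(
    "MITx/6.002x/2013_Spring",      # placeholder course id
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,                   # use the directory found by find_course_sql_dir
    do_gs_copy=False,
    use_dataset_latest=True,
)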
    def __init__(self, course_id_set, output_project_id=None, nskip=0, 
                 output_dataset_id=None, 
                 output_bucket=None,
                 use_dataset_latest=False,
                 only_step=None,
                 end_date=None,
                 ):
        '''
        Compute course report tables, based on a combination of the person_course and
        other per-course tables for all courses in course_id_set.

        only_step: a single course report step (or comma-separated list of steps) to
                   execute; if None, all report steps are run.
        '''
        
        if only_step and ',' in only_step:
            only_step = only_step.split(',')
        self.only_step = only_step

        self.end_date = end_date

        if not course_id_set:
            print "ERROR! Must specify list of course_id's for report.  Aborting."
            return

        org = course_id_set[0].split('/',1)[0]	# extract org from first course_id
        self.org = org

        self.output_project_id = output_project_id

        crname = ('course_report_%s' % org)
        if use_dataset_latest:
            crname = 'course_report_latest'
        self.dataset = output_dataset_id or crname

        self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
        self.course_id_set = course_id_set

        course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set]

        # check to see which datasets have person_course tables
        datasets_with_pc = []
        self.all_pc_tables = OrderedDict()
        self.all_pcday_ip_counts_tables = OrderedDict()
        self.all_uic_tables = OrderedDict()
        self.all_tott_tables = OrderedDict()
        for cd in course_datasets:
            try:
                table = bqutil.get_bq_table_info(cd, 'person_course')
            except Exception as err:
                print "[make-course_report_tables] Err: %s" % str(err)
                table = None
            if table is not None:
                self.all_pc_tables[cd] = table
                datasets_with_pc.append(cd)

            try:
                table = bqutil.get_bq_table_info(cd, 'pcday_ip_counts')
            except Exception as err:
                table = None
            if table is not None:
                self.all_pcday_ip_counts_tables[cd] = table

            try:
                table = bqutil.get_bq_table_info(cd, 'user_info_combo')
            except Exception as err:
                table = None
            if table is not None:
                self.all_uic_tables[cd] = table

            try:
                table = bqutil.get_bq_table_info(cd, 'time_on_task_totals')
            except Exception as err:
                print "[make-course_report_tables] Err: %s" % str(err)
                table = None
            if table is not None:
                self.all_tott_tables[cd] = table

        pc_tables = ',\n'.join(['[%s.person_course]' % x for x in datasets_with_pc])
        pcday_ip_counts_tables = ',\n'.join(['[%s.pcday_ip_counts]' % x for x in self.all_pcday_ip_counts_tables])
        uic_tables = ',\n'.join(['[%s.user_info_combo]' % x for x in self.all_uic_tables])
        tott_tables = ',\n'.join(['[%s.time_on_task_totals]' % x for x in self.all_tott_tables])

        print "%d time_on_task tables: %s" % (len(self.all_tott_tables), tott_tables)
        sys.stdout.flush()

        # find latest combined person_course table
        cpc_tables = [ x for x in bqutil.get_list_of_table_ids(self.dataset) if x.startswith("person_course_") ]
        if cpc_tables:
            the_cpc_table = "[%s.%s]" % (self.dataset, max(cpc_tables))
        else:
            the_cpc_table = None
        print "[make_course_report_tables] ==> Using %s as the latest combined person_course table" % the_cpc_table

        self.parameters = {'dataset': self.dataset,
                           'pc_tables': pc_tables,
                           'uic_tables': uic_tables,
                           'tott_tables': tott_tables,
                           'pcday_ip_counts_tables': pcday_ip_counts_tables,
                           'combined_person_course': the_cpc_table,
                           }
        print "[make_course_report_tables] ==> Using these datasets (with person_course tables): %s" % datasets_with_pc

        self.course_datasets = course_datasets
    
        print "="*100
        print "Generating course report tables -> dataset=%s, project=%s" % (self.dataset, self.output_project_id)
        sys.stdout.flush()

        bqutil.create_dataset_if_nonexistent(self.dataset, project_id=output_project_id)

        self.nskip = nskip
        if 1:
            self.combine_show_answer_stats_by_course()
            self.make_totals_by_course()
            self.make_medians_by_course()
            self.make_table_of_email_addresses()
            self.make_global_modal_ip_table()
            self.make_enrollment_by_day()
            self.make_time_on_task_stats_by_course()
            self.make_total_populations_by_course()
            self.make_table_of_n_courses_registered()
            self.make_geographic_distributions()
            # self.count_tracking_log_events()
            self.make_overall_totals()
    
        print "="*100
        print "Done with course report tables"
        sys.stdout.flush()
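
The comma-joined table lists built above appear intended for substitution into legacy-SQL query templates, where listing several tables in the FROM clause unions them. The template below is only an illustration of that pattern, not one of the actual report steps, and the dataset names are made up.

EXAMPLE_SQL = """
SELECT course_id, COUNT(*) AS n_rows
FROM {pc_tables}
GROUP BY course_id
ORDER BY course_id
"""

example_parameters = {
    'pc_tables': '[org__course1__2014_latest.person_course],\n[org__course2__2014_latest.person_course]',
}
print EXAMPLE_SQL.format(**example_parameters)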
Example #19
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to BigQuery.

    cid = course_id
    caset_in = list of course axis data in dict format
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''

    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json' 
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__')))
    fp = open(cafn, 'w')
    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not isinstance(data, dict):
            try:
                ca['data'] = json.loads(data)	# make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])	# datetime to string
        if  ca['due'] is not None:
            ca['due'] = str(ca['due'])	# datetime to string
        if (ca['data'] is None) or (ca['data']==''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca)+'\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)	# create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]		# truncate to stay under the 16384-character table description limit
    
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)
Example #20
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do the upload twice, because GCS file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRECATED - because there is so little total data, do not create one table per day;
    instead create a single enrollday_all table (see the other function below).

    make enrollday2_* tables for specified course_id
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
	            time, 
                    event_struct.user_id as user_id, 
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "honor")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_honor,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "verified")
                          then -1 
                          else 0 end) as diff_enrollment_verified,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "audit")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "audit")
                          then -1 
                          else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}] 
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    course_dir = course_id.replace('/', '__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)" % (course_id,
                                               datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [
        x['tableReference']['tableId']
        for x in pcday_tables_info.get('tables', [])
    ]

    # print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue

        date = table_id[9:]

        table_out = 'enrollday2_%s' % date

        if (table_out in pcday_tables) and not force_recompute:
            print "%s...already done, skipping" % table_id
            sys.stdout.flush()
            continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print("Creating %s " % table_out),

        the_sql = SQL.format(course_id=course_id,
                             dataset=log_dataset,
                             table_id=table_id)

        sys.stdout.flush()

        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
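
Each row produced by the SQL above carries a +1 for an enrollment activation and a -1 for a deactivation in the matching mode, so a running sum over time yields the net enrollment. A small sketch of that roll-up with made-up rows:

rows = [
    {'time': '2013-09-01T00:01:00', 'diff_enrollment_honor': 1},
    {'time': '2013-09-01T00:05:00', 'diff_enrollment_honor': 1},
    {'time': '2013-09-01T03:00:00', 'diff_enrollment_honor': -1},
]
net = 0
for r in sorted(rows, key=lambda x: x['time']):
    net += r['diff_enrollment_honor']
print "net honor enrollment = %d" % net    # -> 1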
Example #22
def obsolete_process_course(course_id, force_recompute=False, check_dates=True):
    '''
    make person_course_day tables for specified course_id.  This version
    produces one table for each day.  It is inefficient when there are 
    many days with very small daily tracking log tables.
    '''

    PCDAY_SQL = """
    select username, 
           "{course_id}" as course_id,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo, 
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as avg_dt,
           STDDEV(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sdv_dt,
           MAX(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as max_dt,
           COUNT(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as n_dt,
           SUM(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sum_dt
    from
    (SELECT username, 
      case when event_type = "play_video" then 1 else 0 end as bvideo,
      case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
      case when username != "" then 1 else 0 end as bevent,
      case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
      case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
      case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
      case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
      case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
      case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
      case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
      time,
      lag(time, 1) over (partition by username order by time) last_time
      FROM [{dataset}.{table_id}]
      WHERE
        NOT event_type contains "/xblock/"
        AND username != ""
    )
    group by course_id, username
    order by sdv_dt desc
    """

    course_dir = course_id.replace('/','__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))
        
    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
    
        date = table_id[9:]
    
        table_out = 'pcday_%s' % date
    
        if (table_out in pcday_tables) and not force_recompute:
            skip = True
            if check_dates:
                table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out)
                log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id)
                if log_table_date > table_out_date:
                    skip = False
                    print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out,
                                                                                                                 table_out_date,
                                                                                                                 log_table_date)
            if skip:
                print "%s...already done, skipping" % table_out
                sys.stdout.flush()
                continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out),
        
        the_sql = PCDAY_SQL.format(course_id=course_id, 
                                   dataset=log_dataset,
                                   table_id=table_id)
    
        sys.stdout.flush()
    
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
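
The avg_dt, sdv_dt, and max_dt columns above come from per-user gaps between consecutive events, computed with lag(time) and discarded when longer than five minutes. A plain-Python sketch of that gap computation, using made-up timestamps:

import datetime

events = [datetime.datetime(2014, 1, 1, 10, 0, 0),
          datetime.datetime(2014, 1, 1, 10, 2, 0),
          datetime.datetime(2014, 1, 1, 11, 0, 0)]    # 58-minute gap gets discarded
gaps = []
for prev, cur in zip(events, events[1:]):
    dt = (cur - prev).total_seconds()
    if dt <= 5 * 60:                                   # mirror the 5*60 cutoff above
        gaps.append(dt)
if gaps:
    print "avg_dt = %.1f sec over %d gap(s)" % (sum(gaps) / len(gaps), len(gaps))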
Example #23
def load_local_logs_to_biqquery(course_id, start_date, end_date, verbose):
    """
    Loads the local tracking logs into Google BigQuery.

    First, it tries to create the dataset if it does not already exist.

    Args:
        course_id: Course id string.
        start_date: Start date string to process the tracking logs.
        end_date: End date string to process the tracking logs.
        verbose: Whether or not the function logging should be verbose.
    """
    dataset_name = bqutil.course_id2dataset(course_id, dtype="logs")
    date_pattern = getattr(edx2bigquery_config,
                           'TRACKING_LOG_REGEX_DATE_PATTERN', '')
    tracking_start_date, tracking_end_date = get_start_and_end_date(
        start_date, end_date)
    schema = get_tracking_log_schema()

    bqutil.create_dataset_if_nonexistent(dataset_name)

    for file_name in local_util.get_tracking_log_file_list(course_id):
        if not file_name:
            continue

        file_match = re.findall(date_pattern, file_name)

        if not file_match:
            if verbose:
                logging(
                    'The file name {} does not contain a date string; skipping.'
                    .format(file_name))
            continue

        file_date = dateutil.parser.parse(file_match[-1])

        if tracking_start_date <= file_date <= tracking_end_date:
            table_name = 'tracklog_{}'.format(
                file_date.strftime(
                    getattr(edx2bigquery_config, 'TRACKING_LOG_DATE_FORMAT',
                            '%Y-%m-%d')))

            if verbose:
                logging('Uploading: {} to the table: {}'.format(
                    file_name, table_name))

            bqutil.upload_local_data_to_big_query(
                dataset_id=dataset_name,
                table_id=table_name,
                schema=schema,
                course_id=course_id,
                file_name=file_name,
                source_format=DEFAULT_JSON_SOURCE_FORMAT_NAME,
            )
        elif verbose:
            logging(
                'The file {} has a date outside the provided start_date/end_date range; skipping.'
                .format(file_name))
            continue

        if verbose:
            logging(
                'The file {} has been successfully uploaded to BigQuery.'
                .format(file_name))
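
The table name above is derived from the date embedded in each tracking log file name, matched with TRACKING_LOG_REGEX_DATE_PATTERN. The sketch below assumes a simple YYYY-MM-DD pattern and an example filename; neither is the project's actual configuration.

import re

import dateutil.parser

date_pattern = r'\d{4}-\d{2}-\d{2}'                    # assumed pattern, for illustration
file_name = 'tracking.log-2019-07-15.gz'               # made-up filename
file_match = re.findall(date_pattern, file_name)
if file_match:
    file_date = dateutil.parser.parse(file_match[-1])
    print "table name would be: tracklog_%s" % file_date.strftime('%Y-%m-%d')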