def extractResearchData( self, course_id, tablename, the_dataset=None, rdp=None, rdp_format='csv', output_bucket=None, basedir='', datedir='', do_gzip=True):
    '''
    Extract a research data table from BigQuery to Google Storage, then
    archive the extracted file onto the secure data warehouse server.

    course_id     : course whose research data product is being extracted (required)
    tablename     : BigQuery table to extract
    the_dataset   : BigQuery dataset containing tablename
    rdp           : key into RESEARCH_DATA_PRODUCTS (used for the initial GS filename)
    rdp_format    : extract format passed to bqutil.extract_table_to_gs
    output_bucket : Google Storage bucket for the extracted file
    basedir, datedir : passed to find_course_sql_dir to locate the archive dir
    do_gzip       : gzip the extract and name it .csv.gz (default True; this
                    parameter was previously accepted but ignored)
    '''
    # Archive location
    if course_id is not None: # Individual Course Research Data Products
        self.gsp = gsutil.gs_path_from_course_id( course_id=course_id, gsbucket=output_bucket, use_dataset_latest=True )
        gsfilename = "%s/%s" % ( self.gsp, RESEARCH_DATA_PRODUCTS[ rdp ] )
    else:
        print "ERROR! Must specify course_id's.  Aborting."
        return

    # File extension must agree with whether the extract is gzipped
    suffix = '.csv.gz' if do_gzip else '.csv'

    try:
        # Copy to Google Storage
        msg = "[researchData]: Copying Research Data table %s to %s" % ( tablename, gsfilename )
        print msg
        gsfilename = "%s/%s%s" % ( self.gsp, tablename, suffix )
        ret = bqutil.extract_table_to_gs( the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=do_gzip, wait=True )
        msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link( gsfilename )
        print msg
        sys.stdout.flush()

    except Exception as err:
        print str(err)
        # BQ cannot extract a large table to a single file; retry with a
        # sharded (wildcard) destination so the export is split into pieces.
        if 'BQ Error creating table' in str(err):
            msg = "[researchData]: Retrying... by sharding."
            print msg
            sys.stdout.flush()
            gsfilename = "%s/%s-*%s" % ( self.gsp, tablename, suffix )
            print gsfilename
            sys.stdout.flush()
            ret = bqutil.extract_table_to_gs( the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=do_gzip, wait=True )
            msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link( gsfilename )
            print msg
            sys.stdout.flush()

    # Copy from Google Storage to Secure Data Warehouse for archiving
    archiveLocation = find_course_sql_dir(course_id=course_id, basedir=basedir, datedir=datedir, use_dataset_latest=True)
    #time.sleep( CFG.TIME_TO_WAIT_30s ) # delay needed to allow for GS to upload file fully (This should be size dependent, and may not be enough time)
    msg = "[researchData]: Archiving Research Data table %s from %s to %s" % ( tablename, gsfilename, archiveLocation )
    print msg
    sys.stdout.flush()
    gsutil.upload_file_to_gs(src=gsfilename, dst=archiveLocation, verbose=True)
    def do_table(self, the_sql, tablename, the_dataset=None, sql_for_description=None, check_skip=True):

        if check_skip:
            if self.skip_or_do_step(tablename) < 0:
                return	# skip step

        if the_dataset is None:
            the_dataset = self.dataset

        print("Computing %s in BigQuery" % tablename)
        sys.stdout.flush()
        try:
            ret = bqutil.create_bq_table(the_dataset, tablename, the_sql, 
                                         overwrite=True,
                                         output_project_id=self.output_project_id,
                                         sql_for_description=sql_for_description or the_sql,
                                     )
        except Exception as err:
            print "ERROR! Failed on SQL="
            print the_sql
            raise

        gsfn = "%s/%s.csv" % (self.gsbucket, tablename)
        bqutil.extract_table_to_gs(the_dataset, tablename, gsfn, 
                                   format='csv', 
                                   do_gzip=False,
                                   wait=False)

        msg = "CSV download link: %s" % gsutil.gs_download_link(gsfn)
        print msg
        bqutil.add_description_to_table(the_dataset, tablename, msg, append=True, project_id=self.output_project_id)
    def extractResearchData(self,
                            course_id,
                            tablename,
                            the_dataset=None,
                            rdp=None,
                            rdp_format='csv',
                            output_bucket=None,
                            basedir='',
                            datedir='',
                            do_gzip=True):
        '''
		Get research data output into tables and archive onto server
	'''

        # Archive location
        if course_id is not None:  # Individual Course Research Data Products

            self.gsp = gsutil.gs_path_from_course_id(course_id=course_id,
                                                     gsbucket=output_bucket,
                                                     use_dataset_latest=True)
            gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp])

        else:
            print "ERROR! Must specify course_id's.  Aborting."
            return

        try:
            # Copy to Google Storage
            msg = "[researchData]: Copying Research Data table %s to %s" % (
                tablename, gsfilename)
            print msg
            #gsfilename  = "%s/%s-*.csv.gz" % ( self.gsp, tablename ) # temp
            gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename)  # temp
            ret = bqutil.extract_table_to_gs(the_dataset,
                                             tablename,
                                             gsfilename,
                                             format=rdp_format,
                                             do_gzip=True,
                                             wait=True)
            msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(
                gsfilename)
            print msg
            sys.stdout.flush()

        except Exception as err:

            print str(err)
            if ('BQ Error creating table' in str(err)):
                msg = "[researchData]: Retrying... by sharding."
                print msg
                sys.stdout.flush()
                gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)
                print gsfilename
                sys.stdout.flush()
                ret = bqutil.extract_table_to_gs(the_dataset,
                                                 tablename,
                                                 gsfilename,
                                                 format=rdp_format,
                                                 do_gzip=True,
                                                 wait=True)
                msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(
                    gsfilename)
                print msg
                sys.stdout.flush()

        # Copy from Google Storage to Secure Data Warehouse for archiving
        archiveLocation = find_course_sql_dir(course_id=course_id,
                                              basedir=basedir,
                                              datedir=datedir,
                                              use_dataset_latest=True)
        #time.sleep( CFG.TIME_TO_WAIT_30s ) # delay needed to allow for GS to upload file fully (This should be size dependent, and may not be enough time)
        msg = "[researchData]: Archiving Research Data table %s from %s to %s" % (
            tablename, gsfilename, archiveLocation)
        print msg
        sys.stdout.flush()
        gsutil.upload_file_to_gs(src=gsfilename,
                                 dst=archiveLocation,
                                 verbose=True)

        pass
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
):

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    print "Done"
    sys.stdout.flush()
# Example #5
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
    extract_subset_tables=True,
):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same
    fields, concatenating, and uploading back to bigquery.  This is cheaper than doing a select *, and also
    uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large
    result sizes).  The result is stored in the course_report_latest dataset (if use_dataset_latest), else 
    in course_report_ORG, where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset
    of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv.
    (those are created using a select *, for efficiency, despite the cost).
    '''

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    # Download each course's person_course.csv.gz extract from Google
    # Storage; skip or refresh based on nskip and local-file freshness.
    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        # nskip > 0: trust any existing local copy unconditionally
        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            # local copy exists: compare its mtime against the GS object date
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    # organization name = part of the first course_id before the first '/'
    org = course_id_set[0].split('/', 1)[0]

    # timestamped output filename for the combined CSV
    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                # remember the first file's header row to verify the others
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            # warn (but continue) if a file's header differs from the first's
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn,
                                                                      firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    # -z csv: upload with gzip content-encoding
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    # strip ".csv" and replace '-' to make a BQ-legal table name
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0,
               output_project_id=None, output_dataset_id=None, output_bucket=None,
               use_dataset_latest=False,
               extract_subset_tables=True,
               ):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same
    fields, concatenating, and uploading back to bigquery.  This is cheaper than doing a select *, and also
    uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large
    result sizes).  The result is stored in the course_report_latest dataset (if use_dataset_latest), else 
    in course_report_ORG, where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset
    of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv.
    (those are created using a select *, for efficiency, despite the cost).
    '''

    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    # Download each course's person_course.csv.gz from Google Storage,
    # skipping files that are already present (nskip > 0) or up to date.
    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            # freshness check: local mtime vs the GS object's date
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    # organization name = part of the first course_id before the first '/'
    org = course_id_set[0].split('/',1)[0]

    # timestamped output filename for the combined CSV
    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                # remember the first file's header row to verify the others
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            # warn (but continue) if a file's header differs from the first's
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn, firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    # -z csv: upload with gzip content-encoding
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    # strip ".csv" and replace '-' to make a BQ-legal table name
    table = ofn[:-4].replace('-','_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0,
               output_project_id=None, output_dataset_id=None, output_bucket=None,
               use_dataset_latest=False,
               ):

    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)
        
    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/',1)[0]

    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-','_')
    
    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)
    
    print "Done"
    sys.stdout.flush()