def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    print "Loading Clinical data into BigQuery..."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['clinical']['bq_table'],
        schemas_dir + config['clinical']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['clinical']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    print "*" * 30

    print "Loading Biospecimen data into BigQuery..."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['biospecimen']['bq_table'],
        schemas_dir + config['biospecimen']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['biospecimen']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
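# A minimal sketch (not from the original repo) of the config structure the
# load(config) functions in this section expect, inferred from the key lookups
# above; every value here is a hypothetical placeholder, not a real setting.
example_config = {
    'project_id': 'my-gcp-project',           # hypothetical GCP project id
    'bq_dataset': 'tcga_etl',                 # hypothetical destination BigQuery dataset
    'buckets': {'open': 'my-open-bucket'},    # GCS bucket that holds the ETL output
    'clinical': {
        'bq_table': 'Clinical',
        'schema_file': 'clinical.json',       # resolved against SCHEMA_DIR (default 'schemas/')
        'output_dir': 'intermediary/clinical/'
    },
    'biospecimen': {
        'bq_table': 'Biospecimen',
        'schema_file': 'biospecimen.json',
        'output_dir': 'intermediary/biospecimen/'
    },
}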
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading mRNA unc HiSeq data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['mrna']['unc']['bq_table_hiseq'],
    #    schemas_dir + config['mrna']['unc']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +
    #        config['mrna']['unc']['output_dir'] + 'IlluminaHiSeq/*',
    #    'NEWLINE_DELIMITED_JSON',
    #    'WRITE_EMPTY'
    #)

    print "*" * 30
    print "Loading mRNA unc GA data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mrna']['unc']['bq_table_ga'],
        schemas_dir + config['mrna']['unc']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['mrna']['unc']['output_dir'] + 'IlluminaGA/*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('data_load', 'logs/data_load.log')
    log.info('begin load of data into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading Data data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['data']['bq_table'],
        schemas_dir + config['data']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['data']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    log.info('finished load of data into bigquery')
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get("SCHEMA_DIR", "schemas/")

    print "Loading Isoform HiSeq data into BigQuery.."
    load_data_from_file.run(
        config["project_id"],
        config["bq_dataset"],
        config["mirna"]["isoform"]["bq_table_hiseq"],
        schemas_dir + config["mirna"]["isoform"]["schema_file"],
        "gs://" + config["buckets"]["open"] + "/" +
            config["mirna"]["isoform"]["output_dir"] + "IlluminaHiSeq/*",
        "NEWLINE_DELIMITED_JSON",
        "WRITE_EMPTY",
    )
    print "*" * 30

    print "Loading Isoform GA data into BigQuery.."
    load_data_from_file.run(
        config["project_id"],
        config["bq_dataset"],
        config["mirna"]["isoform"]["bq_table_ga"],
        schemas_dir + config["mirna"]["isoform"]["schema_file"],
        "gs://" + config["buckets"]["open"] + "/" +
            config["mirna"]["isoform"]["output_dir"] + "IlluminaGA/*",
        "NEWLINE_DELIMITED_JSON",
        "WRITE_EMPTY",
    )
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)

    print "*" * 30
    print "Loading Methylation 27K data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['methylation']['output_dir'] + 'HumanMethylation27/*',
        'CSV',
        'WRITE_APPEND'
    )
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    print "Loading Isoform HiSeq data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna']['isoform']['bq_table_hiseq'],
        schemas_dir + config['mirna']['isoform']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['mirna']['isoform']['output_dir'] + 'IlluminaHiSeq/*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    print "*" * 30

    print "Loading Isoform GA data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna']['isoform']['bq_table_ga'],
        schemas_dir + config['mirna']['isoform']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['mirna']['isoform']['output_dir'] + 'IlluminaGA/*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    print "Loading Clinical data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['clinical']['bq_table'],
        schemas_dir + config['clinical']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['clinical']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    print "*" * 30

    print "Loading Biospecimen data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['biospecimen']['bq_table'],
        schemas_dir + config['biospecimen']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['biospecimen']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)

    print "*" * 30
    print "Loading Methylation 27K data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['methylation']['output_dir'] + 'HumanMethylation27/*',
        'CSV',
        'WRITE_APPEND'
    )
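# Note on the write dispositions used throughout these loaders (standard
# BigQuery load-job semantics, not repo-specific behaviour):
#   'WRITE_EMPTY'  - the load job fails if the destination table already
#                    contains data; used for the first load into a fresh table.
#   'WRITE_APPEND' - rows are appended to whatever the table already holds;
#                    used here when several platform directories (e.g. the 450K
#                    and 27K methylation files) feed the same table.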
def load():
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    print "*" * 30
    print "Loading CCLE GCT data into BigQuery.."
    load_data_from_file.run(
        '',  # project_id (left blank here)
        '',  # dataset_id (left blank here)
        '',  # table_name (left blank here)
        'schema.json',
        'intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv',
        'CSV',
        'WRITE_EMPTY')
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    load_data_from_file.run(
        config['project_id'],
        'genomic_reference',
        config['mirtarbase']['bq_table'],
        schemas_dir + config['mirtarbase']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' + config['mirtarbase']['output_file'],
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY')
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['tcga_annotations']['bq_table'],
        schemas_dir + config['tcga_annotations']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' + config['tcga_annotations']['output_file'],
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
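# The repo's load_data_from_file.run implementation is not shown in this
# section. As a rough, hedged sketch of what the parameters listed in the
# docstrings map to, here is a modern equivalent using the public
# google-cloud-bigquery client (an assumption, not the project's actual code;
# poll_interval/num_retries are retry settings this sketch does not reproduce).
from google.cloud import bigquery


def example_load_table(project_id, dataset_id, table_name, schema_file, data_path,
                       source_format='NEWLINE_DELIMITED_JSON',
                       write_disposition='WRITE_EMPTY'):
    client = bigquery.Client(project=project_id)

    # Load-job settings corresponding to the source_format / write_disposition
    # strings used throughout these loaders.
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = source_format
    job_config.write_disposition = write_disposition
    job_config.schema = client.schema_from_json(schema_file)  # schema_file is a JSON schema on disk

    # data_path is a gs:// URI (wildcards allowed), destination is project.dataset.table.
    destination = '{0}.{1}.{2}'.format(project_id, dataset_id, table_name)
    job = client.load_table_from_uri(data_path, destination, job_config=job_config)
    job.result()  # block until the load job finishes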
def load(project_id, bq_datasets, bq_tables, schema_files, gcs_file_paths, write_dispositions, log):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log.info('\tbegin load of %s data into bigquery' % (gcs_file_paths))
    sep = ''
    for index in range(len(bq_datasets)):
        log.info("%s\t\tLoading %s table into BigQuery.." % (sep, bq_datasets[index]))
        load_data_from_file.run(
            project_id,
            bq_datasets[index],
            bq_tables[index],
            schema_files[index],
            gcs_file_paths[index] + '/*',
            'NEWLINE_DELIMITED_JSON',
            write_dispositions[index])
        sep = '\n\t\t' + '*' * 30 + '\n'
    log.info('done load of %s data into bigquery' % (gcs_file_paths))
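# Hedged usage sketch (not from the original repo) of the generic load() above:
# the parallel lists are indexed together, so bq_datasets[i], bq_tables[i],
# schema_files[i], gcs_file_paths[i] and write_dispositions[i] together describe
# one load job. Every value below is a hypothetical placeholder.
def example_parallel_load():
    example_log = configure_logging('example_load', 'logs/example_load.log')
    load(
        'my-gcp-project',
        ['tcga_etl', 'tcga_etl'],                                    # bq_datasets
        ['Clinical', 'Biospecimen'],                                 # bq_tables
        ['schemas/clinical.json', 'schemas/biospecimen.json'],       # schema_files
        ['gs://my-bucket/clinical', 'gs://my-bucket/biospecimen'],   # gcs_file_paths (load() appends '/*')
        ['WRITE_EMPTY', 'WRITE_EMPTY'],                              # write_dispositions
        example_log)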
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('cnv_load', 'logs/cnv_load.log')
    log.info('begin load of cnv into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)

    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    for dir_suffix in dir_suffixes:
        log.info("\tLoading CNV data into BigQuery from %s..." % (dir_prefix + dir_suffix))
        load_data_from_file.run(
            config['project_id'],
            config['bq_dataset'],
            config['cnv']['bq_table'],
            schemas_dir + config['cnv']['schema_file'],
            'gs://' + config['buckets']['open'] + '/' +
                dir_prefix + dir_suffix + '*',
            'NEWLINE_DELIMITED_JSON',
            'WRITE_APPEND'
        )
        log.info("*" * 30)
    log.info('finished load of CNV into bigquery')
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('mirna_isoform_matrix_load', 'logs/mirna_isoform_matrix_load.log')
    log.info('begin load of mirna isoform matrix into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading Isoform HiSeq matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_hiseq'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['mirna_isoform_matrix']['IlluminaHiSeq']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    log.info("*" * 30)

    log.info("\tLoading Isoform GA matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_ga'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['mirna_isoform_matrix']['IlluminaGA']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_APPEND'
    )
    log.info('done load of mirna isoform matrix into bigquery')
def load(config):
    """
    Load the bigquery table
    load_data_from_file accepts following params:
    project_id, dataset_id, table_name, schema_file, data_path,
    source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('methylation_split', 'logs/methylation_load.log')
    log.info('begin load of methylation into bigquery')
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)

    log.info("\tLoading Methylation data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +
            config['methylation']['output_dir'] + '*',
        'CSV',
        'WRITE_APPEND'
    )
    main(config, log)
    log.info('finished load of methylation into bigquery')
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):
    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])
        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes; for now,
            # we assume each file contains SampleBarcode mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile
    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):
    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])
        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes; for now,
            # we assume each file contains SampleBarcode mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile
    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
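# Hedged sketch (not from the original repo) of the `files` argument consumed by
# process_user_gen_files above, inferred from the dictionary keys it reads; the
# path, barcodes, platform and pipeline values are hypothetical placeholders.
example_files = [
    {
        'FILENAME': 'user-bucket/uploads/expression_1.tsv',  # split('/')[1:] appears to drop the leading bucket component
        'COLUMNS': [],             # column descriptors consumed by get_column_mapping / generate_bq_schema (shape not shown here)
        'SAMPLEBARCODE': 'SAMPLE-0001',      # optional
        'PARTICIPANTBARCODE': '',            # optional
        'PLATFORM': 'example-platform',
        'PIPELINE': 'example-pipeline',
        'DATATYPE': 'user_gen',
    },
]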
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]
        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]
        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
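# Hedged illustration (not from the original repo) of the reshape parse_file
# performs: columns named in the data-type column_map (e.g. Symbol/ID/TAB) are
# kept as per-row annotations, every other column header is treated as a sample
# barcode, and each cell becomes one long-format record with the cell value in
# 'Level'. Barcodes and values below are hypothetical.
#
# Input matrix:
#   Symbol   TCGA-AA-0001   TCGA-BB-0002
#   EGFR     7.1            5.4
#   TP53     3.2            4.8
#
# Output records (project/study/platform/pipeline metadata omitted):
#   {'sample_barcode': 'TCGA-AA-0001', 'Symbol': 'EGFR', 'Level': 7.1, ...}
#   {'sample_barcode': 'TCGA-AA-0001', 'Symbol': 'TP53', 'Level': 3.2, ...}
#   {'sample_barcode': 'TCGA-BB-0002', 'Symbol': 'EGFR', 'Level': 5.4, ...}
#   {'sample_barcode': 'TCGA-BB-0002', 'Symbol': 'TP53', 'Level': 4.8, ...}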