def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])
    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = [k for d, k in data_df['SampleBarcode'].iteritems()]
    file_list = [k for d, k in data_df['filenamepath'].iteritems()]
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)
def get_sdrf_info(project_id, bucket_name, disease_codes, header, set_index_col, search_patterns):
    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            # skip blobs that do not match every search pattern
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)

            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)
            sdrf_df = cleanup_dataframe(sdrf_df)
            sdrf_df['Study'] = disease_code
            # fall back to the derived-data-file column when the requested
            # index column is missing from this SDRF file
            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except KeyError:
                sdrf_df = sdrf_df.set_index('Derived_Array_Data_File')

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
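# A minimal usage sketch for get_sdrf_info(), assuming the TCGA-style layout the
# loops above imply (blobs prefixed by disease code, SDRF files matched by
# substring patterns). The disease codes, patterns, and index column below are
# illustrative assumptions, not values taken from this codebase's configs.
#
# sdrf_info = get_sdrf_info(
#     project_id='my-project',        # hypothetical
#     bucket_name='my-open-bucket',   # hypothetical
#     disease_codes=['BRCA', 'LUAD'],
#     header=0,
#     set_index_col='Array_Data_File',
#     search_patterns=['.sdrf.txt'])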
def process_file(self, config, outputdir, data_type, path, info, program_name, project, log):
    # open gzipped or plain files depending on the per-datatype config flag
    if config[program_name]['process_files']['datatype2bqscript'][data_type]['file_compressed']:
        with gzip.open(outputdir + path) as input_file:
            file_df = convert_file_to_dataframe(input_file)
    else:
        with open(outputdir + path) as input_file:
            file_df = convert_file_to_dataframe(input_file)

    # now filter down to the desired columns
    use_columns = config[program_name]['process_files']['datatype2bqscript'][data_type]['use_columns']
    file_df = file_df[use_columns.keys()]

    # modify to BigQuery desired names, checking for columns that will be split in the next step
    new_names = []
    for colname in file_df.columns:
        fields = use_columns[colname].split('~')
        if 1 == len(fields):
            new_names += [use_columns[colname]]
        else:
            new_names += [colname]
    file_df.columns = new_names

    # now process the splits
    for colname in use_columns:
        fields = use_columns[colname].split('~')
        if 2 == len(fields) and 'split' == fields[0]:
            extracted_df = file_df[colname].str.extract(fields[1], expand=True)
            file_df = pd.concat([file_df, extracted_df], axis=1)

    # add the metadata columns
    file_df = self.add_metadata(file_df, data_type, info, program_name, project, config)

    # allow subclasses to make final updates
    self.data_type_specific(config, file_df)

    # and reorder them
    file_df = file_df[config[program_name]['process_files']['datatype2bqscript'][data_type]['order_columns']]
    return file_df
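# A hypothetical sketch of the config shape process_file() expects. The
# 'split~<regex>' convention comes straight from the code above: a plain value
# renames the column, while 'split~...' keeps the original column and adds new
# columns from the regex's named groups (str.extract names columns after the
# groups). Every key and value below is illustrative, not taken from a real
# config file.
#
# config = {
#     'TCGA': {
#         'process_files': {
#             'datatype2bqscript': {
#                 'Isoform Expression Quantification': {
#                     'file_compressed': True,
#                     'use_columns': {
#                         'miRNA_ID': 'mirna_id',
#                         'isoform_coords': r'split~(?P<chromosome>chr\w+):(?P<start>\d+)-(?P<end>\d+)',
#                     },
#                     'order_columns': ['mirna_id', 'chromosome', 'start', 'end'],
#                 }
#             }
#         }
#     }
# }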
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0):
    """
    Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)

    return data_df
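# A minimal usage sketch, assuming the GcsConnector(project_id, bucket_name)
# constructor used elsewhere in this module; the bucket and object names are
# hypothetical placeholders.
#
# gcs = GcsConnector('my-project', 'my-bucket')
# df = convert_blob_to_dataframe(gcs, 'my-project', 'my-bucket',
#                                'path/to/data.tsv', skiprows=0)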
def melt_matrix(matrix_file, Platform, studies_map, config, log):
    """
    # melt matrix
    """
    log.info('\tbegin melt matrix: \'%s\'' % (matrix_file))

    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\tstart processing saved matrix. size: %s' % (len(data_df2)))
    mod = max(1, int(len(data_df2) / 20))  # guard against matrices with fewer than 20 rows
    count = 0
    buf = StringIO()
    buf.write('\t'.join(['ParticipantBarcode', 'SampleBarcode', 'AliquotBarcode',
                         'SampleTypeLetterCode', 'Study', 'Platform', 'mirna_id',
                         'mirna_accession', 'normalized_count']) + '\n')
    for i, j in data_df2.T.iteritems():
        if 0 == count % mod:
            log.info('\t\t\tprocessed %s lines' % (count))
        count += 1
        for k, m in j.iteritems():
            # remove the file-name suffixes; str.strip() would remove a *set of
            # characters* from both ends, so use replace() for suffix removal
            aliquot = k.replace('.mirbase20', '').replace('.hg19', '')
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeLetterCode = config["sample_code2letter"][aliquot.split("-")[3][0:2]]
            Study = studies_map[aliquot].upper()
            buf.write("\t".join(map(str, (ParticipantBarcode, SampleBarcode, aliquot,
                                          SampleTypeLetterCode, Study, Platform,
                                          i.split(".")[0], i.split(".")[1], m))) + '\n')

    log.info('\t\tprocessed %s total lines' % (count))

    file_name = matrix_file.split('/')[-1]
    log.info('\t\tsave %s to GCS' % file_name)
    buf.seek(0)
    df = convert_file_to_dataframe(buf)
    df = cleanup_dataframe(df)
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    gcs.convert_df_to_njson_and_upload(df, config['mirna_isoform_matrix'][Platform]['output_dir'] + file_name)
    log.info('\t\tcompleted save to GCS')
    log.info('\tfinished melt matrix')
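# Design note: the transpose-and-iterate loop above is a manual wide-to-long
# "melt". The core reshape could in principle be written with pandas.melt, as in
# this self-contained sketch (barcode parsing and study lookup omitted); this is
# an illustration of the idea, not a drop-in replacement for melt_matrix().
import pandas as pd

wide = pd.DataFrame({
    'Gene': ['hsa-mir-21.MIMAT0000076'],
    'TCGA-AA-1111-01A.mirbase20': [1234.5],
    'TCGA-BB-2222-01A.mirbase20': [678.9]})
long_df = pd.melt(wide, id_vars=['Gene'],
                  var_name='AliquotBarcode', value_name='normalized_count')
print(long_df)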
def process_per_sample_files(self, config, outputdir, associated_paths, types, info, program_name, project, log):
    dfs = [None] * 3
    curindex = 0
    for associated_path in associated_paths:
        # convert blob into dataframe
        log.info('\t\tcalling convert_file_to_dataframe() for %s' % (associated_path))
        dfs[curindex] = convert_file_to_dataframe(gzip.open(outputdir + associated_path), header=None)
        dfs[curindex].columns = ['Ensembl_versioned_gene_ID', types[curindex]]
        self.add_metadata(dfs[curindex], info, program_name, project, config)
        if 'HTSeq - Counts' == types[curindex]:
            # drop the five trailing HTSeq summary rows (__no_feature, __ambiguous,
            # __too_low_aQual, __not_aligned, __alignment_not_unique)
            dfs[curindex] = dfs[curindex].drop(dfs[curindex].index[[60483, 60484, 60485, 60486, 60487]])
        log.info('\t\tdone calling convert_file_to_dataframe() for %s' % (associated_path))
        curindex += 1

    merge_df = dfs[0]
    for df in dfs[1:]:
        merge_df = merge_df.merge(
            df, how='inner',
            on=['Ensembl_versioned_gene_ID', 'file_gdc_id', 'aliquot_barcode',
                'sample_gdc_id', 'sample_barcode', 'case_gdc_id', 'case_barcode',
                'program_name', 'project_short_name', 'sample_type_letter_code',
                'data_type', 'experimental_strategy'])
    log.info('merge workflow(%d):\n%s\n\t...\n%s' % (len(merge_df), merge_df.head(3), merge_df.tail(3)))
    return merge_df
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0, log=None):
    """
    Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """
    try:
        logit(log, 'calling download_blob_to_file() for %s' % (filename), 'info')
        filebuffer = gcs.download_blob_to_file(filename)
        logit(log, 'done calling download_blob_to_file() for %s' % (filename), 'info')

        # convert blob into dataframe
        logit(log, 'calling convert_file_to_dataframe() for %s' % (filename), 'info')
        data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)
        logit(log, 'done calling convert_file_to_dataframe() for %s' % (filename), 'info')

        # clean-up dataframe
        logit(log, 'calling cleanup_dataframe() for %s' % (filename), 'info')
        data_df = cleanup_dataframe(data_df)
        logit(log, 'done calling cleanup_dataframe() for %s' % (filename), 'info')
    except Exception as e:
        logit(log, 'problem in convert_blob_to_dataframe(%s): %s' % (filename, e), 'exception')
        # re-raise: data_df is unbound if the failure happened before conversion,
        # so falling through to the return would raise a NameError instead
        raise

    return data_df
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns, sample_code2letter, oncotator_object_path):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting these columns to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe. we use this to merge dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1
        log.info('-' * 10 + "{0}: Processing file {1}".format(file_count, oncotator_file) + '-' * 10)

        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            with open(filename) as infile:
                filestring = StringIO(infile.read())
            df = convert_file_to_dataframe(filestring)
            try:
                df = cleanup_dataframe(df)
            except RuntimeError as re:
                log.warning('%s: problem cleaning dataframe for %s: %s' % (study, filename, re))
        except Exception as e:
            print e
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file {1}. rows: {2}".format(file_count, oncotator_file, len(df)) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates
        log.info('\tcalling check_duplicates to collapse aliquots with %s rows' % (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(disease_bigdata_df, sample_code2letter, log)
        log.info('\tfinished check_duplicates to collapse aliquots with %s rows' % (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = [
            'Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position',
            'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele1',
            'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode'
        ]

        # merge mutations from multiple centers
        log.info('\tconsolidate the centers for duplicate mutations into a list')
        seencenters = set()

        def concatcenters(df_group):
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(concatcenters)
        log.info('\tfinished consolidating centers for duplicate mutations')

        # enforce unique mutation
        log.info('\tcalling remove_duplicates to collapse mutations with %s rows' % (len(disease_bigdata_df)))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df, unique_mutation)
        log.info('\tfinished remove_duplicates to collapse mutations with %s rows' % (len(disease_bigdata_df)))

        # convert the disease_bigdata_df to new-line JSON and then upload the file
        file_to_upload = StringIO()
        log.info('writing %s rows' % (len(disease_bigdata_df)))
        for _, rec in disease_bigdata_df.iterrows():
            file_to_upload.write(rec.convert_objects(convert_numeric=False).to_json() + "\n")
        file_to_upload.seek(0)
        with open(oncotator_object_path + "{0}.json".format(study), 'w') as outfile:
            outfile.write(file_to_upload.getvalue())
    else:
        log.warning('Empty dataframe for %s in %s!' % (oncotator_file, study))

    return True
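# remove_duplicates() is not defined in this excerpt. A minimal self-contained
# sketch of what it plausibly does, assuming it is a thin wrapper around
# pandas.DataFrame.drop_duplicates over the unique-mutation key columns; the
# real implementation may differ.
import pandas as pd

def remove_duplicates_sketch(df, unique_columns):
    # keep the first row for each unique key combination
    return df.drop_duplicates(subset=unique_columns, keep='first')

example = pd.DataFrame({'Chromosome': ['1', '1'], 'Start_Position': [100, 100],
                        'Center': ['broad.mit.edu', 'broad.mit.edu']})
print(remove_duplicates_sketch(example, ['Chromosome', 'Start_Position']))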
def generate_oncotator_inputfiles(project_id, bucket_name, filename, outputfilename, oncotator_columns):
    print(filename)

    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    try:
        maf_df = convert_file_to_dataframe(filebuffer)
    except:
        print 'problem converting %s to a dataframe' % (filename)
        raise

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)
    print maf_df.columns

    # lowercase the column names (WHY?)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns)

    #--------------------------------------------
    # data manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({
        'hg19': '37',
        'GRCh37': '37',
        'GRCh37-lite': '37'
    })

    #---------------------------------------------
    ## Filters
    ## remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome": map(str, range(1, 23)) + ['X', 'Y'],
        "mutation_status": ['somatic', 'Somatic'],
        "sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq'],
        "ncbi_build": ['37']
    }

    filter_checklist_df = maf_df.isin(filters)
    filter_string = ((filter_checklist_df["chromosome"] == True) &
                     (filter_checklist_df["mutation_status"] == True) &
                     (filter_checklist_df["sequencer"] == True) &
                     (filter_checklist_df["ncbi_build"] == True))
    maf_df = maf_df[filter_string]

    #---------------------
    # Oncotator part: generate intermediate files for Oncotator
    #---------------------
    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build": 'build',
        'chromosome': 'chr',
        'start_position': 'start',
        'end_position': 'end',
        'reference_allele': 'ref_allele',
        'tumor_seq_allele1': 'tum_allele1',
        'tumor_seq_allele2': 'tum_allele2',
        'tumor_sample_barcode': 'tumor_barcode',
        'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just a name change
    for rcol in replace_column_names:
        maf_df.columns = [replace_column_names[x] if x == rcol else x for x in maf_df.columns]
        oncotator_columns = [replace_column_names[y] if y == rcol else y for y in oncotator_columns]

    # remove/mangle any duplicate columns (we rename them a, a.1, a.2 etc)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    oncotator_df = maf_df[oncotator_columns]
    print "df_columns", len(oncotator_df.columns)
    df_stringIO = oncotator_df.to_csv(sep='\t', index=False, columns=oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)

    return True
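# mangle_dupe_cols() is not defined in this excerpt. A minimal self-contained
# sketch of the behavior the comment above describes (duplicate column names
# become a, a.1, a.2, ...), mirroring the pandas read_csv mangling convention;
# the real helper may differ in details.
def mangle_dupe_cols_sketch(columns):
    seen = {}
    mangled = []
    for col in columns:
        if col in seen:
            seen[col] += 1
            mangled.append('%s.%d' % (col, seen[col]))
        else:
            seen[col] = 0
            mangled.append(col)
    return mangled

print(mangle_dupe_cols_sketch(['a', 'b', 'a', 'a']))  # ['a', 'b', 'a.1', 'a.2']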
def melt_matrix(self, matrix_file, platform, file2info, program_name, config, log):
    """
    # melt matrix
    """
    log.info('\t\t\tbegin melt matrix: \'%s\'' % (matrix_file))

    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\t\t\tstart processing saved matrix. size: %s' % (len(data_df2)))
    mod = max(1, int(len(data_df2) / 20))  # guard against matrices with fewer than 20 rows
    count = 0
    total_count = 0
    header = '\t'.join(['sample_barcode', 'mirna_id', 'mirna_accession', 'normalized_count',
                        'platform', 'project_short_name', 'program_name', 'sample_type_code',
                        'file_name', 'file_gdc_id', 'aliquot_barcode', 'case_barcode',
                        'case_gdc_id', 'sample_gdc_id', 'aliquot_gdc_id']) + '\n'
    buf = StringIO()
    buf.write(header)
    for i, j in data_df2.T.iteritems():
        for k, m in j.iteritems():
            aliquot = file2info[k]['aliquot_barcode']
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeCode = aliquot.split("-")[3][0:2]
            info = file2info[k]
            line = "\t".join(map(str, (
                SampleBarcode, i.split(".")[0], i.split(".")[1], m, platform,
                info['project_short_name'], info['program_name'], SampleTypeCode,
                info['file_name'], info['file_gdc_id'], aliquot, ParticipantBarcode,
                info['case_gdc_id'], info['sample_gdc_id'], info['aliquot_gdc_id']))) + '\n'
            buf.write(line)
            total_count += 1

        # flush a chunk to GCS every `mod` rows of the matrix
        if 0 == count % mod:
            log.info('\t\t\t\t\tprocessed %s lines:\n%s' % (count, line))
            file_name = '%s_%s' % (matrix_file.split('/')[-1], count)
            log.info('\t\t\t\tsave %s to GCS' % file_name)
            buf.seek(0)
            df = convert_file_to_dataframe(buf)
            df = cleanup_dataframe(df, log)
            gcs = GcsConnector(config['cloud_projects']['open'], config['buckets']['open'])
            gcs.convert_df_to_njson_and_upload(
                df,
                config[program_name]['process_files']['datatype2bqscript']
                ['Isoform Expression Quantification']['gcs_output_path'] + file_name,
                logparam=log)
            buf = StringIO()
            buf.write(header)
        count += 1

    log.info('\t\t\t\tprocessed %s total lines created %s records' % (count, total_count))
    log.info('\t\t\t\tcompleted save to GCS')
    log.info('\t\t\tfinished melt matrix')
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)

    new_df_data = []
    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]
        else:
            for k, m in j.iteritems():
                new_df_obj = {}
                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id, bq_dataset, table_name, schema, source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
        gcs.convert_df_to_njson_and_upload(disease_bigdata_df, "tcga/intermediary/MAF/bigquery_data_files/{0}.json".format(study))
    else:
        raise Exception('Empty dataframe!')

    return True


if __name__ == '__main__':
    config = json.load(open(sys.argv[1]))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    sample_code2letter = config['sample_code2letter']

    # get disease_codes/studies (TODO: this must be changed to get the disease code from the file name)
    df = convert_file_to_dataframe(open(sys.argv[2]))
    df = cleanup_dataframe(df)
    studies = list(set(df['Study'].tolist()))

    # get bq columns (this allows the user to select the columns
    # without worrying about the index, case sensitivity etc)
    selected_columns = pd.read_table(sys.argv[3], names=['bq_columns'])
    transposed = selected_columns.T
    transposed.columns = transposed.loc['bq_columns']
    transposed = cleanup_dataframe(transposed)
    bq_columns = transposed.columns.values

    # submit threads by disease code
    pm = process_manager.ProcessManager(max_workers=33, db='maf.db', table='task_queue_status')
    for idx, df_group in df.groupby(['Study']):
        future = pm.submit(process_oncotator_output, project_id, bucket_name, df_group, bq_columns, sample_code2letter)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):
    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])
        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes.
            # For now, we assume each file contains SampleBarcode mapping.
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For the complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(project_id, bq_dataset, table_name, schema, source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
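# generate_bq_schema() is not defined in this excerpt. A minimal self-contained
# sketch of one plausible implementation, assuming each entry of all_columns is
# a dict with NAME and TYPE keys (a hypothetical shape, inferred from how
# file['COLUMNS'] is accumulated above); the real helper, and the exact schema
# shape the loader expects, may differ.
def generate_bq_schema_sketch(all_columns):
    # BigQuery load jobs accept a schema as a list of field dicts
    fields = [{'name': col['NAME'], 'type': col['TYPE'], 'mode': 'NULLABLE'}
              for col in all_columns]
    return {'fields': fields}

print(generate_bq_schema_sketch([{'NAME': 'sample_barcode', 'TYPE': 'STRING'}]))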