def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list([k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)
def get_sdrf_info(project_id, bucket_name, disease_codes, header, set_index_col, search_patterns):
    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)

            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)
            sdrf_df = cleanup_dataframe(sdrf_df)
            sdrf_df['Study'] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except:
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
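# Hedged usage sketch for get_sdrf_info() above; this is not part of the
# original module, and every value below (project, bucket, disease codes,
# index column, search patterns) is a hypothetical placeholder.
if __name__ == '__main__':
    sdrf_info = get_sdrf_info(
        'my-project-id', 'my-bucket',
        disease_codes=['BRCA', 'OV'],
        header=0,
        set_index_col='Derived_Array_Data_File',
        search_patterns=['.sdrf.txt'])
    print sdrf_info.head()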
def process_paths(self, config, outputdir, data_type, paths, program_name, project, file2info, log):
    if 0 != len(paths) % 3:
        raise RuntimeError('need to process the three RNA files per sample together. adjust the configuration option \'download_files_per\' accordingly')
    types = config[program_name]['process_files']['datatype2bqscript']['Gene Expression Quantification']['analysis_types']
    idents = set()
    count = 0
    associated_paths = [None] * 3
    complete_df = None
    for path in paths:
        fields = path.split('/')
        info = file2info[fields[-2] + '/' + fields[-1]]
        type_index = types.index(info['analysis']['workflow_type'])
        if associated_paths[type_index]:
            raise ValueError('files in bad order, found two of the same type for %s:%s' % (associated_paths[type_index], fields[1]))
        idents.add(fields[1].split('.')[0])
        if 1 < len(idents):
            raise ValueError('files in bad order, found two different identifiers %s:%s' % (associated_paths[type_index], fields[1]))
        associated_paths[type_index] = path
        count += 1
        if 0 == count % 3:
            merge_df = self.process_per_sample_files(config, outputdir, associated_paths, types, info, program_name, project, log)
            if complete_df is None:
                complete_df = merge_df
            else:
                complete_df = pd.concat([complete_df, merge_df], ignore_index=True)
            idents = set()
            count = 0
            associated_paths = [None] * 3

    # add unversioned gene column
    complete_df['Ensembl_gene_ID'] = complete_df['Ensembl_versioned_gene_ID'].str.split('.').str[0]

    # clean-up dataframe
    log.info('\t\tcalling cleanup_dataframe() for %s' % (paths))
    complete_df = cleanup_dataframe(complete_df, log)
    log.info('\t\tdone calling cleanup_dataframe() for %s' % (paths))
    log.info('\tcomplete data frame(%d):\n%s\n%s' % (len(complete_df), complete_df.head(3), complete_df.tail(3)))
    return complete_df
def parse_annotations(config):
    """Download the TCGA annotations file using urllib2.Request
    and convert it into a dataframe.
    """
    # connect to the google cloud storage wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)

    # use Request to get the json data
    print 'Downloading the TCGA annotations file'
    request = Request(config['tcga_annotations']['input_file'])
    response = urlopen(request)
    elevations = response.read()
    data = json.loads(elevations)

    all_annotations = []
    for annotation in data['dccAnnotation']:
        annotation = normalize_json(annotation)
        if annotation["items_item"].startswith('TCGA-') and len(annotation["items_item"]) >= 12:
            item = annotation["items_item"]
            annotation['ParticipantBarcode'] = "-".join(item.split("-")[0:3])

            if (len(annotation["items_item"]) > 12
                    and annotation["items_itemType_itemTypeName"] != "Patient"):
                annotation['SampleBarcode'] = "-".join(item.split("-")[0:4])

            if annotation["items_itemType_itemTypeName"] == "Aliquot":
                aliquot = annotation["items_item"]
                if aliquot.startswith('TCGA-') and len(aliquot) == 28:
                    annotation['AliquotBarcode'] = aliquot

        all_annotations.append(annotation)

    #------------
    # transform
    #------------
    data_df = pd.DataFrame(all_annotations)

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # rename columns
    data_df = rename_columns(data_df)
    print data_df

    #----------
    # upload
    #----------
    # upload the contents of the dataframe in njson format
    print "Uploading the njson file"
    status = gcs.convert_df_to_njson_and_upload(data_df, config['tcga_annotations']['output_file'])
    print status
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0):
    """Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)

    return data_df
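# Minimal usage sketch for convert_blob_to_dataframe() above; this is not part
# of the original module. The project, bucket, and object names are
# hypothetical placeholders, and GcsConnector is the same wrapper used
# throughout these scripts.
if __name__ == '__main__':
    gcs = GcsConnector('my-project-id', 'my-bucket')
    df = convert_blob_to_dataframe(gcs, 'my-project-id', 'my-bucket',
                                   'path/to/example.tsv', skiprows=0)
    print df.head()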
def melt_matrix(matrix_file, Platform, studies_map, config, log):
    """melt matrix"""
    log.info('\tbegin melt matrix: \'%s\'' % (matrix_file))

    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\tstart processing saved matrix. size: %s' % (len(data_df2)))
    mod = max(int(len(data_df2) / 20), 1)  # guard against modulo-by-zero on small matrices
    count = 0
    buf = StringIO()
    # tab-separated header to match the '\t'-joined data rows written below
    buf.write("ParticipantBarcode\tSampleBarcode\tAliquotBarcode\tSampleTypeLetterCode\tStudy\tPlatform\tmirna_id\tmirna_accession\tnormalized_count\n")
    for i, j in data_df2.T.iteritems():
        if 0 == count % mod:
            log.info('\t\t\tprocessed %s lines' % (count))
        count += 1
        for k, m in j.iteritems():
            # remove the ".mirbase20"/".hg19" suffixes from the column name
            # (str.strip() strips characters, not a suffix)
            aliquot = k.replace(".mirbase20", "").replace(".hg19", "")
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeLetterCode = config["sample_code2letter"][aliquot.split("-")[3][0:2]]
            Study = studies_map[aliquot].upper()
            buf.write("\t".join(map(str, (ParticipantBarcode, SampleBarcode, aliquot,
                                          SampleTypeLetterCode, Study, Platform,
                                          i.split(".")[0], i.split(".")[1], m))) + '\n')

    log.info('\t\tprocessed %s total lines' % (count))

    file_name = matrix_file.split('/')[-1]
    log.info('\t\tsave %s to GCS' % file_name)
    buf.seek(0)
    df = convert_file_to_dataframe(buf)
    df = cleanup_dataframe(df)
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    gcs.convert_df_to_njson_and_upload(df, config['mirna_isoform_matrix'][Platform]['output_dir'] + file_name)
    log.info('\t\tcompleted save to GCS')
    log.info('\tfinished melt matrix')
def parse_mirtarbase(config):
    """Download the mirtarbase file with urllib2.Request
    and convert it to a dataframe.
    """
    # use Request to get the data (change the link to get the most recent file)
    request = Request(config['mirtarbase']['input_file'])
    socket = urlopen(request)

    # load into a pandas dataframe
    xlsd = pd.ExcelFile(socket, dtype=object)
    df_sheet1 = xlsd.parse(0)  # parse the first sheet (converts into a dataframe)
    print 'Found {0} rows in the file'.format(len(df_sheet1))

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(df_sheet1)

    replace_column_names = {
        'Species_miRNA': 'miRNA_Species',
        'Species_Target_Gene': 'Target_Gene_Species',
        'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID',
        'miRTarBase_ID': 'miRTarBaseID'
    }
    data_df.columns = [replace_column_names[x] if x in replace_column_names
                       else x for x in data_df.columns]

    # convert to int
    data_df['Target_Gene_EntrezID'] = data_df['Target_Gene_EntrezID']\
        .map(lambda x: str(int(float(x))), na_action='ignore')

    #-------
    # upload
    #-------
    # connect to the google cloud storage wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to google storage
    gcs = GcsConnector(project_id, bucket_name)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, config['mirtarbase']['output_file'])
    print status
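# Illustrative aside, not part of parse_mirtarbase(): why the Entrez gene IDs
# above are round-tripped through float -> int -> str. Excel/pandas often read
# numeric ID columns as floats (e.g. 7157.0); the values below are made up and
# the helper name is hypothetical.
def _entrez_id_roundtrip_example():
    ids = pd.Series(['7157.0', '672.0', None], dtype=object)
    # na_action='ignore' leaves the missing entry untouched
    return ids.map(lambda x: str(int(float(x))), na_action='ignore').tolist()
    # -> ['7157', '672', None]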
def main():
    """Example to download a file from Google Storage, transform,
    and load to Google Storage and BigQuery
    """
    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'TCGA-OR-A5J1-01A-11D-A29J-05.txt'
    outfilename = ''

    # read the stringIO/file into a pandas dataframe
    # load the file into a table
    data_df = pandas.read_table(filename, sep="\t", skiprows=1,
                                lineterminator='\n', comment='#')

    # clean up the dataframe for upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
               .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
    print status
def process_paths(self, config, outputdir, data_type, paths, program_name, project, file2info, log):
    count = 0
    complete_df = None
    log.info('\tprocessing %d paths for %s:%s' % (len(paths), data_type, project))
    for path in paths:
        count += 1
        fields = path.split('/')
        info = file2info[fields[-2] + '/' + fields[-1]]
        if self.skip_file(config, data_type, path, program_name, file2info, info, log):
            continue
        file_df = self.process_file(config, outputdir, data_type, path, info, program_name, project, log)
        if complete_df is None:
            complete_df = file_df
        else:
            complete_df = pd.concat([complete_df, file_df], ignore_index=True)
        if 0 == count % 128:
            log.info('\t\tprocessed %s path: %s' % (count, path))
    log.info('\tdone processing %d paths for %s:%s' % (len(paths), data_type, project))

    # clean-up dataframe
    if complete_df is not None:
        log.info('\t\tcalling cleanup_dataframe() for %s' % (paths))
        complete_df = cleanup_dataframe(complete_df, log)
        log.info('\t\tdone calling cleanup_dataframe() for %s' % (paths))
        log.info('\tcomplete data frame(%d):\n%s\n%s' % (len(complete_df), complete_df.head(3), complete_df.tail(3)))
    else:
        log.info('\tno complete data frame created')
    return complete_df
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0, log=None):
    """Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """
    data_df = None  # returned as None if the download or conversion fails
    try:
        logit(log, 'calling download_blob_to_file() for %s' % (filename), 'info')
        filebuffer = gcs.download_blob_to_file(filename)
        logit(log, 'done calling download_blob_to_file() for %s' % (filename), 'info')

        # convert blob into dataframe
        logit(log, 'calling convert_file_to_dataframe() for %s' % (filename), 'info')
        data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)
        logit(log, 'done calling convert_file_to_dataframe() for %s' % (filename), 'info')

        # clean-up dataframe
        logit(log, 'calling cleanup_dataframe() for %s' % (filename), 'info')
        data_df = cleanup_dataframe(data_df)
        logit(log, 'done calling cleanup_dataframe() for %s' % (filename), 'info')
    except Exception as e:
        logit(log, 'problem in convert_blob_to_dataframe(%s): %s' % (filename, e), 'exception')

    return data_df
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns, sample_code2letter, oncotator_object_path):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting these columns to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe. we use this to merge dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1

        log.info('-' * 10 + "{0}: Processing file {1}".format(file_count, oncotator_file) + '-' * 10)
        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            with open(filename) as infile:
                filestring = StringIO(infile.read())
            df = convert_file_to_dataframe(filestring)
            try:
                df = cleanup_dataframe(df)
            except RuntimeError as re:
                log.warning('%s: problem cleaning dataframe for %s: %s' % (study, filename, re))
        except Exception as e:
            print e
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file {1}. rows: {2}".format(file_count, oncotator_file, len(df)) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates
        log.info('\tcalling check_duplicates to collapse aliquots with %s rows' % (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(disease_bigdata_df, sample_code2letter, log)
        log.info('\tfinished check_duplicates to collapse aliquots with %s rows' % (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = [
            'Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position', 'End_Position',
            'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode'
        ]

        # merge mutations from multiple centers
        log.info('\tconsolidate the centers for duplicate mutations into a list')
        seencenters = set()

        def concatcenters(df_group):
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(concatcenters)
        log.info('\tfinished consolidating centers for duplicate mutations')

        # enforce unique mutation
        log.info('\tcalling remove_duplicates to collapse mutations with %s rows' % (len(disease_bigdata_df)))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df, unique_mutation)
        log.info('\tfinished remove_duplicates to collapse mutations with %s rows' % (len(disease_bigdata_df)))

        # convert the disease_bigdata_df to new-line JSON and then upload the file
        file_to_upload = StringIO()
        log.info('writing %s rows' % (len(disease_bigdata_df)))
        for _, rec in disease_bigdata_df.iterrows():
            file_to_upload.write(rec.convert_objects(convert_numeric=False).to_json() + "\n")
        file_to_upload.seek(0)
        with open(oncotator_object_path + "{0}.json".format(study), 'w') as outfile:
            outfile.write(file_to_upload.getvalue())
    else:
        log.warning('Empty dataframe for %s in %s!' % (oncotator_file, study))

    return True
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):
    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])
        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes; for now, we assume each file contains SampleBarcode mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For the complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using a temporary file location (in case we don't have write permissions on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(project_id, bq_dataset, table_name, schema, source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
def main():
    """Parse the GCT file, merge with barcode info, melt (tidy),
    and load to Google Storage and BigQuery
    """
    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'ccle/mRNA-gene-exp/CCLE_Expression_Entrez_2012-09-29.gct'
    outfilename = 'test'
    writer = ExcelWriter('ccle.xlsx')

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #------------------------------
    # load the GCT file
    # * the file has duplicate samples (columns)
    #------------------------------
    # To remove duplicates, load the first few lines of the file only, get the unique columns,
    # and then select only those columns from the full dataframe.
    # this is a hack, but I can't find an elegant way to remove duplicates
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct', sep='\t',
                           skiprows=2, mangle_dupe_cols=False, nrows=2)
    unique_columns = np.unique(gct_df.columns)

    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct', sep='\t',
                           skiprows=2, mangle_dupe_cols=True)

    # clean-up the dataset/dataframe
    gct_df = cleanup_dataframe(gct_df)
    gct_df = gct_df[unique_columns]

    # remove any gene_id starting with 'AFFX-'
    gct_df[gct_df['Name'].str.startswith('AFFX-')].to_excel(writer, sheet_name="affy_info")
    gct_df = gct_df[~gct_df['Name'].str.startswith('AFFX-')]

    #------------------------------
    # HGNC validation
    #------------------------------
    hgnc_df = hgnc_validation.get_hgnc_map()
    hgnc_df.to_excel(writer, sheet_name="hgnc_info")
    hgnc_dict = dict(zip(hgnc_df.entrez_id, hgnc_df.symbol))
    gct_df['HGNC_gene_symbol'] = gct_df['Name'].map(
        lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
    gct_df[['HGNC_gene_symbol', 'Name', 'Description']].to_excel(writer, sheet_name="gene_info")
    gct_df['Name'] = gct_df['Name'].map(lambda gene_id: gene_id.replace('_at', ''))

    #------------------------------
    # barcodes info
    #------------------------------
    barcodes_filename = 'ccle/mRNA-gene-exp/mRNA_names.out.tsv'
    filebuffer = gcs.download_blob_to_file(barcodes_filename)
    # convert into dataframe
    barcodes_df = pd.read_table(filebuffer, header=None,
                                names=['ParticipantBarcode', 'SampleBarcode', 'CCLE_long_name'])
    barcodes_df = cleanup_dataframe(barcodes_df)  # clean-up dataframe

    #------------------------------
    # ignore (drop) all of the columns from the gene-expression matrix
    # that don't have corresponding Participant and Sample barcodes
    #------------------------------
    columns_df = pd.DataFrame(unique_columns)
    columns_df.columns = ['CCLE_long_name']
    samples_map_df = pd.merge(columns_df, barcodes_df, on='CCLE_long_name', how='inner')
    samples_map_df.to_excel(writer, sheet_name="sample_info")

    # select columns that are overlapping
    overlapping_samples = samples_map_df['CCLE_long_name'].tolist()
    overlapping_samples = overlapping_samples + ['Name', 'Description', 'HGNC_gene_symbol']
    gct_df = gct_df[overlapping_samples]
    print gct_df

    # melt the matrix
    value_vars = [col for col in gct_df.columns
                  if col not in ['Name', 'Description', 'HGNC_gene_symbol']]
    melted_df = pd.melt(gct_df, id_vars=['Name', 'Description', 'HGNC_gene_symbol'],
                        value_vars=value_vars)
    melted_df = melted_df.rename(columns={
        'Name': 'gene_id',
        'Description': 'original_gene_symbol',
        'variable': 'CCLE_long_name',
        'value': 'RMA_normalized_expression'
    })

    # merge to get barcode information
    # changed from outer join to inner join. In this case it shouldn't matter, since we already did an
    # inner join while selecting the samples above.
    data_df = pd.merge(melted_df, samples_map_df, on='CCLE_long_name', how='inner')

    data_df['Platform'] = "Affymetrix U133 Plus 2.0"

    # reorder columns
    col_order = ["ParticipantBarcode", "SampleBarcode", "CCLE_long_name", "gene_id",
                 "HGNC_gene_symbol", "original_gene_symbol", "Platform", "RMA_normalized_expression"]
    data_df = data_df[col_order]

    # upload the contents of the dataframe in CSV format
    print "Convert to CSV"
    outfilename = "tcga/intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv"
    df_string = data_df.to_csv(index=False, header=False)
    status = gcs.upload_blob_from_string(outfilename, df_string)
    print status

    # save the excel file
    writer.save()
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]
        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    # (the column was created as 'sample_barcode' above)
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'],
                                         project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using a temporary file location (in case we don't have write permissions on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id, bq_dataset, table_name, schema, source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
def melt_matrix(self, matrix_file, platform, file2info, program_name, config, log):
    """melt matrix"""
    log.info('\t\t\tbegin melt matrix: \'%s\'' % (matrix_file))

    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\t\t\tstart processing saved matrix. size: %s' % (len(data_df2)))
    mod = max(int(len(data_df2) / 20), 1)  # guard against modulo-by-zero on small matrices
    count = 0
    total_count = 0
    # tab-separated header to match the '\t'-joined data rows written below
    header = "sample_barcode\tmirna_id\tmirna_accession\tnormalized_count\tplatform\tproject_short_name\tprogram_name\tsample_type_code" \
             "\tfile_name\tfile_gdc_id\taliquot_barcode\tcase_barcode\tcase_gdc_id\tsample_gdc_id\taliquot_gdc_id\n"
    buf = StringIO()
    buf.write(header)
    for i, j in data_df2.T.iteritems():
        for k, m in j.iteritems():
            aliquot = file2info[k]['aliquot_barcode']
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeCode = aliquot.split("-")[3][0:2]
            info = file2info[k]
            line = "\t".join(map(str, (SampleBarcode, i.split(".")[0], i.split(".")[1], m, platform,
                                       info['project_short_name'], info['program_name'], SampleTypeCode,
                                       info['file_name'], info['file_gdc_id'], aliquot, ParticipantBarcode,
                                       info['case_gdc_id'], info['sample_gdc_id'], info['aliquot_gdc_id']))) + '\n'
            buf.write(line)
            total_count += 1

        if 0 == count % mod:
            log.info('\t\t\t\t\tprocessed %s lines:\n%s' % (count, line))
            file_name = '%s_%s' % (matrix_file.split('/')[-1], count)
            log.info('\t\t\t\tsave %s to GCS' % file_name)
            buf.seek(0)
            df = convert_file_to_dataframe(buf)
            df = cleanup_dataframe(df, log)
            gcs = GcsConnector(config['cloud_projects']['open'], config['buckets']['open'])
            gcs.convert_df_to_njson_and_upload(
                df,
                config[program_name]['process_files']['datatype2bqscript']['Isoform Expression Quantification']['gcs_output_path'] + file_name,
                logparam=log)
            buf = StringIO()
            buf.write(header)
        count += 1

    log.info('\t\t\t\tprocessed %s total lines created %s records' % (count, total_count))
    log.info('\t\t\t\tcompleted save to GCS')
    log.info('\t\t\tfinished melt matrix')
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]
        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'],
                                         project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using a temporary file location (in case we don't have write permissions on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id, bq_dataset, table_name, schema, source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
    else:
        raise Exception('Empty dataframe!')

    return True


if __name__ == '__main__':
    config = json.load(open(sys.argv[1]))

    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    sample_code2letter = config['sample_code2letter']

    # get disease_codes/studies (TODO: this must be changed to get the disease code from the file name)
    df = convert_file_to_dataframe(open(sys.argv[2]))
    df = cleanup_dataframe(df)
    studies = list(set(df['Study'].tolist()))

    # get bq columns (this allows the user to select the columns,
    # without worrying about the index, case-sensitiveness, etc.)
    selected_columns = pd.read_table(sys.argv[3], names=['bq_columns'])
    transposed = selected_columns.T
    transposed.columns = transposed.loc['bq_columns']
    transposed = cleanup_dataframe(transposed)
    bq_columns = transposed.columns.values

    # submit threads by disease code
    pm = process_manager.ProcessManager(max_workers=33, db='maf.db', table='task_queue_status')
    for idx, df_group in df.groupby(['Study']):
        future = pm.submit(process_oncotator_output, project_id, bucket_name, df_group, bq_columns, sample_code2letter)
        #process_oncotator_output(project_id, bucket_name, df_group, bq_columns, sample_code2letter)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):
    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])
        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes; for now, we assume each file contains SampleBarcode mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For the complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using a temporary file location (in case we don't have write permissions on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(project_id, bq_dataset, table_name, schema, source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)