def parse_methylation(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('methylation', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df.columns = ['Probe_Id', 'Beta_Value', 'Gene_Symbol', 'Chromosome', 'Genomic_Coordinate']
    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe as CSV (no header, two-decimal floats)
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
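# add_metadata and additional_changes are imported helpers that are not shown
# in this module. A minimal sketch of what add_metadata might do, assuming each
# metadata key/value simply becomes a constant column on the dataframe
# (hypothetical implementation, not the project's actual code):
def add_metadata_sketch(data_df, metadata):
    """Attach metadata key/values to the dataframe as constant columns."""
    for key, value in metadata.items():
        data_df[key] = value
    return data_df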
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    log = configure_logging('mirna.isoform.transform', "logs/mirna_isoform_transform_" + metadata['AliquotBarcode'] + '.log')

    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))
        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)

        # main steps: download, convert to df, cleanup, transform, add metadata
        log.info('\tadd changes and metadata for %s' % (metadata['AliquotBarcode']))
        data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, log=log)
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)

        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception as e:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        raise e

    return status
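# A hypothetical invocation of parse_isoform (placeholder project/bucket/blob
# paths; metadata carries only the key the function actually reads,
# AliquotBarcode):
#
#   status = parse_isoform('my-project', 'my-bucket',
#                          'mirna/isoform/example_quantification.txt',
#                          'mirna/isoform/example_quantification.json',
#                          {'AliquotBarcode': 'TCGA-OR-A5J1-01A-11D-A29J-05'})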
def main():
    """Example to download a file from Google Storage, transform,
       and load to Google Storage and BigQuery
    """
    project_id = ''
    bucket_name = ''

    # example file in bucket
    filename = ''
    outfilename = ''

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
                      .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage;
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
    print status
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('protein', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)

    return status
def parse_cnv(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('cnv', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)

    return status
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)

    return status
def main():
    """Example to download a file from Google Storage, transform,
       and load to Google Storage and BigQuery
    """
    project_id = ''
    bucket_name = ''

    # example file in bucket
    filename = 'TCGA-OR-A5J1-01A-11D-A29J-05.txt'
    outfilename = ''

    # alternative: read a local copy of the file into a pandas dataframe
    data_df = pandas.read_table(filename, sep="\t", skiprows=1, lineterminator='\n', comment='#')

    # clean up the dataframe for upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download the blob and convert to df (replaces the local read above)
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
                      .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage;
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
    print status
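# cleanup_dataframe is called above but defined elsewhere. BigQuery column
# names may only contain letters, digits, and underscores, so a minimal sketch
# (an assumption, not the project's actual implementation) could normalize
# headers like this:
import re

def cleanup_dataframe_sketch(df):
    """Make column names BigQuery-safe: replace illegal characters with '_'."""
    df.columns = [re.sub(r'[^A-Za-z0-9_]', '_', str(col)) for col in df.columns]
    return df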
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns, sample_code2letter):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting these columns to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe; we use this to merge the per-file dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1
        log.info('-' * 10 + "{0}: Processing file {1}".format(file_count, oncotator_file) + '-' * 10)

        try:
            gcs = GcsConnector(project_id, bucket_name)
            # convert the file to a dataframe
            filename = 'tcga/intermediary/MAF/oncotator_output_files/' + oncotator_file
            df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
        except Exception as e:
            print e
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc.
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(disease_bigdata_df, sample_code2letter)

        # enforce unique mutation
        unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']

        # merge mutations from multiple centers
        concat_df = []
        for idx, df_group in disease_bigdata_df.groupby(unique_mutation):
            if len(df_group) > 1:
                # tolist; unique list; sort; concat
                df_group.loc[:, 'Center'] = ";".join(map(str, sorted(list(set(df_group['Center'].tolist())))))
            # append every group so singleton mutations are kept too
            concat_df.append(df_group)
        disease_bigdata_df = pd.concat(concat_df)

        # enforce unique mutation
        disease_bigdata_df = remove_duplicates(disease_bigdata_df, unique_mutation)

        # convert the df to new-line JSON and then upload the file
        gcs.convert_df_to_njson_and_upload(disease_bigdata_df, "tcga/intermediary/MAF/bigquery_data_files/{0}.json".format(study))
    else:
        raise Exception('Empty dataframe!')

    return True
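# remove_duplicates is defined elsewhere; given how it is used here (one row
# per unique_mutation key after centers are merged), a minimal pandas sketch
# (an assumption, not the project's actual implementation) is:
def remove_duplicates_sketch(df, unique_columns):
    """Keep the first row for each unique combination of the key columns."""
    return df.drop_duplicates(subset=unique_columns, keep='first')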
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns, sample_code2letter, oncotator_object_path, oncotator_object_output_path):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting these columns to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe; we use this to merge the per-file dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1
        log.info('-' * 10 + "{0}: Processing file {1}".format(file_count, oncotator_file) + '-' * 10)

        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            gcs = GcsConnector(project_id, bucket_name)
            log.info('%s: converting %s to dataframe' % (study, filename))
            df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, log=log)
            log.info('%s: done converting %s to dataframe' % (study, filename))
        except RuntimeError as re:
            log.warning('%s: problem cleaning dataframe for %s: %s' % (study, filename, re))
            # skip this file; df would otherwise be undefined (or stale) below
            continue
        except Exception as e:
            log.exception('%s: problem converting to dataframe for %s: %s' % (study, filename, e))
            raise e

        if df.empty:
            log.warning('%s: empty dataframe for file: %s' % (study, oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc.
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file({3}) {1}. rows: {2}".format(file_count, oncotator_file, len(df), study) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates
        log.info('\tcalling check_duplicates to collapse aliquots with %s rows' % (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(disease_bigdata_df, sample_code2letter, log)
        log.info('\tfinished check_duplicates to collapse aliquots with %s rows' % (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = ['Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']

        # merge mutations from multiple centers
        log.info('\tconsolidate the centers for duplicate mutations into list for %s' % (study))
        seencenters = set()

        def concatcenters(df_group):
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(concatcenters)
        log.info('\tfinished consolidating centers for duplicate mutations for %s' % (study))

        # enforce unique mutation
        log.info('\tcalling remove_duplicates to collapse mutations with %s rows for %s' % (len(disease_bigdata_df), study))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df, unique_mutation)
        log.info('\tfinished remove_duplicates to collapse mutations with %s rows for %s' % (len(disease_bigdata_df), study))

        # convert the disease_bigdata_df to new-line JSON and upload the file
        uploadpath = oncotator_object_output_path + "{0}.json".format(study)
        log.info('%s: uploading %s to GCS' % (study, uploadpath))
        gcs.convert_df_to_njson_and_upload(disease_bigdata_df, uploadpath)
        log.info('%s: done uploading %s to GCS' % (study, uploadpath))
    else:
        log.warning('Empty dataframe for %s in %s!' % (oncotator_file, study))

    return True
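# A hypothetical driver for process_oncotator_output. The data_library frame
# mirrors the two columns the function reads ('Study' and 'filename'); all ids
# and paths are placeholders, and bq_columns/sample_code2letter would come from
# project configuration:
#
#   data_library = pd.DataFrame({
#       'Study': ['BRCA', 'BRCA'],
#       'filename': ['center_a.maf.oncotator.txt', 'center_b.maf.oncotator.txt']})
#   process_oncotator_output('my-project', 'my-bucket', data_library,
#                            bq_columns, sample_code2letter,
#                            'tcga/intermediary/MAF/oncotator_output_files/',
#                            'tcga/intermediary/MAF/bigquery_data_files/')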