def search_files(config):
    """Search the open bucket for MAF files for the configured tumor types."""
    # project id
    project_id = config['project_id']

    # bucket name
    bucket_name = config['buckets']['open']

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    #---------------------
    # search for MAF files
    #---------------------
    maf_file = re.compile(r"^.*\.maf$")

    # search only these tumor types defined in config files
    search_tumors = ["tcga/" + d.lower() for d in config['all_tumor_types']]

    data_library = gcs.search_files(search_patterns=['.maf'],
                                    regex_search_pattern=maf_file,
                                    prefixes=search_tumors)
    data_library['basefilename'] = map(
        lambda x: os.path.splitext(os.path.basename(x))[0],
        data_library['filename'])
    data_library['unique_filename'] = format_dupe_values(
        data_library['basefilename'])

    return data_library
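# A hypothetical config fragment for search_files(); the key names come from the
# function above, the values below are placeholders rather than project values:
#
#   example_config = {
#       'project_id': 'my-project-id',
#       'buckets': {'open': 'my-open-bucket'},
#       'all_tumor_types': ['ACC', 'BRCA', 'LUAD']
#   }
#   maf_library = search_files(example_config)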
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])
    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = [k for d, k in data_df['SampleBarcode'].iteritems()]
    file_list = [k for d, k in data_df['filenamepath'].iteritems()]
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)
def upload_batch_etl(self, config, outputdir, paths, file2info, endpt_type,
                     program_name, project, data_type, log):
    log.info('\tstart upload_batch_etl() for %s and %s' % (project, data_type))
    try:
        complete_df = self.process_paths(config, outputdir, data_type, paths,
                                         program_name, project, file2info, log)
        if complete_df is not None:
            etl_uploaded = True
            gcs = GcsConnector(config['cloud_projects']['open'],
                               config['buckets']['open'])
            keyname = config['buckets']['folders']['base_run_folder'] + \
                'etl/%s/%s/%s/%s' % (endpt_type, project, data_type,
                                     paths[0].replace('/', '_'))
            log.info('\t\tstart convert and upload %s to the cloud' % (keyname))
            gcs.convert_df_to_njson_and_upload(complete_df, keyname, logparam=log)
            log.info('\t\tfinished convert and upload %s to the cloud' % (keyname))
        else:
            etl_uploaded = False
            log.info('\t\tno upload for this batch of files')
    except Exception as e:
        log.exception('problem finishing the etl: %s' % (e))
        raise
    log.info('\tfinished upload_batch_etl() for %s and %s' % (project, data_type))
    return etl_uploaded
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('protein', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename, skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename,
                                                metadata=metadata)
    return status
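# A minimal, hypothetical call sketch for parse_protein() above. The project,
# bucket, object paths, and barcode are placeholders; the real metadata dict
# would also carry whatever additional keys add_metadata() expects:
#
#   metadata = {'AliquotBarcode': 'TCGA-OR-A5J1-01A-11D-A29J-05'}
#   status = parse_protein('my-project-id', 'my-open-bucket',
#                          'tcga/acc/example_protein_input.txt',
#                          'tcga/intermediary/protein/example_output.json',
#                          metadata)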
def get_sdrf_info(project_id, bucket_name, disease_codes, header,
                  set_index_col, search_patterns):
    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)

            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)
            sdrf_df = cleanup_dataframe(sdrf_df)
            sdrf_df["Study"] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except KeyError:
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    log = configure_logging('mirna.isoform.transform',
                            "logs/mirna_isoform_transform_" +
                            metadata['AliquotBarcode'] + '.log')
    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))
        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)

        # main steps: download, convert to df, cleanup, transform, add metadata
        log.info('\tadd changes and metadata for %s' % (metadata['AliquotBarcode']))
        data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                    filename, log=log)
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)

        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception as e:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        raise e
    return status
def parse_methylation(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('methylation', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename, skiprows=1)
    data_df.columns = ['Probe_Id', 'Beta_Value', 'Gene_Symbol',
                       'Chromosome', 'Genomic_Coordinate']
    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe in CSV format
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
def main():
    """Example to download a file from Google Storage, transform,
       and load to Google Storage and BigQuery
    """
    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = ''
    outfilename = ''

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
                      .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename,
                                                metadata=metadata)
    print status
def test_blob_exists(project_id, bucket_name, df):
    """Checks that each DatafileNameKey blob exists in the bucket"""
    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    for idx, row in df.iterrows():
        records = row.to_dict()
        print records['DatafileNameKey']
        blob_status = gcs.check_blob_exists(records['DatafileNameKey'])
        assert blob_status is True, "Blob doesn't exist"
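# A minimal, hypothetical invocation sketch for the blob-existence check above
# (project, bucket, and object keys are placeholders, not values from this code):
#
#   sample_df = pd.DataFrame({'DatafileNameKey': ['tcga/acc/example_file_1.txt',
#                                                 'tcga/acc/example_file_2.txt']})
#   test_blob_exists('my-project-id', 'my-open-bucket', sample_df)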
def parse_annotations(config):
    """Download the TCGA annotations file using urllib2.Request
       and convert it into a dataframe
    """
    # connect to the google.cloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)

    # use Request to get the json data
    print 'Downloading the TCGA annotations file'
    request = Request(config['tcga_annotations']['input_file'])
    response = urlopen(request)
    annotations_json = response.read()
    data = json.loads(annotations_json)

    all_annotations = []
    for annotation in data['dccAnnotation']:
        annotation = normalize_json(annotation)
        if annotation["items_item"].startswith('TCGA-') and \
                len(annotation["items_item"]) >= 12:
            item = annotation["items_item"]
            annotation['ParticipantBarcode'] = "-".join(item.split("-")[0:3])
            if (len(annotation["items_item"]) > 12 and
                    annotation["items_itemType_itemTypeName"] != "Patient"):
                annotation['SampleBarcode'] = "-".join(item.split("-")[0:4])
            if annotation["items_itemType_itemTypeName"] == "Aliquot":
                aliquot = annotation["items_item"]
                if aliquot.startswith('TCGA-') and len(aliquot) == 28:
                    annotation['AliquotBarcode'] = aliquot
        all_annotations.append(annotation)

    #------------
    # transform
    #------------
    data_df = pd.DataFrame(all_annotations)

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # rename columns
    data_df = rename_columns(data_df)
    print data_df

    #----------
    # upload
    #----------
    # upload the contents of the dataframe in njson format
    print "Uploading the njson file"
    status = gcs.convert_df_to_njson_and_upload(
        data_df, config['tcga_annotations']['output_file'])
    print status
def melt_matrix(matrix_file, Platform, studies_map, config, log):
    """Melt the persisted isoform matrix into one row per (aliquot, mirna) value."""
    log.info('\tbegin melt matrix: \'%s\'' % (matrix_file))

    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\tstart processing saved matrix.  size: %s' % (len(data_df2)))
    mod = max(1, int(len(data_df2) / 20))
    count = 0
    buf = StringIO()
    buf.write("\t".join(["ParticipantBarcode", "SampleBarcode", "AliquotBarcode",
                         "SampleTypeLetterCode", "Study", "Platform", "mirna_id",
                         "mirna_accession", "normalized_count"]) + '\n')
    for i, j in data_df2.T.iteritems():
        if 0 == count % mod:
            log.info('\t\t\tprocessed %s lines' % (count))
        count += 1
        for k, m in j.iteritems():
            aliquot = k.strip(".mirbase20")
            aliquot = aliquot.strip(".hg19")
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeLetterCode = config["sample_code2letter"][aliquot.split("-")[3][0:2]]
            Study = studies_map[aliquot].upper()
            buf.write("\t".join(map(str, (ParticipantBarcode, SampleBarcode, aliquot,
                                          SampleTypeLetterCode, Study, Platform,
                                          i.split(".")[0], i.split(".")[1], m))) + '\n')
    log.info('\t\tprocessed %s total lines' % (count))

    file_name = matrix_file.split('/')[-1]
    log.info('\t\tsave %s to GCS' % file_name)
    buf.seek(0)
    df = convert_file_to_dataframe(buf)
    df = cleanup_dataframe(df)
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    gcs.convert_df_to_njson_and_upload(
        df, config['mirna_isoform_matrix'][Platform]['output_dir'] + file_name)
    log.info('\t\tcompleted save to GCS')
    log.info('\tfinished melt matrix')
def main(config):
    # etl = util.DataETL("isb-cgc", "isb-cgc-open")

    # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile(r"^.*\.isoform\.quantification\.txt\.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=['tcga/intermediary/mirna/isoform/'])

    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace('.json', ''))
    data_library = data_library.query('size > 0')

    conn = sqlite3.connect('../etl-mirna-isoform.db')
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()

    with open('downloadedfiles.txt') as f:
        lines = f.read().splitlines()

    all_files_df = all_files_df[
        all_files_df.DatafileName.isin(data_library.basefilename)]
    all_files_df = all_files_df[~all_files_df.DatafileName.isin(lines)]
    data_library = all_files_df
    print data_library

    conn = sqlite3.connect('etl-isoform-download.db')
    submit_to_queue(data_library, conn, 'task_queue')
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        print 'completed: ', len(queue_df2)
        queue_df = queue_df[
            ~queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey)]
        print 'Not completed: ', len(queue_df)
    except Exception:
        pass
def main(configfilename):
    """Prefer 450K files over 27K (delete transformed 27K files)"""
    with open(configfilename) as configfile:
        config = json.load(configfile)

    print '*' * 30 + "\nExtract - methylation"
    data_library = methylation.extract.identify_data(config)

    hiseq_aliquots = data_library.query('Platform == "HumanMethylation450"')[
        'AliquotBarcode'].drop_duplicates().values.tolist()
    IlluminaGA_df = data_library.query('Platform == "HumanMethylation27"')
    IlluminaHiSeq_GA_df = IlluminaGA_df[
        IlluminaGA_df['AliquotBarcode'].isin(hiseq_aliquots)]

    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)
    for _, row in IlluminaHiSeq_GA_df.iterrows():
        blob = row.to_dict()['OutDatafileNameKey']
        gcs.delete_blob(blob)
def parse_cnv(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('cnv', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
def parse_mirtarbase(config):
    """Download the miRTarBase file with urllib2.Request
       and convert it to a dataframe
    """
    # use Request to get the data (change the link to get the most recent file)
    request = Request(config['mirtarbase']['input_file'])
    socket = urlopen(request)

    # load into a pandas dataframe
    xlsd = pd.ExcelFile(socket, dtype=object)
    df_sheet1 = xlsd.parse(0)  # parse the first sheet (converts it into a dataframe)
    print 'Found {0} rows in the file'.format(len(df_sheet1))

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(df_sheet1)

    replace_column_names = {
        'Species_miRNA': 'miRNA_Species',
        'Species_Target_Gene': 'Target_Gene_Species',
        'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID',
        'miRTarBase_ID': 'miRTarBaseID'
    }
    data_df.columns = [replace_column_names[x] if x in replace_column_names
                       else x for x in data_df.columns]

    # convert to int
    data_df['Target_Gene_EntrezID'] = data_df['Target_Gene_EntrezID']\
        .map(lambda x: str(int(float(x))), na_action='ignore')

    #-------
    # upload
    #-------
    # connect to the google.cloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to google storage
    gcs = GcsConnector(project_id, bucket_name)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(
        data_df, config['mirtarbase']['output_file'])
    print status
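# A hypothetical config fragment for parse_mirtarbase(); the key names come from
# the function above, the values are placeholders (point input_file at the
# current miRTarBase download URL):
#
#   example_config = {
#       'project_id': 'my-project-id',
#       'buckets': {'open': 'my-open-bucket'},
#       'mirtarbase': {
#           'input_file': 'http://example.org/miRTarBase_MTI.xlsx',
#           'output_file': 'reference/mirtarbase/mirtarbase.json'
#       }
#   }
#   parse_mirtarbase(example_config)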
def download_antibody_annotation_files(config, log):
    object_key_template = config['protein']['aa_object_key_template']
    aa_file_dir = config['protein']['aa_file_dir']
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    studies = config['all_tumor_types']
    nonrppa_studies = config['protein']['nonrppa']

    log.info('\tstart downloading antibody annotation files to %s from %s:%s' %
             (aa_file_dir, config['project_id'], config['buckets']['open']))
    if not os.path.isdir(aa_file_dir):
        os.makedirs(aa_file_dir)

    for study in studies:
        if study in nonrppa_studies:
            continue
        keypath = object_key_template % (study.lower(), study.upper())
        log.info('\t\tdownloading %s' % (keypath))
        tmpfile = gcs.download_blob_to_file(keypath)
        with open(aa_file_dir + keypath[keypath.rindex('/'):], 'w') as outfile:
            outfile.write(tmpfile.getvalue())

    log.info('\tdone downloading antibody annotation files')
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
def process_sql2bq(config, etl_config, sql, log):
    log.info("\t\tselect annotation records from db")
    metadata_df = read_mysql_query(config['cloudsql']['host'],
                                   config['cloudsql']['db'],
                                   config['cloudsql']['user'],
                                   config['cloudsql']['passwd'], sql)
    log.info("\t\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    log.info("\tupload data to GCS.")
    project_id = config['cloud_projects']['open']
    bucket_name = config['buckets']['open']
    gcs_file_path = config['buckets']['folders']['base_run_folder'] + 'etl/annotation'
    gcs = GcsConnector(project_id, bucket_name)
    gcs.convert_df_to_njson_and_upload(metadata_df,
                                       gcs_file_path + '/annotation.json',
                                       logparam=log)

    log.info('create the BigQuery table')
    Etl().load(project_id, [etl_config['bq_dataset']], [etl_config['bq_table']],
               [etl_config['schema_file']],
               ['gs://' + bucket_name + '/' + gcs_file_path],
               [etl_config['write_disposition']], 1, log)
def main():
    """Example to download a file from Google Storage, transform,
       and load to Google Storage and BigQuery
    """
    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'TCGA-OR-A5J1-01A-11D-A29J-05.txt'
    outfilename = ''

    # read the stringIO/file into a pandas dataframe
    # load the file into a table
    data_df = pandas.read_table(filename, sep="\t", skiprows=1,
                                lineterminator='\n', comment='#')

    # clean up the dataframe for upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
                      .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename,
                                                metadata=metadata)
    print status
def main():
    """Pipeline"""
    #--------------
    # methylation
    #--------------
    config = json.load(open(sys.argv[1]))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    print '*' * 30 + "\nExtract - methylation"
    data_library = methylation.extract.identify_data(config)

    hiseq_aliquots = data_library.query('Platform == "HumanMethylation450"')[
        'AliquotBarcode'].drop_duplicates().values.tolist()
    IlluminaGA_df = data_library.query('Platform == "HumanMethylation27"')
    IlluminaHiSeq_GA_df = IlluminaGA_df[
        IlluminaGA_df['AliquotBarcode'].isin(hiseq_aliquots)]

    gcs = GcsConnector(project_id, bucket_name)
    for idx, row in IlluminaHiSeq_GA_df.iterrows():
        blob = row.to_dict()['OutDatafileNameKey']
        print gcs.delete_blob(blob)
def generate_oncotator_inputfiles(project_id, bucket_name, filename,
                                  outputfilename, oncotator_columns):
    print(filename)

    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    try:
        maf_df = convert_file_to_dataframe(filebuffer)
    except:
        print 'problem converting %s to a dataframe' % (filename)
        raise

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)
    print maf_df.columns

    # lowercase the column names (simplifies the filtering and renaming below)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns)

    #--------------------------------------------
    # data - manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({'hg19': '37',
                                                         'GRCh37': '37',
                                                         'GRCh37-lite': '37'})

    #---------------------------------------------
    # Filters
    # remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome": map(str, range(1, 23)) + ['X', 'Y'],
        "mutation_status": ['somatic', 'Somatic'],
        "sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq'],
        "ncbi_build": ['37']
    }
    filter_checklist_df = maf_df.isin(filters)
    filter_string = ((filter_checklist_df["chromosome"] == True) &
                     (filter_checklist_df["mutation_status"] == True) &
                     (filter_checklist_df["sequencer"] == True) &
                     (filter_checklist_df["ncbi_build"] == True))
    maf_df = maf_df[filter_string]

    #----------------------------------------------------------
    # Oncotator part: generate intermediate files for Oncotator
    #----------------------------------------------------------
    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build": 'build',
        'chromosome': 'chr',
        'start_position': 'start',
        'end_position': 'end',
        'reference_allele': 'ref_allele',
        'tumor_seq_allele1': 'tum_allele1',
        'tumor_seq_allele2': 'tum_allele2',
        'tumor_sample_barcode': 'tumor_barcode',
        'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just a name change
    for rcol in replace_column_names:
        maf_df.columns = [replace_column_names[x] if x == rcol else x
                          for x in maf_df.columns]
        oncotator_columns = [replace_column_names[y] if y == rcol else y
                             for y in oncotator_columns]

    # remove/mangle any duplicate columns (renaming them a, a.1, a.2, etc.)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    oncotator_df = maf_df[oncotator_columns]
    print "df_columns", len(oncotator_df.columns)
    df_stringIO = oncotator_df.to_csv(sep='\t', index=False,
                                      columns=oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)

    return True
def main():
    """Parse GCT file, merge with barcodes info, melt (tidy),
       and load to Google Storage and BigQuery
    """
    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'ccle/mRNA-gene-exp/CCLE_Expression_Entrez_2012-09-29.gct'
    outfilename = 'test'
    writer = ExcelWriter('ccle.xlsx')

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #------------------------------
    # load the GCT file
    # * the file has duplicate samples (columns)
    #------------------------------
    # To remove duplicates, load only the first few lines of the file, get the
    # unique columns, and then select those columns from the full dataframe.
    # This is a hack, but there is no elegant way to drop the duplicates.
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct', sep='\t',
                           skiprows=2, mangle_dupe_cols=False, nrows=2)
    unique_columns = np.unique(gct_df.columns)
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct', sep='\t',
                           skiprows=2, mangle_dupe_cols=True)

    # clean-up the dataset/dataframe
    gct_df = cleanup_dataframe(gct_df)
    gct_df = gct_df[unique_columns]

    # remove any gene_id starting with 'AFFX-'
    gct_df[gct_df['Name'].str.startswith('AFFX-')].to_excel(
        writer, sheet_name="affy_info")
    gct_df = gct_df[~gct_df['Name'].str.startswith('AFFX-')]

    #------------------------------
    # HGNC validation
    #------------------------------
    hgnc_df = hgnc_validation.get_hgnc_map()
    hgnc_df.to_excel(writer, sheet_name="hgnc_info")
    hgnc_dict = dict(zip(hgnc_df.entrez_id, hgnc_df.symbol))
    gct_df['HGNC_gene_symbol'] = gct_df['Name'].map(
        lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
    gct_df[['HGNC_gene_symbol', 'Name', 'Description']].to_excel(
        writer, sheet_name="gene_info")
    gct_df['Name'] = gct_df['Name'].map(
        lambda gene_id: gene_id.replace('_at', ''))

    #------------------------------
    # barcodes info
    #------------------------------
    barcodes_filename = 'ccle/mRNA-gene-exp/mRNA_names.out.tsv'
    filebuffer = gcs.download_blob_to_file(barcodes_filename)
    # convert into dataframe
    barcodes_df = pd.read_table(filebuffer, header=None,
                                names=['ParticipantBarcode', 'SampleBarcode',
                                       'CCLE_long_name'])
    barcodes_df = cleanup_dataframe(barcodes_df)  # clean-up dataframe

    #------------------------------
    # ignore (drop) all of the columns from the gene-expression matrix
    # that don't have corresponding Participant and Sample barcodes
    #------------------------------
    columns_df = pd.DataFrame(unique_columns)
    columns_df.columns = ['CCLE_long_name']
    samples_map_df = pd.merge(columns_df, barcodes_df,
                              on='CCLE_long_name', how='inner')
    samples_map_df.to_excel(writer, sheet_name="sample_info")

    # select columns that are overlapping
    overlapping_samples = samples_map_df['CCLE_long_name'].tolist()
    overlapping_samples = overlapping_samples + ['Name', 'Description',
                                                 'HGNC_gene_symbol']
    gct_df = gct_df[overlapping_samples]
    print gct_df

    # melt the matrix
    value_vars = [col for col in gct_df.columns
                  if col not in ['Name', 'Description', 'HGNC_gene_symbol']]
    melted_df = pd.melt(gct_df, id_vars=['Name', 'Description', 'HGNC_gene_symbol'],
                        value_vars=value_vars)
    melted_df = melted_df.rename(columns={
        'Name': 'gene_id',
        'Description': 'original_gene_symbol',
        'variable': 'CCLE_long_name',
        'value': 'RMA_normalized_expression'
    })

    # merge to get barcode information
    # changed from outer join to inner join; in this case it shouldn't matter,
    # since we already did an inner join while selecting the samples above
    data_df = pd.merge(melted_df, samples_map_df, on='CCLE_long_name', how='inner')
    data_df['Platform'] = "Affymetrix U133 Plus 2.0"

    # reorder columns
    col_order = ["ParticipantBarcode", "SampleBarcode", "CCLE_long_name", "gene_id",
                 "HGNC_gene_symbol", "original_gene_symbol", "Platform",
                 "RMA_normalized_expression"]
    data_df = data_df[col_order]

    # upload the contents of the dataframe in CSV format
    print "Convert to CSV"
    outfilename = "tcga/intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv"
    df_string = data_df.to_csv(index=False, header=False)
    status = gcs.upload_blob_from_string(outfilename, df_string)
    print status

    # save the excel file
    writer.save()
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns,
                             sample_code2letter):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting them to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe; we use this to merge dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1
        log.info('-' * 10 + "{0}: Processing file {1}".format(
            file_count, oncotator_file) + '-' * 10)

        try:
            gcs = GcsConnector(project_id, bucket_name)
            # convert the file to a dataframe
            filename = 'tcga/intermediary/MAF/oncotator_output_files/' + oncotator_file
            df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
        except Exception as e:
            print e
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(
            disease_bigdata_df, sample_code2letter)

        # enforce unique mutation
        unique_mutation = ['Chromosome', 'Start_Position', 'End_Position',
                           'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
                           'Tumor_AliquotBarcode']

        # merge mutations from multiple centers
        concat_df = []
        for idx, df_group in disease_bigdata_df.groupby(unique_mutation):
            if len(df_group) > 1:
                # tolist; unique list; sort; concat
                df_group.loc[:, 'Center'] = ";".join(
                    map(str, sorted(list(set(df_group['Center'].tolist())))))
            concat_df.append(df_group)
        disease_bigdata_df = pd.concat(concat_df)

        # enforce unique mutation
        disease_bigdata_df = remove_duplicates(disease_bigdata_df, unique_mutation)

        # convert the df to new-line JSON and then upload the file
        gcs.convert_df_to_njson_and_upload(
            disease_bigdata_df,
            "tcga/intermediary/MAF/bigquery_data_files/{0}.json".format(study))
    else:
        raise Exception('Empty dataframe!')

    return True
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns,
                             sample_code2letter, oncotator_object_path,
                             oncotator_object_output_path):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting them to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe; we use this to merge dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1
        log.info('-' * 10 + "{0}: Processing file {1}".format(
            file_count, oncotator_file) + '-' * 10)
        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            gcs = GcsConnector(project_id, bucket_name)
            log.info('%s: converting %s to dataframe' % (study, filename))
            df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                   filename, log=log)
            log.info('%s: done converting %s to dataframe' % (study, filename))
        except RuntimeError as re:
            log.warning('%s: problem cleaning dataframe for %s: %s' %
                        (study, filename, re))
        except Exception as e:
            log.exception('%s: problem converting to dataframe for %s: %s' %
                          (study, filename, e))
            raise e

        if df.empty:
            log.warning('%s: empty dataframe for file: %s' % (study, oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file({3}) {1}. rows: {2}".format(
            file_count, oncotator_file, len(df), study) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates
        log.info('\tcalling check_duplicates to collapse aliquots with %s rows' %
                 (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(
            disease_bigdata_df, sample_code2letter, log)
        log.info('\tfinished check_duplicates to collapse aliquots with %s rows' %
                 (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position',
        #                    'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        #                    'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = ['Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome',
                           'Start_Position', 'End_Position', 'Reference_Allele',
                           'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
                           'Tumor_AliquotBarcode']

        # merge mutations from multiple centers
        log.info('\tconsolidate the centers for duplicate mutations into list for %s' %
                 (study))
        seencenters = set()

        def concatcenters(df_group):
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(
            concatcenters)
        log.info('\tfinished consolidating centers for duplicate mutations for %s' %
                 (study))

        # enforce unique mutation
        log.info('\tcalling remove_duplicates to collapse mutations with %s rows for %s' %
                 (len(disease_bigdata_df), study))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df, unique_mutation)
        log.info('\tfinished remove_duplicates to collapse mutations with %s rows for %s' %
                 (len(disease_bigdata_df), study))

        # convert the disease_bigdata_df to new-line JSON and upload the file
        uploadpath = oncotator_object_output_path + "{0}.json".format(study)
        log.info('%s: uploading %s to GCS' % (study, uploadpath))
        gcs.convert_df_to_njson_and_upload(disease_bigdata_df, uploadpath)
        log.info('%s: done uploading %s to GCS' % (study, uploadpath))
    else:
        log.warning('Empty dataframe for %s in %s!' % (oncotator_file, study))

    return True
def identify_data(config):
    """Gets the metadata info from database"""
    log = configure_logging('data_et', 'logs/data_et.log')
    log.info('start data extract and transform')

    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect data records from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, AliquotUUID,
               DataArchiveName, DataArchiveURL, DataArchiveVersion, DataCenterCode,
               DataCenterName, DataCenterType, DatafileMD5, DatafileName,
               DatafileNameKey, DatafileUploaded, DataLevel, Datatype,
               GenomeReference, IncludeForAnalysis, MAGETabArchiveName,
               MAGETabArchiveURL, Pipeline, Platform, Project, Repository,
               SampleType, SampleTypeCode, SDRFFileName, SDRFFileNameKey,
               SecurityProtocol, Species, Study, wasDerivedFrom, library_strategy,
               state, reason_for_state, analysis_id, analyte_code, last_modified,
               platform_full_name, GG_dataset_id, GG_readgroupset_id
        FROM metadata_data
        WHERE Project = 'TCGA'
          AND study = '%s'
          AND DatafileUploaded = 'true'
          AND DatafileNameKey is not null
          AND IncludeForAnalysis = 'yes'
        """

    studies = config['all_tumor_types']
    for study in studies:
        # connect to db and get results in a dataframe
        log.info("\tselect data records from db for %s" % (study))
        metadata_df = read_mysql_query(host, database, user, passwd,
                                       sqlquery % (study))
        log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

        project_id = config['project_id']
        bucket_name = config['buckets']['open']
        gcs = GcsConnector(project_id, bucket_name)
        log.info("\tupload data to GCS.")
        gcs.convert_df_to_njson_and_upload(
            metadata_df,
            config['data']['output_dir'] + config['data']['bq_table'] +
            '_%s.json' % (study))

    log.info('finished extract and transform')
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):
    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]
        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'],
                                           metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'],
                                         project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata,
                                       tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using a temporary file location (in case we don't have write permissions
    # on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id, bq_dataset, table_name, schema, source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):
    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])
        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes.
            # For now, we assume each file contains SampleBarcode mapping data.
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For the complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile
    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id, bq_dataset, table_name, schema, source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
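# Illustrative only: the shape of a single `files` entry that process_user_gen_files()
# reads above, inferred from the keys the function accesses. The bucket path, column
# names, and platform/pipeline values are hypothetical placeholders, not real project data.
example_files = [
    {
        'FILENAME': 'my-bucket/uploads/expression_matrix.tsv',   # hypothetical GCS path; first segment is the bucket
        'COLUMNS': ['sample_barcode', 'gene', 'expression'],     # headers expected in the uploaded file
        'DATATYPE': 'user_gen',
        'SAMPLEBARCODE': 'sample_barcode',
        'PLATFORM': 'Illumina',
        'PIPELINE': 'user_upload',
    },
]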
def melt_matrix(self, matrix_file, platform, file2info, program_name, config, log):
    """
    # melt matrix
    """
    log.info('\t\t\tbegin melt matrix: \'%s\'' % (matrix_file))

    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\t\t\tstart processing saved matrix. size: %s' % (len(data_df2)))
    mod = int(len(data_df2) / 20)
    if 0 == mod:
        mod = 1  # avoid modulo-by-zero for very small matrices
    count = 0
    total_count = 0
    buf = StringIO()
    buf.write(
        "sample_barcode\tmirna_id\tmirna_accession\tnormalized_count\tplatform\tproject_short_name\tprogram_name\tsample_type_code"
        "\tfile_name\tfile_gdc_id\taliquot_barcode\tcase_barcode\tcase_gdc_id\tsample_gdc_id\taliquot_gdc_id\n")

    for i, j in data_df2.T.iteritems():
        for k, m in j.iteritems():
            aliquot = file2info[k]['aliquot_barcode']
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeCode = aliquot.split("-")[3][0:2]
            info = file2info[k]
            line = "\t".join(map(str, (
                SampleBarcode, i.split(".")[0], i.split(".")[1], m, platform,
                info['project_short_name'], info['program_name'], SampleTypeCode,
                info['file_name'], info['file_gdc_id'], aliquot, ParticipantBarcode,
                info['case_gdc_id'], info['sample_gdc_id'], info['aliquot_gdc_id']))) + '\n'
            buf.write(line)
            total_count += 1

        if 0 == count % mod:
            log.info('\t\t\t\t\tprocessed %s lines:\n%s' % (count, line))
            file_name = '%s_%s' % (matrix_file.split('/')[-1], count)
            log.info('\t\t\t\tsave %s to GCS' % file_name)
            buf.seek(0)
            df = convert_file_to_dataframe(buf)
            df = cleanup_dataframe(df, log)
            gcs = GcsConnector(config['cloud_projects']['open'], config['buckets']['open'])
            gcs.convert_df_to_njson_and_upload(
                df,
                config[program_name]['process_files']['datatype2bqscript']
                    ['Isoform Expression Quantification']['gcs_output_path'] + file_name,
                logparam=log)
            buf = StringIO()
            buf.write(
                "sample_barcode\tmirna_id\tmirna_accession\tnormalized_count\tplatform\tproject_short_name\tprogram_name\tsample_type_code"
                "\tfile_name\tfile_gdc_id\taliquot_barcode\tcase_barcode\tcase_gdc_id\tsample_gdc_id\taliquot_gdc_id\n")
        count += 1

    log.info('\t\t\t\tprocessed %s total lines created %s records' % (count, total_count))
    log.info('\t\t\t\tcompleted save to GCS')
    log.info('\t\t\tfinished melt matrix')
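# Illustrative only: the nested loops in melt_matrix() above amount to un-pivoting a
# gene-by-file matrix into one row per (gene, file) pair. The pandas sketch below shows
# the same reshaping on a toy matrix; the toy values, the hypothetical file2info lookup,
# and the assumption that columns are GDC file ids are for demonstration only.
import pandas as pd

toy = pd.DataFrame(
    {'Gene': ['hsa-mir-21.MIMAT0000076', 'hsa-mir-155.MIMAT0000646'],
     'file-id-1': [10.5, 3.2],
     'file-id-2': [8.1, 4.4]}).set_index('Gene')

file2info = {'file-id-1': {'aliquot_barcode': 'TCGA-AA-0001-01A-01R-0001-13'},
             'file-id-2': {'aliquot_barcode': 'TCGA-AA-0002-01A-01R-0002-13'}}

# stack() turns the wide matrix into one (gene, file) row per cell
melted = toy.stack().reset_index()
melted.columns = ['mirna', 'file_gdc_id', 'normalized_count']
melted['mirna_id'] = melted['mirna'].map(lambda x: x.split('.')[0])
melted['mirna_accession'] = melted['mirna'].map(lambda x: x.split('.')[1])
melted['aliquot_barcode'] = melted['file_gdc_id'].map(lambda f: file2info[f]['aliquot_barcode'])
melted['sample_barcode'] = melted['aliquot_barcode'].map(lambda a: '-'.join(a.split('-')[0:4]))
print melted[['sample_barcode', 'mirna_id', 'mirna_accession', 'normalized_count']]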
def main(config):
    log_filename = 'etl_download_isoform.log'
    log_name = 'etl_download_isoform'
    log = configure_logging(log_name, log_filename)
    log.info('begin downloading isoform files')

    # etl = util.DataETL("isb-cgc", "isb-cgc-open")  # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile("^.*.isoform.quantification.txt.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=[config['mirna_isoform_matrix']['isoform_gcs_dir']])

    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace('.json', ''))
    data_library = data_library.query('size > 0')

    log.info('\tbegin selecting isoform files from sqlite isoform db')
    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()
    log.info('\tfinished selecting isoform files')

    log.info('\tbegin reading from downloaded files')
    with open(config['mirna_isoform_matrix']['isoform_download_prev_files']) as f:
        lines = f.read().splitlines()
    log.info('\tfinished reading from downloaded files')

    log.info('filter files.\n\tfiles in cloud storage: %s\n\tfiles previously marked to download: %s\n%s\n'
             % (len(data_library), len(all_files_df), data_library))
    all_files_df = all_files_df[(all_files_df.DatafileName.isin(data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    log.info('finished filter files: %s\n%s\n' % (len(data_library), data_library))

    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    submit_to_queue(data_library, conn, 'task_queue', log)
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        conn = sqlite3.connect('isoform_download.db')
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        log.info('\tso far completed: %s' % (len(queue_df2)))
        queue_df = queue_df[~(queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        log.info('\tso far not completed: %s' % (len(queue_df)))
    except Exception:
        log.exception('\n++++++++++++++++++++++\n\tproblem filtering completed jobs, ignoring\n++++++++++++++++++++++\n')

    # -----------------------------------------------------
    # thread this with concurrent futures
    # -----------------------------------------------------
    log.info('\tsubmit jobs to process manager')
    pm = process_manager.ProcessManager(max_workers=200, db='isoform_download.db', table='task_queue_status', log=log)
    for count, df in data_library.iterrows():
        row = df.to_dict()
        if 0 == count % 512:
            time.sleep(10)
        if 0 == count % 2048:
            log.info('\t\tsubmitting %s file: %s' % (count, row['DatafileName']))
        if not os.path.isdir(config['mirna_isoform_matrix']['isoform_download_dir'] + row['Platform']):
            os.makedirs(config['mirna_isoform_matrix']['isoform_download_dir'] + row['Platform'])
        outfilename = config['mirna_isoform_matrix']['isoform_download_dir'] + row['Platform'] + "/" + row['DatafileName']
        pm.submit(download_file, project_id, bucket_name, row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)

    log.info('\tsubmitted %s total jobs to process manager' % (count))

    log.info('\tstart process manager completion check')
    pm.start()

    log.info('finished downloading isoform files')
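# Illustrative only: a minimal config dictionary containing the keys main() reads above.
# The project id, bucket name, paths, and database filename are hypothetical placeholders.
example_config = {
    'project_id': 'my-gcp-project',
    'buckets': {'open': 'my-open-bucket'},
    'mirna_isoform_matrix': {
        'isoform_gcs_dir': 'tcga/isoform/',                     # GCS prefix to search for isoform files
        'isoform_file_db': 'isoform_files.db',                  # sqlite db holding the task_queue table
        'isoform_download_prev_files': 'downloaded_files.txt',  # one previously downloaded filename per line
        'isoform_download_dir': '/tmp/isoform_downloads/',      # local download root; per-platform subdirs are created
    },
}
# main(example_config)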