def identify_data(config):
    """Gets the metadata info from the database."""
    log = configure_logging('cnv', 'logs/cnv_extract.log')
    log.info('start cnv extract')

    # CloudSQL connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect file names from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
               SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
               DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Copy Number Results-SNP'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%.nocnv_hg19.seg.txt'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # because the number of rows exceeds the import limitation for GCS to
    # BigQuery, rotate the output files into multiple virtual directories
    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    num_suffix = len(dir_suffixes)
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, 'OutDatafileNameKey'] = dir_prefix + dir_suffixes[i % num_suffix] \
            + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
        .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC (cell line) samples as non-tumor
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'cnv.transform.parse_cnv'
    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields; remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]
    log.info("\tAfter filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape)))

    log.info('finished cnv extract')
    return metadata_df
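
# Illustrative sketch (not part of the pipeline): how the suffix rotation above
# spreads output keys across virtual directories so that no single GCS prefix
# exceeds the GCS-to-BigQuery import limit. The prefix, suffixes, and file
# names below are invented example values, not the real pipeline config.
def _demo_output_key_rotation():
    dir_prefix = 'tcga/cnv/'                      # assumed example prefix
    dir_suffixes = ['dir0/', 'dir1/', 'dir2/']    # assumed example suffixes
    filenames = ['a.seg.txt', 'b.seg.txt', 'c.seg.txt', 'd.seg.txt']
    num_suffix = len(dir_suffixes)
    for i, name in enumerate(filenames):
        # row i lands in directory i % num_suffix, cycling through the suffixes
        print(dir_prefix + dir_suffixes[i % num_suffix] + name + '.json')
    # tcga/cnv/dir0/a.seg.txt.json
    # tcga/cnv/dir1/b.seg.txt.json
    # tcga/cnv/dir2/c.seg.txt.json
    # tcga/cnv/dir0/d.seg.txt.json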
def identify_data(config):
    """Gets the metadata info from the database."""
    # setup logging
    log = configure_logging('methylation_extract',
                            'logs/methylation_extract.log')
    log.info('start methylation extract')

    # CloudSQL connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
               SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
               DatafileUploaded, IncludeForAnalysis
        FROM metadata_data
        WHERE DataType='DNA Methylation'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    log.info("\tselect file names from db")
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build the per-platform output file name key
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        platform = row['Platform']
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['methylation']['output_dir']\
            + platform + '/' + metadata['DatafileName'] + '.csv'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
        .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC (cell line) samples as non-tumor
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'methylation.transform.parse_methylation'
    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields; remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]
    log.info("\tAfter filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape)))

    log.info('finished methylation extract')
    return metadata_df
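
# Illustrative sketch: the 'sample_code2letter' lookup used above maps TCGA
# sample-type codes to letter codes, and rows whose letter code is CELLC
# (cell line) are then flagged as non-tumor. The three mapping entries are a
# hedged example; the real config carries the full TCGA sample-type table.
def _demo_sample_code_mapping():
    import pandas as pd
    config = {'sample_code2letter': {'01': 'TP', '10': 'NB', '50': 'CELLC'}}
    df = pd.DataFrame({'SampleTypeCode': ['01', '10', '50']})
    df['SampleTypeLetterCode'] = df['SampleTypeCode'].map(
        lambda code: config['sample_code2letter'][code])
    # note: is_tumor only excludes cell lines, so normals also get True,
    # matching the pipeline code above
    df['is_tumor'] = (df['SampleTypeLetterCode'] != 'CELLC')
    return df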
def identify_data(config):
    """Gets the metadata info from the database."""
    # CloudSQL connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
               SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
               DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Expression-miRNA'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%.mirna.quantification.txt'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # rename platforms in rows
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        if 'IlluminaHiSeq' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaHiSeq'
            platform = 'IlluminaHiSeq'
        elif 'IlluminaGA' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaGA'
            platform = 'IlluminaGA'
        else:
            raise Exception('Did not match any given platform')
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mirna']['mirna']['output_dir']\
            + platform + '/' + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
        .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC (cell line) samples as non-tumor
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'mirna.mirna.transform.parse_mirna'
    print "Found {0} rows, columns.".format(str(metadata_df.shape))

    # Filter - check all "is_" fields; remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]
    print "After filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape))

    return metadata_df
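
# Illustrative sketch: the loop above collapses full platform strings onto the
# two Illumina family names that are used in the output paths. The input
# strings here are assumed examples of what the Platform column may contain.
def _demo_platform_normalization():
    for full_name in ['IlluminaHiSeq_miRNASeq', 'IlluminaGA_miRNASeq']:
        if 'IlluminaHiSeq' in full_name:
            platform = 'IlluminaHiSeq'
        elif 'IlluminaGA' in full_name:
            platform = 'IlluminaGA'
        else:
            # any other platform is a hard error, as in the pipeline code
            raise Exception('Did not match any given platform')
        print(platform)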
def identify_data(config):
    """Gets the metadata info from the database."""
    # CloudSQL connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
               SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
               DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='RSEM_genes_normalized'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%.rsem.genes.normalized_results'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # rename platforms in rows
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        if 'IlluminaHiSeq' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaHiSeq'
            platform = 'IlluminaHiSeq'
        elif 'IlluminaGA' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaGA'
            platform = 'IlluminaGA'
        else:
            raise Exception('Did not match any given platform')
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mrna']['unc']['output_dir']\
            + platform + '/' + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
        .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC (cell line) samples as non-tumor
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'mrna.unc.transform.parse_unc'
    print "Found {0} rows, columns.".format(str(metadata_df.shape))

    # Filter - check all "is_" fields; remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]
    print "After filtering: Found {0} rows, columns.".format(str(metadata_df.shape))

    return metadata_df
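
# Illustrative sketch: every extract strips a single leading '/' from
# DatafileNameKey so the value can be used directly as a GCS object name.
# Only a slash at the very start of the string is removed.
def _demo_strip_leading_slash():
    import re
    for key in ['/tcga/path/file.txt', 'tcga/path/file.txt']:
        print(re.sub(r"^/", "", key))  # both print: tcga/path/file.txt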
def identify_data(config):
    """Gets the metadata info from the database."""
    # CloudSQL connection params
    host = config["cloudsql"]["host"]
    database = config["cloudsql"]["db"]
    user = config["cloudsql"]["user"]
    passwd = config["cloudsql"]["passwd"]

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
               SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
               DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Expression-Protein'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """
    # AND DatafileNameKey LIKE '%protein_expression.%.txt'

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build the output file name key
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, "OutDatafileNameKey"] = config["protein"]["output_dir"] \
            + metadata["DatafileName"] + ".json"

    metadata_df.loc[:, "SampleTypeLetterCode"] = metadata_df["SampleTypeCode"].map(
        lambda code: config["sample_code2letter"][code]
    )
    metadata_df.loc[:, "DatafileNameKey"] = metadata_df["DatafileNameKey"].map(
        lambda inputfile: re.sub(r"^/", "", inputfile)
    )

    # tag CELLC (cell line) samples as non-tumor
    metadata_df["is_tumor"] = metadata_df["SampleTypeLetterCode"] != "CELLC"
    metadata_df["transform_function"] = "protein.transform.parse_protein"
    print "Found {0} rows, columns.".format(str(metadata_df.shape))

    # Filter - check all "is_" fields; remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]
    print "After filtering: Found {0} rows, columns.".format(str(metadata_df.shape))

    return metadata_df
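
# Illustrative sketch: the "is_" flag filter shared by all of these extracts.
# Every column whose name starts with "is_" must be boolean, and a row
# survives only if all of its flags are True. The DataFrame is toy data.
def _demo_flag_filter():
    import pandas as pd
    df = pd.DataFrame({
        'AliquotBarcode': ['AA', 'BB', 'CC'],
        'is_tumor': [True, True, False],
    })
    flag_columns = [c for c in df.columns.values if c.startswith('is_')]
    return df[df[flag_columns].all(axis=1)]  # keeps rows 'AA' and 'BB'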
def process_sql2bq(config, etl_config, sql, log):
    log.info("\t\tselect annotation records from db")
    metadata_df = read_mysql_query(config['cloudsql']['host'],
                                   config['cloudsql']['db'],
                                   config['cloudsql']['user'],
                                   config['cloudsql']['passwd'], sql)
    log.info("\t\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    log.info("\tupload data to GCS.")
    project_id = config['cloud_projects']['open']
    bucket_name = config['buckets']['open']
    gcs_file_path = config['buckets']['folders'][
        'base_run_folder'] + 'etl/annotation'
    gcs = GcsConnector(project_id, bucket_name)
    gcs.convert_df_to_njson_and_upload(metadata_df,
                                       gcs_file_path + '/annotation.json',
                                       logparam=log)

    log.info('create the BigQuery table')
    Etl().load(project_id, [etl_config['bq_dataset']], [etl_config['bq_table']],
               [etl_config['schema_file']],
               ['gs://' + bucket_name + '/' + gcs_file_path],
               [etl_config['write_disposition']], 1, log)
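
# Hedged usage sketch for process_sql2bq. The config keys mirror the ones the
# function reads; every concrete value, the etl_config contents, and the SQL
# statement are invented placeholders, not the real pipeline configuration.
def _demo_process_sql2bq():
    config = {
        'cloudsql': {'host': 'HOST', 'db': 'DB', 'user': 'USER', 'passwd': 'PASSWD'},
        'cloud_projects': {'open': 'my-open-project'},
        'buckets': {
            'open': 'my-open-bucket',
            'folders': {'base_run_folder': 'runs/2015-01-01/'},
        },
    }
    etl_config = {
        'bq_dataset': 'tcga_metadata',             # assumed dataset name
        'bq_table': 'annotation',                  # assumed table name
        'schema_file': 'schemas/annotation.json',  # assumed schema path
        'write_disposition': 'WRITE_TRUNCATE',
    }
    log = configure_logging('annotation', 'logs/annotation_etl.log')
    process_sql2bq(config, etl_config,
                   "SELECT * FROM metadata_annotation", log)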
def identify_data(config):
    """Gets the metadata info from the database."""
    # CloudSQL connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    # sqlquery = """
    #     SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
    #            SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
    #            DatafileUploaded, IncludeForAnalysis, DataCenterName
    #     FROM metadata_data
    #     WHERE DatafileUploaded='true'
    #     AND DatafileNameKey LIKE '%.maf'
    #     AND IncludeForAnalysis='yes'
    #     """
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
               SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
               DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE Datatype='Mutations'
        AND DatafileNameKey LIKE '%.maf'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """
    # sqlquery = """
    #     SELECT datafilename, datafilenamekey
    #     FROM metadata_data
    #     WHERE 0 < instr(datafilename, 'maf')
    #     AND 'true' = datafileuploaded
    #     AND 0 = instr(datafilename, 'protected')
    #     GROUP BY datafilename, dataarchivename;
    #     """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # debug only; re-enable to dump the rows and stop early
    # for i, x in metadata_df.iterrows():
    #     print x.to_dict()
    # sys.exit()

    # build the output file name key
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mirna']['mirna']['output_dir']\
            + metadata['DatafileName'] + '.json'

    # metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
    #     .map(lambda code: config['sample_code2letter'][code])
    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
        .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    metadata_df['transform_function'] = 'mirna.mirna.transform.parse_mirna'
    print "Found {0} rows, columns.".format(str(metadata_df.shape))

    # Filter - check all "is_" fields; remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values
                        if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    # dump the filtered metadata to a spreadsheet
    # (the .xlsx file name below is an assumed placeholder)
    import pandas as pd
    writer = pd.ExcelWriter('maf_files_metadata.xlsx')
    metadata_df.to_excel(writer, 'maf_files_metadata_table')
    writer.save()

    print "After filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape))
    return metadata_df
def identify_data(config):
    """Gets the metadata info from the database."""
    log = configure_logging('data_et', 'logs/data_et.log')
    log.info('start data extract and transform')

    # CloudSQL connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect data records from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, AliquotUUID,
               DataArchiveName, DataArchiveURL, DataArchiveVersion, DataCenterCode,
               DataCenterName, DataCenterType, DatafileMD5, DatafileName, DatafileNameKey,
               DatafileUploaded, DataLevel, Datatype, GenomeReference, IncludeForAnalysis,
               MAGETabArchiveName, MAGETabArchiveURL, Pipeline, Platform, Project,
               Repository, SampleType, SampleTypeCode, SDRFFileName, SDRFFileNameKey,
               SecurityProtocol, Species, Study, wasDerivedFrom, library_strategy, state,
               reason_for_state, analysis_id, analyte_code, last_modified,
               platform_full_name, GG_dataset_id, GG_readgroupset_id
        FROM metadata_data
        WHERE Project = 'TCGA'
        AND study = '%s'
        AND DatafileUploaded='true'
        AND DatafileNameKey is not null
        AND IncludeForAnalysis='yes'
        """

    studies = config['all_tumor_types']
    for study in studies:
        # connect to db and get results in a dataframe
        log.info("\tselect data records from db for %s" % (study))
        metadata_df = read_mysql_query(host, database, user, passwd,
                                       sqlquery % (study))
        log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

        project_id = config['project_id']
        bucket_name = config['buckets']['open']
        gcs = GcsConnector(project_id, bucket_name)
        log.info("\tupload data to GCS.")
        gcs.convert_df_to_njson_and_upload(
            metadata_df,
            config['data']['output_dir'] + config['data']['bq_table']
            + '_%s.json' % (study))

    log.info('finished extract and transform')
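
# Hedged sketch of the read_mysql_query helper that all of the extracts above
# call; the real implementation lives elsewhere in the repo. This version
# assumes pandas plus the MySQLdb driver, and only matches the call sites'
# signature (host, database, user, passwd, sqlquery).
def _sketch_read_mysql_query(host, database, user, passwd, sqlquery):
    import MySQLdb
    import pandas as pd
    conn = MySQLdb.connect(host=host, db=database, user=user, passwd=passwd)
    try:
        # read the query results straight into a dataframe
        return pd.read_sql(sqlquery, conn)
    finally:
        conn.close()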