def update_cloudsql_from_bigquery(config, postproc_config, project_name, cloudsql_table, log, data_type=None, endpt_type=None):
    """
    Update rows of a CloudSQL table from the results of a BigQuery query.

    parameters:
        config: run configuration; config['update_cloudsql'] gates whether
            writes actually happen
        postproc_config: map with 'postproc_query' (a %-style query template),
            'postproc_columns' (BQ column name -> CloudSQL column name),
            'postproc_key_column', 'postproc_project', 'postproc_fetch_count'
        project_name: optional project to restrict the BQ query to
        cloudsql_table: name of the CloudSQL table to update
        log: where to write progress and other messages
        data_type: optional data type to restrict the BQ query to
        endpt_type: optional endpoint type (assumed supplied when data_type is)
    """
    # build 'update <table>\nset\n\tcol = %s, ...\nwhere <key> = %s'; the
    # trailing [:-1] on the join drops the final comma left by the per-column
    # template '%s = %%s,'
    update_stmt = 'update %s\nset \n\t%s\nwhere %s = %%s' % (
        cloudsql_table,
        '\n\t'.join('%s = %%s,' % (postproc_config['postproc_columns'][key]) for key in postproc_config['postproc_columns'].keys())[:-1],
        postproc_config['postproc_key_column'])
    # the query template takes a different number of substitutions depending on
    # whether a project (and optionally data type/endpoint type) is supplied
    if project_name:
        if data_type:
            # assumes that endpt_type is also supplied
            query_results = query_bq_table(
                postproc_config['postproc_query'] % (', '.join(postproc_config['postproc_columns'].keys()), endpt_type, project_name, data_type),
                False, postproc_config['postproc_project'], log)
        else:
            query_results = query_bq_table(
                postproc_config['postproc_query'] % (', '.join(postproc_config['postproc_columns'].keys()), project_name),
                False, postproc_config['postproc_project'], log)
    else:
        query_results = query_bq_table(
            postproc_config['postproc_query'] % (', '.join(postproc_config['postproc_columns'].keys())),
            False, postproc_config['postproc_project'], log)
    page_token = None
    log.info('\t\t\tupdate_stmt\n%s' % (update_stmt))
    update_count = 0
    # page through the BigQuery results, applying each page as one batch update
    while True:
        total_rows, rows, page_token = fetch_paged_results(query_results, postproc_config['postproc_fetch_count'], project_name, page_token, log)
        if 0 < total_rows:
            log.info('\t\t\ttotal rows: %s\n\t%s\n\t\t...\n\t%s' % (total_rows, str(rows[0]), str(rows[-1])))
        else:
            log.info('\t\t\tno rows')
            return
        # dry-run support: only write to CloudSQL when explicitly enabled
        if config['update_cloudsql']:
            ISBCGC_database_helper.update(config, update_stmt, log, rows, True)
        update_count += len(rows)
        log.info('\t\t\tupdated %s so far%s' % (update_count, ' for ' + project_name if project_name else ''))
        if not page_token:
            log.info('\t\t\tupdated total of %s rows%s' % (update_count, ' for ' + project_name if project_name else ''))
            return
def select_sample_bq_barcodes(self, program):
    """
    select the case and sample barcodes from the program's Biospecimen
    BigQuery table, raising on any duplicate sample barcode

    returns: (sample barcode set, case barcode set)
    """
    self.log.info('start select %s bq samples' % (program.lower()))
    biospecimen_query = 'SELECT case_barcode, sample_barcode FROM [isb-cgc:%s_bioclin_v0.Biospecimen]' % (program)
    results = query_bq_table(biospecimen_query, True, 'isb-cgc', self.log)
    case_set = set()
    sample_set = set()
    token = None
    while True:
        row_total, page, token = fetch_paged_results(results, 10000, None, token, self.log)
        for entry in page:
            case_value = entry[0].strip()
            sample_value = entry[1].strip()
            # a sample barcode should appear exactly once in Biospecimen
            if sample_value in sample_set:
                raise ValueError('found duplicate sample entry: %s' % (sample_value))
            case_set.add(case_value)
            sample_set.add(sample_value)
        if token:
            self.log.info('\t\tselect %d sample barcodes' % (len(page)))
        else:
            self.log.info('\tselected total of %s sample_barcodes' % (row_total))
            break
    return sample_set, case_set
def main(config_file_name):
    """
    Update the GCS file paths for CCLE bam files in the production CloudSQL
    metadata_data table from the GDC file-id-to-GCS-url BigQuery table.

    parameters:
        config_file_name: path to the JSON configuration file
    """
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)
        log.info('begin updating CCLE paths in production')
        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'
        query_results = query_bq_table(query, True, 'isb-cgc', log)
        _, rows, _ = fetch_paged_results(query_results, 2000, None, None, log)
        log.info('\tcreate map of filename to path')
        name2path = {}
        for row in rows:
            fields = row[1].split('/')
            # drop the gs://bucket prefix, keep the object path, key by filename
            name2path[fields[-1]] = '/'.join(fields[3:])
        log.info('\tfinished map of filename to path')
        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''
        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))
        # now setup and do the update of paths in cloud sql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params += [[name2path[row[0]], row[0]]]
            else:
                not_matched += [row[0]]
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))
        log.info('finished updating CCLE paths in production')
    except Exception:
        # narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt are
        # no longer swallowed; failures are still logged, not re-raised
        if log:
            log.exception('failed to update ccle GCS filepaths')
    finally:
        if log:
            close_log(log)
def call_bigquery(table, log):
    """
    return the set of file_name rows from the GDC metadata BigQuery table
    for the program that owns the given CloudSQL table
    """
    # HG38 tables read from the current-release file data, all others legacy
    use_current = table in ('TARGET_metadata_data_HG38', 'TCGA_metadata_data_HG38')
    bq_table = '[isb-cgc:GDC_metadata.rel8_fileData_current]' if use_current else '[isb-cgc:GDC_metadata.rel8_fileData_legacy]'
    # the program name is the leading component of the table name
    program = table.split('_')[0]
    query_results = query_bq_table('select file_name from {} where cases__project__program__name = "{}"'.format(bq_table, program), True, 'isb-cgc', log)
    all_rows = set()
    token = None
    while True:
        _, page, token = fetch_paged_results(query_results, 20000, None, token, log)
        all_rows.update(page)
        if not token:
            log.info('\t\tfound %s rows ' % (len(all_rows)))
            break
    return all_rows
def get_bq_data_type_barcodes(self, program_name, bq_table, sample_barcode, has_file, log):
    """
    Select the per-project case, sample and (optionally) file barcodes from a
    BigQuery data-type table.

    parameters:
        program_name: program the table belongs to (logging only)
        bq_table: the BigQuery table to select from
        sample_barcode: name of the sample-barcode column in that table
        has_file: when True, also select the file_gdc_id column; otherwise a
            literal "" is selected in its place
        log: where to write progress and other messages

    returns:
        (project2cases, project2samples, project2files): maps of the first
        selected column (project_short_name, or the literal "ALL" for
        Methylation tables) to sets of barcodes/file ids

    raises:
        re-raises any exception from the BigQuery calls after logging it
    """
    log.info('\t\tgetting bq data type barcodes {}-{} for gcs'.format(program_name, bq_table))
    try:
        # Methylation tables are grouped under a single literal "ALL" project
        if 'Methylation' in bq_table:
            project = '"ALL"'
        else:
            project = 'project_short_name'
        if has_file:
            stmt = 'select {}, case_barcode, {}, file_gdc_id from {} group by 1, 2, 3, 4'.format(project, sample_barcode, bq_table)
        else:
            stmt = 'select {}, case_barcode, {}, "" from {} group by 1, 2, 3'.format(project, sample_barcode, bq_table)
        project2cases = {}
        project2samples = {}
        project2files = {}
        results = query_bq_table(stmt, True, 'isb-cgc', self.log)
        count = 0
        page_token = None
        while True:
            total, rows, page_token = fetch_paged_results(results, 1000, None, page_token, self.log)
            count += 1000
            for row in rows:
                cases = project2cases.setdefault(row[0], set())
                cases.add(row[1])
                samples = project2samples.setdefault(row[0], set())
                samples.add(row[2])
                files = project2files.setdefault(row[0], set())
                # bug fix: this was 'if 0 < len(files): files.add(row[3])',
                # which could never add the first file id (the set always
                # started empty), so project2files values stayed empty.
                # add every non-empty file id instead (the no-file query
                # selects "" for this column).
                if row[3]:
                    files.add(row[3])
            if not page_token:
                self.log.info('\tfinished select from {}. selected {} total rows'.format(bq_table, total))
                break
        log.info('\t\tfinished bq data type barcodes {}-{} for gcs'.format(program_name, bq_table))
        return project2cases, project2samples, project2files
    except Exception:
        # narrowed from a bare 'except:'; still logs and re-raises
        log.exception('problem in get_bq_data_type_barcodes()')
        raise
def get_project_counts(self, table, column, values):
    """
    log the per-project row count for rows of `table` whose `column` value
    is one of `values`
    """
    self.log.info('start get project counts for %s:%s' % (table, column))
    in_list = '\', \''.join(values)
    clinical_query = 'SELECT project_short_name, count(*) FROM %s where %s in (\'%s\') group by 1' % (table, column, in_list)
    results = query_bq_table(clinical_query, True, 'isb-cgc', self.log)
    # accumulate the report as a list of lines rather than repeated str +=
    lines = ['\tproject\tcount\n']
    token = None
    while True:
        _, page, token = fetch_paged_results(results, 50, None, token, self.log)
        lines.extend('\t%s\t%d\n' % (entry[0], entry[1]) for entry in page)
        if not token:
            self.log.info('project counts:\n%s' % (''.join(lines)))
            break
def process_image_type(config, image_type, log):
    '''
    load the image metadata of the given type into CloudSQL for every program
    that has images configured, paging through the BigQuery results and then
    verifying barcodes/filenames over the combined rows

    parameters:
        config: configuration mappings
        image_type: the type of image (radiology, tissue or diagnostic)
        log: where to write progress and other messages
    '''
    for program in config['program_names_for_images']:
        # for programs with images, select the appropriate section from the config file
        image_config = config[program]['process_files']['images']
        type_config = image_config[image_type]
        # query the big query table
        select_stmt = type_config['bq_select_template'].format(','.join(type_config['bq_columns']))
        query_results = query_bq_table(select_stmt, type_config['use_legacy'], image_config['target_program'], log)
        combined = []
        token = None
        while True:
            # loop through the big query results
            total_rows, page, token = fetch_paged_results(query_results, image_config['fetch_count'], None, token, log)
            combined += page
            # process updates to the metadata data table; rows without an
            # existing match become inserts
            process_image_records(config, program, image_config, image_type, page, log)
            if not token:
                log.info('\tchecked total of %s rows' % (total_rows))
                break
        verify_barcodes_filenames(config, program, image_config, image_type, combined, log)
def select_clinical_bq_barcodes(self, program):
    """
    select the distinct case barcodes from the program's clinical BigQuery
    table, collecting (and reporting) any duplicate barcodes

    returns: set of distinct case barcodes
    """
    self.log.info('start select %s bq cases' % (program.lower()))
    # CCLE keeps its clinical data in a differently-named table
    if 'CCLE' == program:
        clinical_query = 'SELECT case_barcode FROM [isb-cgc:%s_bioclin_v0.clinical_v0]' % (program)
    else:
        clinical_query = 'SELECT case_barcode FROM [isb-cgc:%s_bioclin_v0.Clinical]' % (program)
    clinical_results = query_bq_table(clinical_query, True, 'isb-cgc', self.log)
    barcodes = set()
    duplicates = set()
    token = None
    while True:
        row_total, page, token = fetch_paged_results(clinical_results, 10000, None, token, self.log)
        for entry in page:
            barcode = entry[0].strip()
            if barcode in barcodes:
                duplicates.add(barcode)
            else:
                barcodes.add(barcode)
        if token:
            self.log.info('\t\tselect %d barcodes' % (len(page)))
        else:
            self.log.info('\tselected total of %s case_barcodes' % (row_total))
            break
    if len(duplicates) > 0:
        self.print_partial_list('duplicate case barcodes in BQ (%s)' % (len(duplicates)), duplicates)
    return barcodes
def get_bq_case_info(self, program, barcode_type, barcodes, tag):
    """
    look up which of the input barcodes (truncated to their case-barcode
    prefix) exist in the program's clinical BigQuery table

    returns: map of each found case_barcode to a set containing that barcode
    """
    if 'CCLE' == program:
        table = 'isb-cgc:CCLE_bioclin_v0.clinical_v0'
    else:
        table = 'isb-cgc:{}_bioclin_v0.Clinical'.format(program)
    # drop the final '-'-separated field of each barcode to form the case barcode
    in_clause = ', '.join('"{}"'.format('-'.join(barcode.split('-')[:-1])) for barcode in set(barcodes))
    stmt = 'SELECT case_barcode FROM [{}] where case_barcode in ({})'.format(table, in_clause)
    results = query_bq_table(stmt, True, 'isb-cgc', self.log)
    found = {}
    token = None
    while True:
        _, page, token = fetch_paged_results(results, 10000, None, token, self.log)
        for entry in page:
            barcode = entry[0].strip()
            found.setdefault(barcode, set()).add(barcode)
        if not token:
            break
    return found
def test_populate_sample_availibility(self):
    """
    Rebuild the TCGA sample-data-availability records for masked somatic
    mutation data from the HG38 somatic mutation BigQuery table.

    NOTE(review): uses dict.iteritems()/itervalues(), so this is Python 2 only.
    """
    bqTable2data_type = {'Somatic_Mutation': 'Masked Somatic Mutation'}
    for bqTable, data_type in bqTable2data_type.iteritems():
        self.log.info('populate_sample_availibility() for %s' % (data_type))
        #remove existing records
        stmt = 'delete from TCGA_metadata_sample_data_availability where metadata_data_type_availability_id = ' \
            '(select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where isb_label = "Somatic_Mutation" and genomic_build = "HG38")'
        ISBCGC_database_helper.update(self.config, stmt, self.log, [[]])
        query_results = query_bq_table(
            'select Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode, fileUUID from [isb-cgc:TCGA_hg38_data_v0.{}] group by 1,2,3'.format(bqTable),
            True, None, self.log)
        page_token = None
        # per-tumor-barcode bookkeeping to avoid re-adding the same file
        barcode2seen_files = {}
        barcode2infos = {}
        infos = []
        while True:
            # loop through the big query results and get the sample_barcode into the info list as populate_sample_availibility()
            # expects it
            total_rows, rows, page_token = fetch_paged_results(query_results, 200000, None, page_token, self.log)
            for row in rows:
                # tumor sample barcode truncated to the 16-character sample form
                tumor = row[0][:16]
                # normal = row[1][:16]
                # fileUUID is a '|'-separated list of file ids
                files = row[2].split('|')
                for curfile in files:
                    if tumor in barcode2seen_files:
                        seen_files = barcode2seen_files[tumor]
                        # NOTE(review): this tests the full row[2] string against a
                        # set that holds individual file ids (curfile) — likely
                        # intended to be 'if curfile in seen_files'; confirm before
                        # changing
                        if row[2] in seen_files:
                            continue
                        seen_files.add(curfile)
                    else:
                        barcode2seen_files[tumor] = set([curfile])
                    samples_tumor = {'submitter_id': tumor}
                    sample_list = [samples_tumor]
                    info = {'access': 'open'}
                    case_list = info.setdefault('cases', [])
                    case_list += [{'samples': sample_list}]
                    barcode2infos[tumor] = barcode2infos.setdefault(tumor, []) + [info]
            #
            # samples_normal = {'submitter_id': normal}
            # sample_list = [samples_normal]
            #
            # info = {'access': 'open'}
            # case_list = info.setdefault('cases', [])
            # case_list += [{'samples': sample_list}]
            # barcode2infos[normal] = barcode2infos.setdefault(normal, []) + [info]
            # NOTE(review): this extend runs once per page while barcode2infos
            # accumulates across pages, so with multiple pages earlier pages'
            # infos are appended again (duplicates); with the 200000 fetch size a
            # single page is presumably expected — confirm
            infos += [info for curinfos in barcode2infos.itervalues() for info in curinfos]
            # create inserts into the metadata data that for big query rows that didn't have a match already in the metadata data table
            if not page_token:
                self.log.info('\tprocessed total of %s rows for %s' % (total_rows, bqTable))
                break
        populate_sample_availibility(self.config, 'current', 'TCGA', 'all', data_type, infos, self.log)
        self.log.info('finished populate_sample_availibility() for %s' % (data_type))
bq2cases = {} bq2samples = {} bq2files = {} for table in bq_tables: if not table[0]: continue if not table[2]: sql = 'select {}, "" from {}'.format(case, table[0]) elif not table[1]: sql = 'select "", {} from {}'.format(sample, table[0]) else: sql = 'select {}, {} from {}'.format(case, sample, table[0]) self.log.info('\tstart select for {} from bq{}'.format( table[0], ' where {}'.format(where) if where else '')) results = query_bq_table(sql, True, 'isb-cgc', self.log) count = 0 page_token = None cases = set() samples = set() while True: total, rows, page_token = fetch_paged_results( results, 1000, None, page_token, self.log) count += 1000 for row in rows: cases.add(row[0]) samples.add(row[1]) if not page_token: self.log.info( '\tfinished select from {}. select {} total rows'.
def get_bq_file_info(self, program, barcode_type, barcodes, tag):
    """
    For each input barcode, collect the data type, format, experimental
    strategy, platform and workflow info of its files by joining the
    program's data tables against the GDC file metadata tables.

    parameters:
        program: 'TARGET' or 'TCGA' (others return an empty result)
        barcode_type: name of the barcode column to match on (e.g. 'case_barcode')
        barcodes: barcodes to look up
        tag: unused here; kept for interface compatibility

    returns:
        map of barcode -> list of [data_type, data_format, experimental_strategy,
        platform, workflow] entries

    raises:
        ValueError when BigQuery returns a barcode not in the input set
    """
    program2dataset2data_type2column_name = {
        'TARGET': {
            'TARGET_hg38_data_v0': {
                'RNAseq_Gene_Expression': 'file_gdc_id',
                'miRNAseq_Expression': 'file_gdc_id',
                'miRNAseq_Isoform_Expression': 'file_gdc_id'
            },
        },
        'TCGA': {
            'TCGA_hg19_data_v0': {
                'Copy_Number_Segment_Masked': 'aliquot_barcode',
                'DNA_Methylation': 'aliquot_barcode',
                'Protein_Expression': 'aliquot_barcode',
                'RNAseq_Gene_Expression_UNC_RSEM': 'aliquot_barcode',
                'miRNAseq_Expression': 'file_gdc_id',
                'miRNAseq_Isoform_Expression': 'file_gdc_id',
                'Somatic_Mutation_DCC': 'aliquot_barcode_tumor',
                'Somatic_Mutation_MC3': 'aliquot_barcode_tumor'
            },
            'TCGA_hg38_data_v0': {
                'Copy_Number_Segment_Masked': 'file_gdc_id',
                'RNAseq_Gene_Expression': 'file_gdc_id',
                'miRNAseq_Expression': 'file_gdc_id',
                'miRNAseq_Isoform_Expression': 'file_gdc_id',
                'DNA_Methylation': 'file_gdc_id',
                'Protein_Expression': 'aliquot_barcode',
                'Somatic_Mutation': 'fileName'
            }
        }
    }
    # hoist the dedup and the membership set out of the loops (list 'in' was
    # O(n) per returned row)
    barcode_set = set(barcodes)
    barcodes_in = ','.join('"{}"'.format(barcode) for barcode in barcode_set)
    query_template = 'select left(data.{0}, {8}), meta.data_type, meta.data_format, meta.experimental_strategy, meta.platform, {1} from [isb-cgc:{2}.{3}] data join [isb-cgc:GDC_metadata.{4}] meta\n' \
        ' on data.{5} = meta.{6}\nwhere data.{0} in ({7})\ngroup by 1,2,3,4,5,6'
    barcode2infos = {}
    dataset2data_type2column_name = program2dataset2data_type2column_name.setdefault(program, {})
    for dataset in dataset2data_type2column_name:
        # hg19 datasets join the legacy file metadata (no workflow column)
        if 'hg19' in dataset:
            column_sub = '"None"'
            table = 'rel5_legacy_fileData'
        else:
            column_sub = 'meta.analysis_workflow_type'
            table = 'rel5_current_fileData'
        data_type2column_name = dataset2data_type2column_name.setdefault(dataset, {})
        # .items() instead of py2-only .iteritems() (works on both 2 and 3)
        for data_type, column_name in data_type2column_name.items():
            if column_name in ('aliquot_barcode', 'Tumor_Sample_Barcode', 'aliquot_barcode_tumor'):
                join_col = 'associated_entities__entity_submitter_id'
            # bug fix: was "in ('fileName')" — a substring test on the string
            # 'fileName', not tuple membership (missing trailing comma)
            elif column_name in ('fileName',):
                join_col = 'file_name'
            else:
                join_col = column_name
            if data_type in ('Somatic_Mutation', ):
                barcode_t = 'Tumor_Sample_Barcode'
            elif data_type in ('Somatic_Mutation_MC3', 'Somatic_Mutation_DCC'):
                barcode_t = 'sample_barcode_tumor'
            else:
                barcode_t = barcode_type
            # length to truncate the barcode column to (left(..., length))
            if 'case_barcode' == barcode_type:
                if program == "CCLE":
                    length = 200
                elif program == 'TARGET':
                    length = 16
                else:
                    length = 12
            else:
                length = 200
            query = query_template.format(barcode_t, column_sub, dataset, data_type, table, column_name, join_col, barcodes_in, length)
            results = query_bq_table(query, True, 'isb-cgc', self.log)
            page_token = None
            while True:
                _, rows, page_token = fetch_paged_results(results, 10000, None, page_token, self.log)
                for row in rows:
                    barcode = row[0]
                    if barcode in barcode_set:
                        barcode2infos[barcode] = barcode2infos.setdefault(barcode, []) + [[row[1], row[2], str(row[3]), str(row[4]), str(row[5])]]
                    else:
                        # bug fix: the format string had one placeholder but two
                        # arguments, silently dropping the offending row
                        raise ValueError('unexpected mismatch of return with barcodes:\n{}\n{}'.format(', '.join(barcodes), ', '.join(str(field) for field in row)))
                if not page_token:
                    break
    return barcode2infos
def main(config_file_name):
    """
    Update the GCS file paths for TCGA legacy bam files in the production
    CloudSQL metadata_data table from the GDC file-id-to-GCS-url BigQuery
    table, paging through the BigQuery results.

    parameters:
        config_file_name: path to the JSON configuration file
    """
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + 'tcga/'
        log_name = create_log(log_dir, 'update_tcga_gcs_paths')
        log = logging.getLogger(log_name)
        log.info('begin updating TCGA paths in production')
        # get the db rows from production cloudsql
        log.info('\tselect tcga filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'TCGA\''
        cloudsql_rows = set(row[0] for row in helper.select(config, query, log, []))
        log.info('\tselected %s tcga filenames from cloudsql' % (len(cloudsql_rows)))
        # read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'TCGA\') and 0 < instr(file_gcs_url, \'legacy\') and 0 < instr(file_gcs_url, \'bam\') ' \
            'order by file_gcs_url'
        query_results = query_bq_table(query, True, 'isb-cgc', log)
        total_not_matched = 0
        total_distinct = set()
        page_token = None
        while True:
            total_rows, rows, page_token = fetch_paged_results(query_results, 2000, None, page_token, log)
            log.info('\t\tcreate map of filename to path')
            name2path = {}
            for row in rows:
                fields = row[1].split('/')
                # drop the gs://bucket prefix, keep the object path, key by filename
                name2path[fields[-1]] = '/'.join(fields[3:])
            log.info('\t\tfinished map of filename to path')
            # now setup and do the update of paths in cloud sql
            log.info('\t\tstart updating paths in cloudsql')
            params = []
            select_params = []
            not_matched = []
            # .items() instead of py2-only .iteritems() (works on both 2 and 3)
            for name, path in name2path.items():
                if name in cloudsql_rows:
                    total_distinct.add(name)
                    params += [[path, name]]
                    select_params += [name]
                else:
                    not_matched += [path]
            update = 'update metadata_data set datafilenamekey = %s, datafileuploaded = \'true\' where datafilename = %s'
            helper.update(config, update, log, params)
            # sanity check: count how many of the updated names exist in cloudsql
            select_in = '%s,' * len(select_params)
            select_in = select_in[:-1]
            select_query = 'select count(*) from metadata_data where datafilename in (%s)' % (select_in)
            count = helper.select(config, select_query, log, select_params)
            log.info('select %s file name matches for %s file names.' % (count[0][0], len(select_params)))
            total_not_matched += len(not_matched)
            if not page_token:
                log.info('\t\tupdated total of %s rows for TCGA with %d distinct file names' % (total_rows - total_not_matched, len(total_distinct)))
                break
            else:
                log.info('\t\tupdated %d rows, did not find matches from BQ in cloudsql for %d:\n\t%s' % (len(params), len(not_matched), '\n\t'.join(not_matched)))
        log.info('\tcompleted update of paths in cloudsql')
        log.info('finished updating TCGA paths in production')
    except Exception:
        # narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt are
        # no longer swallowed; failures are still logged, not re-raised
        if log:
            log.exception('failed to update tcga GCS filepaths')
    finally:
        if log:
            close_log(log)