def select_sample_sql_barcodes(self, program):
    """Fetch the case and sample barcodes stored in the program's biospecimen table.

    Logs any sample barcodes seen more than once and a per-endpoint-type
    summary of distinct sample counts, then returns
    (set of sample barcodes, set of case barcodes).
    """
    self.log.info('start select %s sql cases' % (program.lower()))
    sample_query = 'SELECT case_barcode, sample_barcode FROM %s_metadata_biospecimen' % (program)
    result_rows = ISBCGC_database_helper.select(self.config, sample_query, self.log, params=[])
    cases = set()
    samples = set()
    repeated = []
    for result in result_rows:
        cur_case = result[0].strip()
        cur_sample = result[1].strip()
        # a sample barcode already in the set is a duplicate row
        if cur_sample in samples:
            repeated.append(cur_sample)
        cases.add(cur_case)
        samples.add(cur_sample)
    if repeated:
        self.print_partial_list('found sample duplicates', repeated)
    # self-join on sample_barcode to tally distinct samples per endpoint-type pair
    query = 'select l.endpoint_type, c.endpoint_type, count(distinct l.sample_barcode), count(distinct c.sample_barcode) ' \
        'from {0}_metadata_biospecimen l left join {0}_metadata_biospecimen c ' \
        'on l.sample_barcode = c.sample_barcode ' \
        'group by 1, 2'.format(program)
    result_rows = ISBCGC_database_helper.select(self.config, query, self.log, params=[])
    summary = '\n'.join('%s\t%s\t%d\t%d' % (result[0], result[1], result[2], result[3]) for result in result_rows)
    self.log.info('\nendpoint\tendpoint\tlegacy count\tcurrent count\n%s\n' % (summary))
    return samples, cases
def select_clinical_sql_barcodes(self, program):
    """Fetch the case barcodes stored in the program's clinical table.

    Logs duplicate case barcodes and a per-endpoint-type summary of distinct
    case counts, then returns the set of distinct case barcodes.
    """
    self.log.info('start select %s sql cases' % (program.lower()))
    clinical_query = 'SELECT case_barcode FROM %s_metadata_clinical' % (program)
    result_rows = ISBCGC_database_helper.select(self.config, clinical_query, self.log, params=[])
    cases = set()
    repeated = []
    for result in result_rows:
        cur_case = result[0].strip()
        if cur_case in cases:
            # record the duplicate but keep scanning the remaining rows
            repeated.append(cur_case)
            continue
        cases.add(cur_case)
    if repeated:
        self.print_partial_list('found case duplicates', repeated)
    # self-join on case_barcode to tally distinct cases per endpoint-type pair
    query = 'select l.endpoint_type, c.endpoint_type, count(distinct l.case_barcode), count(distinct c.case_barcode) ' \
        'from {0}_metadata_clinical l left join {0}_metadata_clinical c ' \
        'on l.case_barcode = c.case_barcode ' \
        'group by 1, 2'.format(program)
    result_rows = ISBCGC_database_helper.select(self.config, query, self.log, params=[])
    summary = '\n'.join('%s\t%s\t%d\t%d' % (result[0], result[1], result[2], result[3]) for result in result_rows)
    self.log.info('\nendpoint\tendpoint\tlegacy count\tcurrent count\n%s' % (summary))
    return cases
def main(configfilename, baminfo_filename):
    """Read GG dataset/readgroupset ids from a BAM-info TSV and apply them to
    the metadata_data table.

    The TSV carries the analysis id in column 0, the readgroupset id in
    column 6 and the dataset id in column 7; rows whose readgroupset id is
    'NA' are skipped.
    """
    log_dir = str(date.today()).replace('-', '_') + '_gg_update' + '/'
    log = logging.getLogger(create_log(log_dir, 'update_gg_metadata'))
    log.info('begin update gg metadata')
    # column layout of the input TSV
    analysis_id_index = 0
    dataset_index = 7
    readgroupset_index = 6
    missing_marker = 'NA'
    update_params = []
    accepted = 0
    with open(baminfo_filename) as tsv_in:
        tsv_in.readline()  # discard the header row
        for row in tsv_in:
            columns = row.strip().split('\t')
            if columns[readgroupset_index] == missing_marker:
                continue
            params = [columns[dataset_index], columns[readgroupset_index], columns[analysis_id_index]]
            # progress logging every 1000 accepted rows
            if 0 == accepted % 1000:
                log.info('%s processing row %s--%s' % (datetime.now(), accepted, ':'.join(params)))
            accepted += 1
            update_params.append(params)
    stmt = 'update metadata_data set GG_dataset_id = %s, GG_readgroupset_id = %s where analysis_id = %s'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    ISBCGC_database_helper.update(config, stmt, log, update_params, False)
    log.info('finished update gg metadata')
def process_biospecimen_image_records(config, program, image_config, image_type, rows, log): ''' based on either the sample_barcode (for tissue or diagnostic images), creates a new metadata clinical record if necessary in the config-specified metadata clinical table parameters: config: configuration mappings image_config: section of the config file with specific mappings for these image types rows: rows selected from the BigQuery table log: where to write progress and other messages endpoint_type: 'legacy' sample_gdc_id: ? sample_barcode: row[0] sample_type: row[0][13:15] case_gdc_id: ? case_barcode: row[2] program_name: 'TCGA' disease_code: project_name.split('-')[1] project_short_name: row[4].split('/')[6] ''' # get the information from the config mapping log.info('\tchecking biospecimen records') barcode2row = dict([(row[image_config[image_type]['sample_barcode_index']], row) for row in rows]) log.info('\tbacodes--{}:{}'.format(len(set(barcode2row)), len(barcode2row))) biospecimen_select_template = image_config['biospecimen_select_template'] biospecimen_rows = ISBCGC_database_helper.select( config, biospecimen_select_template % ("'" + "','".join(barcode2row) + "'"), log, []) barcode_db = set( [biospecimen_row[0] for biospecimen_row in biospecimen_rows]) new_barcodes = set(barcode2row) - barcode_db if 0 < len(new_barcodes): log.info('\t\tinserting {} new biospecimen records'.format( len(new_barcodes))) rows = [] for barcode in new_barcodes: row = barcode2row[barcode] project_short_name = row[4].split('/')[6] rows += [[ 'legacy', row[0], row[0][13:15], row[2], 'TCGA', project_short_name.split('-')[1], project_short_name ]] rows += [[ 'current', row[0], row[0][13:15], row[2], 'TCGA', project_short_name.split('-')[1], project_short_name ]] ISBCGC_database_helper.column_insert( config, rows, image_config['biospecimen_table'], image_config['biospecimen_columns'], log) else: log.info('\t\tno rows to insert for biospecimen records')
def main(config_file_name):
    """Update CCLE datafilenamekey paths in the production cloudsql table from
    the BigQuery GDCfileID_to_GCSurl mapping named in the query below."""
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)
        log.info('begin updating CCLE paths in production')
        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'
        query_results = query_bq_table(query, True, 'isb-cgc', log)
        # NOTE(review): only one page of up to 2000 rows is fetched here (no
        # paging loop as in the TCGA variant) -- confirm the CCLE result set
        # always fits in a single page
        _, rows, _ = fetch_paged_results(query_results, 2000, None, None, log)
        log.info('\tcreate map of filename to path')
        # strip the gs://bucket prefix (first three '/'-separated fields) and
        # key by the bare file name
        name2path = {}
        for row in rows:
            fields = row[1].split('/')
            name2path[fields[-1]] = '/'.join(fields[3:])
        log.info('\tfinished map of filename to path')

        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''
        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))

        # now setup and do the update of paths in cloud sql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params += [[name2path[row[0]], row[0]]]
            else:
                not_matched += [row[0]]
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))
        log.info('finished updating CCLE paths in production')
    except:
        if log:
            log.exception('failed to update ccle GCS filepaths')
    finally:
        if log:
            close_log(log)
def update_cloudsql_from_bigquery(config, postproc_config, project_name, cloudsql_table, log, data_type=None, endpt_type=None):
    """Page through a BigQuery post-processing query and apply each page of
    rows to cloudsql_table via the update statement built from the config.

    project_name/data_type/endpt_type select which variant of the configured
    query is formatted; the query's column order must match the update
    statement's parameter order, with the key column last.
    """
    # build 'update <table> set col = %s, ... where key = %s'; the joined
    # column assignments each end in ',' and the final [:-1] strips the last one
    update_stmt = 'update %s\nset \n\t%s\nwhere %s = %%s' % (
        cloudsql_table,
        '\n\t'.join(
            '%s = %%s,' % (postproc_config['postproc_columns'][key])
            for key in postproc_config['postproc_columns'].keys())[:-1],
        postproc_config['postproc_key_column'])
    if project_name:
        if data_type:
            # assumes that endpt_type is also supplied
            query_results = query_bq_table(
                postproc_config['postproc_query'] %
                (', '.join(postproc_config['postproc_columns'].keys()),
                 endpt_type, project_name, data_type), False,
                postproc_config['postproc_project'], log)
        else:
            query_results = query_bq_table(
                postproc_config['postproc_query'] % (', '.join(
                    postproc_config['postproc_columns'].keys()), project_name),
                False, postproc_config['postproc_project'], log)
    else:
        query_results = query_bq_table(
            postproc_config['postproc_query'] %
            (', '.join(postproc_config['postproc_columns'].keys())), False,
            postproc_config['postproc_project'], log)
    page_token = None
    log.info('\t\t\tupdate_stmt\n%s' % (update_stmt))
    update_count = 0
    while True:
        total_rows, rows, page_token = fetch_paged_results(
            query_results, postproc_config['postproc_fetch_count'],
            project_name, page_token, log)
        if 0 < total_rows:
            log.info('\t\t\ttotal rows: %s\n\t%s\n\t\t...\n\t%s' %
                     (total_rows, str(rows[0]), str(rows[-1])))
        else:
            log.info('\t\t\tno rows')
            return
        # the update_cloudsql flag lets a dry run skip the actual writes
        if config['update_cloudsql']:
            ISBCGC_database_helper.update(config, update_stmt, log, rows, True)
        update_count += len(rows)
        log.info(
            '\t\t\tupdated %s so far%s' %
            (update_count, ' for ' + project_name if project_name else ''))
        if not page_token:
            # no more pages
            log.info(
                '\t\t\tupdated total of %s rows%s' %
                (update_count, ' for ' + project_name if project_name else ''))
            return
def set_uploaded_path(config, endpt_type, program_name, project_id, data_type, log):
    """Refresh file_name_key values for one program/project/data-type combination.

    Clears the existing keys, repopulates them from the BigQuery table, then
    runs the configured 'file uploaded' update against the same table.
    """
    log.info('\tbegin set_uploaded_path()')
    postproc_config = config['postprocess_keypath']
    genome_build = config['endpt2genomebuild'][endpt_type]
    cloudsql_table = '%s_metadata_data_%s' % (program_name, genome_build)
    # step 1: null out any stale file_name_key values
    null_stmt = postproc_config['postproc_file_name_key_null_update'] % (cloudsql_table, project_id, data_type)
    ISBCGC_database_helper.update(config, null_stmt, log, [[]])
    # step 2: now use the BigQuery table to set file_name_key
    update_cloudsql_from_bigquery(config, postproc_config, project_id, cloudsql_table, log, data_type, endpt_type)
    # step 3: apply the configured uploaded-flag update
    uploaded_stmt = postproc_config['postproc_file_uploaded_update'] % (cloudsql_table)
    ISBCGC_database_helper.update(config, uploaded_stmt, log, [[]])
    log.info('\tfinished set_uploaded_path()')
def populate_sample_availibility(config, endpt_type, program_name, project_id, data_type, infos, log):
    """Insert per-sample file counts for one data type into the program's
    metadata_sample_data_availability table.

    parameters:
        config: configuration mappings
        endpt_type: key into config['endpt2genomebuild'] for the genomic build
        program_name: program whose tables are written
        project_id: used only for logging
        data_type: GDC data type; mapped to an isb_label via the config
        infos: GDC file info maps, flattened via flatten_map()
        log: where to write progress and other messages
    """
    log.info('\tbegin populate_sample_availibility() for %s:%s' % (project_id, data_type))

    # iterate through the gdc info and put together the counts for the sample barcodes
    # (the mapping is invariant, so look it up once outside the loop)
    mapping = config[program_name]['process_files']['data_table_mapping']
    sample_barcode2count = {}
    for info in infos:
        flattened = flatten_map(info, mapping)
        for index in range(len(flattened)):
            # skip access/data-type combinations that aren't tracked.
            # bug fix: ('Aligned reads') was a plain string, making the `in`
            # a substring test; it is now a one-element tuple membership test.
            # NOTE(review): 'controlled ' has a trailing space -- confirm it
            # matches the access values produced upstream
            if (data_type in ('Simple somatic mutation', 'Masked Somatic Mutation') and 'controlled ' == flattened[index]['access']) or \
               (data_type in ('Aligned reads',) and 'open' == flattened[index]['access']):
                continue
            sample_barcode = flattened[index]['sample_barcode']
            sample_barcode2count[sample_barcode] = sample_barcode2count.setdefault(sample_barcode, 0) + 1

    # read in the appropriate data availability row to get the foreign key
    isb_label = config['data_type2isb_label'][data_type]
    stmt = 'select metadata_data_type_availability_id from %s_metadata_data_type_availability where genomic_build = %%s and isb_label = %%s' % (
        program_name)
    foreign_key = ISBCGC_database_helper.select(
        config, stmt, log,
        [config['endpt2genomebuild'][endpt_type], isb_label])[0][0]

    params = []
    for sample_barcode, count in sample_barcode2count.iteritems():
        params += [[foreign_key, sample_barcode, count]]

    ISBCGC_database_helper.column_insert(
        config, params,
        '%s_metadata_sample_data_availability' % (program_name),
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)
    log.info('\tfinished populate_sample_availibility() for %s:%s' % (project_id, data_type))
def process_data_availability_records(config, program, image_config, image_type, rows, log):
    '''
    NOTE: this assumes at the start of the run, that no sample barcode is associated with
    and Image ISB label.  it is possible that a sample_barcode is added and then a
    subsequent batch will contain that barcode associated with more files

    inserts availability counts for barcodes not yet recorded and adds this
    batch's counts to barcodes recorded by a previous batch.  Radiology images
    are keyed by case, not sample, so they are skipped.
    '''
    if 'Radiology' == image_type:
        return

    # look up the (single) availability id for pathology images
    stmt = 'select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where %s = isb_label and %s = genomic_build'
    image_rows = ISBCGC_database_helper.select(config, stmt, log, ["Pathology_Image", "HG19"])
    if 1 != len(image_rows):
        raise ValueError(
            'unexpected number of rows returned for availability id {}'.format(
                image_rows))
    data_type_availability_id = image_rows[0][0]

    # get the current count for a barcode based on previous runs
    stmt = 'select sample_barcode, count(*) from {}_metadata_sample_data_availability where metadata_data_type_availability_id = %s group by 1'.format(
        program)
    db_rows = ISBCGC_database_helper.select(config, stmt, log, [
        data_type_availability_id,
    ])
    db_barcode2count = {}
    for db_row in db_rows:
        db_barcode2count[db_row[0]] = db_row[1]
    # bug fix: the original compared len(db_rows) against len(set(db_rows)) --
    # sets of (barcode, count) tuples -- so duplicate barcodes with differing
    # counts slipped through.  compare distinct barcodes instead.
    if len(db_rows) != len(db_barcode2count):
        raise ValueError(
            'sample_barcode should only appear once per isb_label')

    # get the current count from the current batch for each sample barcode
    barcode2count = {}
    for row in rows:
        barcode2count[row[0]] = barcode2count.setdefault(row[0], 0) + 1

    # bug fix: the set algebra below originally used set(db_rows), whose members
    # are (barcode, count) tuples, so every barcode looked 'new' and no existing
    # record was ever updated.  use the barcode keys instead.
    new_barcodes = set(barcode2count) - set(db_barcode2count)
    new_params = []
    for new_barcode in new_barcodes:
        new_params += [[
            data_type_availability_id, new_barcode, barcode2count[new_barcode]
        ]]
    ISBCGC_database_helper.column_insert(
        config, new_params, 'TCGA_metadata_sample_data_availability',
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    # for barcodes seen in a previous batch, add this batch's count to the stored count
    update_barcodes = set(barcode2count) & set(db_barcode2count)
    update_params = []
    for update_barcode in update_barcodes:
        update_params += [[
            barcode2count[update_barcode] + db_barcode2count[update_barcode],
            data_type_availability_id, update_barcode
        ]]
    stmt = 'update TCGA_metadata_sample_data_availability set count = %s where metadata_data_type_availability_id = %s and sample_barcode = %s'
    ISBCGC_database_helper.update(config, stmt, log, update_params, False)
def get_gcs_data_types_barcodes(self, program_name, data_types, log):
    """For each data type, select project/case/sample/file ids from the
    build-specific metadata data table and return the accumulated
    (project->cases, project->samples, project->files) mappings."""
    log.info('\t\tgetting gcs data types barcodes {}-{} for gcs'.format(
        program_name, '"{}"'.format(', '.join(data_types))))
    try:
        stmt = 'select project_short_name, case_barcode, sample_barcode, file_gdc_id from {} where data_type in ({})'
        project2cases = {}
        project2samples = {}
        project2files = {}
        for data_type in data_types:
            # these data types live in the HG38 table; everything else is HG19
            build = 'HG38' if data_type in ('Aligned Reads', \
                'miRNA Expression Quantification', \
                'Isoform Expression Quantification', \
                'Gene Expression Quantification', \
                'Masked Copy Number Segment', \
                'Methylation Beta Value', \
                'Masked Somatic Mutation') \
                else 'HG19'
            rows = ISBCGC_database_helper.select(
                self.config,
                stmt.format(
                    '{}_metadata_data_{}'.format(program_name, build),
                    '"{}"'.format(data_type)), log, [])
            # bucket each row's barcodes/file id by project short name
            for row in rows:
                cases = project2cases.setdefault(row[0], set())
                cases.add(row[1])
                samples = project2samples.setdefault(row[0], set())
                samples.add(row[2])
                files = project2files.setdefault(row[0], set())
                files.add(row[3])
            log.info(
                '\t\tget_gcs_data_types_barcodes(): {}-{} cumulative counts, cases={}, samples={}'
                .format(
                    program_name, data_type,
                    sum(
                        len(cases) for cases in project2cases.itervalues()),
                    sum(
                        len(samples) for samples in project2samples.itervalues())))
        log.info(
            '\t\tfinished gcs data types barcodes {}-{} for gcs'.format(
                program_name, '"{}"'.format(', '.join(data_types))))
        return project2cases, project2samples, project2files
    except:
        log.exception('problem in get_gcs_data_types_barcodes()')
        raise
def get_sql_file_info(self, program, barcode_type, barcodes, tag):
    """Collect per-barcode file metadata from the program's HG19 (and, except
    for CCLE, HG38) metadata data tables.

    barcode_type is the barcode column to match (e.g. case or sample barcode);
    barcodes are quoted directly into the IN clause.  Returns a map of
    barcode -> list of [data_type, data_format, experimental_strategy,
    platform, workflow] entries.  NOTE(review): `tag` is unused here --
    confirm whether callers still need it.
    """
    barcode2infos = {}
    query_template = 'select {0}, data_type, data_format, experimental_strategy, platform, {1}\n' \
        'from {2}_metadata_data_HG{3}\n' \
        'where {0} in ({4})\n'
    barcodes_in = ','.join('"{}"'.format(barcode) for barcode in set(barcodes))
    query = ''
    for build in ('19', '38'):
        if '38' == build:
            # CCLE has no HG38 table; workflow type only exists for HG38
            if 'CCLE' == program:
                break
            else:
                query += 'union\n'
            column = 'analysis_workflow_type'
        else:
            column = '"None"'
        query += query_template.format(barcode_type, column, program, build, barcodes_in)
    rows = ISBCGC_database_helper.select(self.config, query, self.log, [])
    for row in rows:
        barcode = row[0]
        if barcode in barcodes:
            barcode2infos[barcode] = barcode2infos.setdefault(
                barcode, []) + [[
                    row[1], row[2], str(row[3]), str(row[4]), str(row[5])
                ]]
        else:
            # bug fix: the format string had one placeholder for two arguments,
            # so the offending row was silently dropped from the message
            raise ValueError(
                'unexpected mismatch of return with barcodes:\n{}\n{}\n'.
                format(', '.join(barcodes), ', '.join(str(field) for field in row)))
    return barcode2infos
def get_gcs_isb_label_barcodes(self, program_name, isb_label, log):
    """Return (project->case barcodes, project->sample barcodes) for every
    sample whose data availability record carries the given isb_label.

    Joins the program's sample_data_availability, data_type_availability and
    biospecimen tables; raises after logging on any select failure.
    """
    log.info('\t\tgetting isb_label barcodes {}-{} for gcs'.format(
        program_name, isb_label))
    try:
        project2cases = {}
        project2samples = {}
        stmt = 'select bs.project_short_name, bs.case_barcode, sa.sample_barcode ' \
            'from {0}_metadata_sample_data_availability sa join {0}_metadata_data_type_availability da on sa.metadata_data_type_availability_id = da.metadata_data_type_availability_id ' \
            'join {0}_metadata_biospecimen bs on sa.sample_barcode = bs.sample_barcode ' \
            'where isb_label = %s group by 1, 2, 3'.format(program_name)
        rows = ISBCGC_database_helper().select(self.config, stmt, log, [isb_label])
        # bucket case and sample barcodes by project short name
        for row in rows:
            cases = project2cases.setdefault(row[0], set())
            cases.add(row[1])
            samples = project2samples.setdefault(row[0], set())
            samples.add(row[2])
        # bug fix: these messages said 'bq' although this is the gcs variant
        log.info(
            '\t\tget_gcs_isb_label_barcodes(): {}-{} cumulative counts, cases={}, samples={}'
            .format(
                program_name, isb_label,
                sum(len(cases) for cases in project2cases.itervalues()),
                sum(
                    len(samples) for samples in project2samples.itervalues())))
        log.info('\t\tfinished get_gcs_isb_label_barcodes() {}-{} for gcs'.
                 format(program_name, isb_label))
        return project2cases, project2samples
    except:
        log.exception('problem in get_gcs_isb_label_barcodes()')
        raise
    # bug fix: removed the trailing log.info() that followed the raise and
    # could never execute
def verify_barcodes_filenames(config, program, image_config, image_type, rows, log):
    """Compare (barcode, file name) pairs from BigQuery against the cloudsql
    verify select and log the sizes plus a sample of each one-sided set."""
    if 'Radiology' == image_type:
        # radiology rows carry a path in row[1]; only its last component is the file name
        bq_pairs = set((bq_row[0], bq_row[1].split('/')[-1]) for bq_row in rows)
    else:
        bq_pairs = set((bq_row[0], bq_row[1]) for bq_row in rows)
    verify_stmt = image_config[image_type]['data_verify_select_template']
    db_rows = ISBCGC_database_helper.select(config, verify_stmt, log, [])
    db_pairs = set((db_row[0], db_row[1]) for db_row in db_rows)
    bq_only = bq_pairs - db_pairs
    db_only = db_pairs - bq_pairs
    log.info(
        '\n\tBQ length:{}\tSQL length:{}\n\tbarcode/file combinations in BQ and not SQL:{}\n\tbarcode/file combinations in SQL and not BQ:{}'
        .format(len(bq_pairs), len(db_pairs), len(bq_only), len(db_only)))
    if bq_only:
        log.info('\n\tfirst barcodes in bq only: {}'.format(list(bq_only)[:20]))
    if db_only:
        log.info('first barcodes in sql only: {}'.format(list(db_only)[:20]))
def test_populate_sample_availibility(self):
    """Rebuild the TCGA Somatic_Mutation/HG38 sample availability records from
    the BigQuery somatic mutation table, then call
    populate_sample_availibility() with infos shaped like GDC file maps."""
    bqTable2data_type = {'Somatic_Mutation': 'Masked Somatic Mutation'}
    for bqTable, data_type in bqTable2data_type.iteritems():
        self.log.info('populate_sample_availibility() for %s' % (data_type))
        #remove existing records
        stmt = 'delete from TCGA_metadata_sample_data_availability where metadata_data_type_availability_id = ' \
            '(select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where isb_label = "Somatic_Mutation" and genomic_build = "HG38")'
        ISBCGC_database_helper.update(self.config, stmt, self.log, [[]])

        query_results = query_bq_table(
            'select Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode, fileUUID from [isb-cgc:TCGA_hg38_data_v0.{}] group by 1,2,3'
            .format(bqTable), True, None, self.log)
        page_token = None
        barcode2seen_files = {}
        barcode2infos = {}
        infos = []
        while True:
            # loop through the big query results and get the sample_barcode
            # into the info list as populate_sample_availibility() expects it
            total_rows, rows, page_token = fetch_paged_results(
                query_results, 200000, None, page_token, self.log)
            for row in rows:
                tumor = row[0][:16]  # sample portion of the aliquot barcode
                # normal = row[1][:16]
                files = row[2].split('|')
                for curfile in files:
                    if tumor in barcode2seen_files:
                        seen_files = barcode2seen_files[tumor]
                        # NOTE(review): this checks the whole pipe-joined
                        # row[2] against a set that stores individual file
                        # ids -- confirm `curfile in seen_files` wasn't intended
                        if row[2] in seen_files:
                            continue
                        seen_files.add(curfile)
                    else:
                        barcode2seen_files[tumor] = set([curfile])
                    # build the nested cases/samples shape flatten_map() expects
                    samples_tumor = {'submitter_id': tumor}
                    sample_list = [samples_tumor]

                    info = {'access': 'open'}
                    case_list = info.setdefault('cases', [])
                    case_list += [{'samples': sample_list}]
                    barcode2infos[tumor] = barcode2infos.setdefault(
                        tumor, []) + [info]
#
#                     samples_normal = {'submitter_id': normal}
#                     sample_list = [samples_normal]
#
#                     info = {'access': 'open'}
#                     case_list = info.setdefault('cases', [])
#                     case_list += [{'samples': sample_list}]
#                     barcode2infos[normal] = barcode2infos.setdefault(normal, []) + [info]
            # NOTE(review): barcode2infos accumulates across pages, and this
            # re-adds its entire contents every page, so with more than one
            # page infos holds duplicates -- confirm whether this should be
            # outside the while loop
            infos += [
                info for curinfos in barcode2infos.itervalues()
                for info in curinfos
            ]  # create inserts into the metadata data that for big query rows
               # that didn't have a match already in the metadata data table
            if not page_token:
                self.log.info('\tprocessed total of %s rows for %s' %
                              (total_rows, bqTable))
                break

        populate_sample_availibility(self.config, 'current', 'TCGA', 'all',
                                     data_type, infos, self.log)

        self.log.info('finished populate_sample_availibility() for %s' %
                      (data_type))
def update_database(path2bam2bai, config, log):
    """Reconcile bam/bai index-file names between the bucket tsv map
    (path2bam2bai: gcs path -> {bam name -> bai name}) and the db's metadata
    data tables, logging mismatches and missing files in both directions.
    The actual db update is commented out; only the would-be parameter count
    is logged."""
    log.info('\tbegin update database to match bucket index files')
    table2dbpath2bam2bai = {}
    for table in ['CCLE_metadata_data_HG19', 'TARGET_metadata_data_HG19', 'TARGET_metadata_data_HG38', 'TCGA_metadata_data_HG19', 'TCGA_metadata_data_HG38']:
        program_filenames = call_bigquery(table, log)
        dbpath2bam2bai = table2dbpath2bam2bai.setdefault(table, {})
        rows = ISBCGC_database_helper.select(config, 'select file_name_key, index_file_name from {} where "BAM" = data_format order by 1'.
            format(table), log, [])
        log.info('\t\tfound {} rows for {}: {}...{}'.format(len(rows), table, rows[:5], rows[-5:]))
        count = 0
        # log roughly 11 progress lines per table
        # NOTE(review): print_at is 0 when len(rows) < 11, which would make the
        # modulo below raise ZeroDivisionError -- confirm tables are never that small
        print_at = len(rows) / 11
        missing_files = set()
        bucket2missing_files = {}
        mismatchs = 0
        bucket2mismatchs = {}
        params = []
        for row in rows:
            if not row[0]:
                continue
            if 0 == count % print_at:
                log.info('\t\t\tlooking at row {} for {}: {}'.format(count, table, row))
            count += 1
            # split the file_name_key into its directory path and bam file name;
            # parts[2] is the bucket name in a gs://bucket/... key
            parts = row[0].split('/')
            path = '/'.join(parts[:-1])
            bam = parts[-1]
            bai = row[1]
            bam2bai = path2bam2bai.get(path)
            if bam2bai:
                assert bam in bam2bai, 'didn\'t find bam {} in tsv files'.format(bam)
                if bai != bam2bai[bam]:
                    # tsv and db disagree on the index file for this bam
                    mismatchs += 1
                    curcount = bucket2mismatchs.setdefault(parts[2], 0)
                    bucket2mismatchs[parts[2]] = curcount + 1
                    params += [[bam2bai[bam], row[0]]]
            else:
                # db path not present in the tsv map at all
                missing_files.add(row[0])
                curcount = bucket2missing_files.setdefault(parts[2], 0)
                bucket2missing_files[parts[2]] = curcount + 1
            # also build the db-side path -> {bam -> bai} map for the second pass
            dbbam2bai = dbpath2bam2bai.setdefault(path, {})
            if bam in dbbam2bai:
                log.warning('\t\t\tfound {} already in the db map for {}'.format(bam, table))
            assert bai, 'didn\'t find a index file for {} in the db'.format(bam)
            dbbam2bai[bam] = bai
        if 0 < len(params):
            # the actual update is intentionally disabled; only report the count
            # ISBCGC_database_helper.update(config, 'update {} set index_file_name = %s where file_name_key = %s'.format(table), log, params)
            log.info('there were {} param sets for update'.format(len(params)))
        else:
            log.info('there were no param sets for update')
        if 0 < len(missing_files):
            missing_files = sorted(missing_files)
            log.warning('\n\t\t\tfound {} missing files in tsv that are in db out of {} total: \n\t{}\n'.format(len(missing_files), len(rows), '\n\t'.join(missing_files)))
            for bucket, bmissing_files in bucket2missing_files.iteritems():
                if 0 < bmissing_files:
                    log.warning('\n\t\t\tfound {} missing files in tsv that is in bucket {}\n'.format(bmissing_files, bucket))
        if 0 < mismatchs:
            log.warning('\n\t\t\tfound {} index between tsv and db mismatches out of {} total\n'.format(mismatchs, len(rows)))
            for bucket, bmismatchs in bucket2mismatchs.iteritems():
                if 0 < bmismatchs:
                    log.warning('\n\t\t\tfound {} mismatches between tsv and db files in bucket {}\n'.format(bmismatchs, bucket))
        log.info('\t\tfinished table {}\n\n'.format(table))
    log.info('\t\tfind discrepencies in the db from tsv')
    # second pass: look for tsv entries the db doesn't have
    for table, dbpath2bam2bai in table2dbpath2bam2bai.iteritems():
        bucket2bambai_diff = {}
        missing_paths = set()
        path2missing_dbfiles = {}
        for path, bam2bai in path2bam2bai.iteritems():
            if path not in dbpath2bam2bai:
                # NOTE(review): program_filenames here is left over from the
                # LAST table of the first loop, not looked up per table --
                # confirm this cross-table filter is intended
                not_curpath = True
                for bam in bam2bai:
                    if bam in program_filenames:
                        not_curpath = False
                        break
                if not_curpath:
                    continue
                missing_paths.add(path)
                continue
            dbbam2bai = dbpath2bam2bai[path]
            for bam, bai in bam2bai.iteritems():
                if bam not in program_filenames:
                    continue
                if bam not in dbbam2bai:
                    path2missing_dbfiles.setdefault(path, set()).add(bam)
                    continue
                dbbai = dbbam2bai[bam]
                if dbbai != bai:
                    # tolerate the known 'bam.bai' (db) vs 'bai' (tsv) naming difference
                    if dbbai.endswith('bam.bai') and bai.endswith('bai'):
                        bambai_diff = bucket2bambai_diff.setdefault(path.split('/')[2], 0)
                        bucket2bambai_diff[path.split('/')[2]] = bambai_diff + 1
                    else:
                        log.warning('\t\t\tdidn\'t find matching bai in db for table {}: {} vs {}'.format(table, bai, dbbai))
        missing_paths = sorted(list(missing_paths))
        log.warning('\t\t\tfound {} paths missing from db for table {}: {}...{}\n'.format(len(missing_paths), table, missing_paths[:4], missing_paths[-3:]))
        for path, bams in path2missing_dbfiles.iteritems():
            bams = sorted(list(bams))
            log.warning('\t\t\tfound {} files missing from db for path {} for table {}: {}...{}'.format(len(bams), path, table, bams[:4], bams[-3:]))
        log.info('\n\t\t\ttotal of {} files missing from the db that are in the tsv for table {}\n'.format(sum(len(dbfiles) for dbfiles in path2missing_dbfiles.itervalues()), table))
        for bucket, bambai_diff in bucket2bambai_diff.iteritems():
            if bambai_diff > 0:
                log.warning('\t\t\tfound {} cases of \'bai\' in tsv and \'bam.bai\' in db for {} for table {}'.format(bambai_diff, bucket, table))
    log.info('\n\t\tfinished discrepancies in the db from tsv')
    log.info('\tfinished update database to match bucket index files')
# NOTE(review): chunk artifact -- this return belongs to a method whose
# definition starts before this excerpt
return set(barcode2info.keys()), samples

def get_sql_barcodes(self, tables, case='case_barcode', sample='sample_barcode'):
    """Select case and sample barcodes from each of the given cloudsql tables.

    tables maps a table name to an info sequence: info[2] falsy means the
    table has no sample column, info[1] falsy means no case column; the
    select substitutes "" for the missing column.  Returns
    (table -> case set, table -> sample set); a result with at most one
    distinct value collapses to the empty set.
    """
    table2cases = {}
    table2samples = {}
    for table, info in tables.iteritems():
        if not info[2]:
            # no sample column for this table; select an empty placeholder
            sql = 'select {}, "" from {}'.format(case, table)
        elif not info[1]:
            # no case column for this table
            sql = 'select "", {} from {}'.format(sample, table)
        else:
            sql = 'select {}, {} from {}'.format(case, sample, table)
        rows = ISBCGC_database_helper().select(self.config, sql, self.log, [])
        cases = set()
        samples = set()
        for row in rows:
            cases.add(row[0])
            samples.add(row[1])
        # collapse to the empty set when only the "" placeholder (or a
        # single value) came back
        table2cases[table] = cases if 1 < len(cases) else set()
        table2samples[table] = samples if 1 < len(samples) else set()
    return table2cases, table2samples

def get_bq_barcodes(self, bq_tables, case='case_barcode', sample='sample_barcode', where=None):
    # NOTE(review): chunk artifact -- the body of this method continues
    # beyond this excerpt
def main(config_file_name):
    """Update TCGA datafilenamekey paths (and the uploaded flag) in the
    production cloudsql metadata_data table from the BigQuery
    GDCfileID_to_GCSurl mapping, paging through the BQ results."""
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + 'tcga/'
        log_name = create_log(log_dir, 'update_tcga_gcs_paths')
        log = logging.getLogger(log_name)
        log.info('begin updating TCGA paths in production')

        # get the db rows from production cloudsql
        log.info('\tselect tcga filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'TCGA\''
        cloudsql_rows = set(row[0] for row in helper.select(config, query, log, []))
        log.info('\tselected %s tcga filenames from cloudsql' % (len(cloudsql_rows)))

        # read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'TCGA\') and 0 < instr(file_gcs_url, \'legacy\') and 0 < instr(file_gcs_url, \'bam\') ' \
            'order by file_gcs_url'
        query_results = query_bq_table(query, True, 'isb-cgc', log)
        total_not_matched = 0
        total_distinct = set()
        page_token = None
        while True:
            total_rows, rows, page_token = fetch_paged_results(
                query_results, 2000, None, page_token, log)
            log.info('\t\tcreate map of filename to path')
            # strip the gs://bucket prefix (first three '/'-separated fields)
            # and key by the bare file name
            name2path = {}
            for row in rows:
                fields = row[1].split('/')
                name2path[fields[-1]] = '/'.join(fields[3:])
            log.info('\t\tfinished map of filename to path')

            # now setup and do the update of paths in cloud sql
            log.info('\t\tstart updating paths in cloudsql')
            params = []
            select_params = []
            not_matched = []
            for name, path in name2path.iteritems():
                if name in cloudsql_rows:
                    total_distinct.add(name)
                    params += [[path, name]]
                    select_params += [name]
                else:
                    not_matched += [path]
            update = 'update metadata_data set datafilenamekey = %s, datafileuploaded = \'true\' where datafilename = %s'
            helper.update(config, update, log, params)
            # sanity check: count how many of this page's names exist in the table
            # NOTE(review): with zero matches this builds 'in ()' -- confirm
            # an empty page of matches can't occur
            select_in = '%s,' * len(select_params)
            select_in = select_in[:-1]
            select_query = 'select count(*) from metadata_data where datafilename in (%s)' % (
                select_in)
            count = helper.select(config, select_query, log, select_params)
            log.info('select %s file name matches for %s file names.' % (count[0][0], len(select_params)))
            total_not_matched += len(not_matched)
            if not page_token:
                log.info(
                    '\t\tupdated total of %s rows for TCGA with %d distinct file names'
                    % (total_rows - total_not_matched, len(total_distinct)))
                break
            else:
                log.info(
                    '\t\tupdated %d rows, did not find matches from BQ in cloudsql for %d:\n\t%s'
                    % (len(params), len(not_matched), '\n\t'.join(not_matched)))

        log.info('\tcompleted update of paths in cloudsql')
        log.info('finished updating TCGA paths in production')
    except:
        if log:
            log.exception('failed to update tcga GCS filepaths')
    finally:
        if log:
            close_log(log)
def process_clinical_image_records(config, program, image_config, image_type, rows, log):
    ''' based on either the case_barcode (for radiology images) or the sample_barcode
    (for tissue or diagnostic images), creates a new metadata data record in the
    config-specified metadata data table

    parameters:
        config: configuration mappings
        image_config: section of the config file with specific mappings for these image types
        not_barcodes: an output parameter for barcodes that weren't found in the underlying clinical or
        biospecimen table
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

        endpoint_type: 'legacy'
        case_gdc_id: ?
        case_barcode: row[2]
        program_name: 'TCGA'
        disease_code: project_name.split('-')[1]
        project_short_name: row[4].split('/')[6]
    '''
    # get the information from the config mapping
    log.info('\tchecking clinical records.')
    # map each case barcode (column index comes from the config) to its BQ row;
    # a later row with the same barcode silently wins
    barcode2row = dict([(row[image_config[image_type]['case_barcode_index']],
                         row) for row in rows])
    log.info('\tbacodes--{}:{}'.format(len(set(barcode2row)), len(barcode2row)))
    # find which of the batch's barcodes already have a clinical record
    clinical_select_template = image_config['clinical_select_template']
    clinical_rows = ISBCGC_database_helper.select(
        config, clinical_select_template % ("'" + "','".join(barcode2row) + "'"),
        log, [])
    barcode_db = set([clinical_row[0] for clinical_row in clinical_rows])

    new_barcodes = set(barcode2row) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new clinical records'.format(
            len(new_barcodes)))
        rows = []
        for barcode in new_barcodes:
            row = barcode2row[barcode]
            # radiology rows carry the case barcode in row[0] and a path in
            # row[1]; tissue/diagnostic rows carry them in row[2]/row[4]
            if 'Radiology' == image_type:
                case_barcode = row[0]
                project_short_name = row[1].split('/')[5]
            else:
                case_barcode = row[2]
                project_short_name = row[4].split('/')[6]
            if 'NA' == project_short_name:
                # can't derive a disease code from 'NA'; skip the barcode
                continue
            # one record per endpoint type
            rows += [[
                'legacy', case_barcode, program,
                project_short_name.split('-')[1], project_short_name
            ]]
            rows += [[
                'current', case_barcode, program,
                project_short_name.split('-')[1], project_short_name
            ]]
        ISBCGC_database_helper.column_insert(config, rows,
                                             image_config['clinical_table'],
                                             image_config['clinical_columns'],
                                             log)
    else:
        log.info('\t\tno rows to insert for clinical records')
def process_data_image_records(config, program, image_config, image_type, rows, log):
    ''' based on either the case_barcode (for radiology images) or the sample_barcode
    (for tissue or diagnostic images), either updates or creates a new metadata data
    record in the config-specified metadata data table

    parameters:
        config: configuration mappings
        image_config: section of the config file with specific mappings for these image types
        not_barcodes: an output parameter for barcodes that weren't found in the underlying clinical or
        biospecimen table
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

        file_gdc_id: ?
        case_gdc_id: ?
        case_barcode: row[2]
        sample_gdc_id: ?
        sample_barcode: row[0]
        sample_type: sample_barcode[13:15]
        aliquot_barcode: row[3]
        aliquot_gdc_id: ?
        project_short_name: row[4].split('/')[6]
        disease_code: project_short_name.split('-')[1]
        program_name: 'TCGA'
        data_type: image_type
        data_category: 'Clinical'
        type: 'file'
        file_name: row[1]
        data_format: 'SVS'
        access: 'open'
        acl: 'open'
        platform: 'Clinical'
        file_name_key: row[4]
        file_uploaded: 'true'
        endpoint_type: 'legacy'
        species: 'H**o sapien'
    '''
    # get the information from the config mapping
    log.info('\tchecking data records')
    # group the BQ rows by (barcode, file name); radiology rows carry a full
    # gcs path in row[1], so only its final component is used as the file name
    barcode2rows = {}
    for row in rows:
        if 'Radiology' == image_type:
            bcrows = barcode2rows.setdefault((row[0], row[1].split('/')[-1]), [])
            bcrows += [row]
        else:
            bcrows = barcode2rows.setdefault((row[0], row[1]), [])
            bcrows += [row]
    data_select_template = image_config[image_type]['data_select_template']
    if 0 == len(data_select_template):
        # no verify select configured for this image type
        barcode_db = set()
    else:
        # bug fix/cleanup: removed the unused 'barcodes' string that was built
        # here and then discarded; the select below builds its own IN list
        data_rows = ISBCGC_database_helper.select(
            config, data_select_template %
            (','.join('("{}", "{}")'.format(barcode, file_name)
                      for barcode, file_name in barcode2rows)), log, [])
        barcode_db = set([(data_row[0], data_row[1]) for data_row in data_rows])

    new_barcodes = set(barcode2rows) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new data records'.format(len(new_barcodes)))
        db_rows = []
        # the row-building function name is invariant; look it up once
        row_method = image_config['image_tag2row_method'][image_type]
        for barcode in new_barcodes:
            for row in barcode2rows[barcode]:
                # dispatch to the module-level builder configured for this image type
                next_row = getattr(sys.modules[__name__], row_method)(row, program, image_config, image_type)
                if next_row is not None:
                    db_rows += next_row
        ISBCGC_database_helper.column_insert(
            config, db_rows, image_config['data_table'],
            image_config[image_type]['data_columns'], log)
    else:
        log.info('\t\tno rows to insert for data records')

    if 0 < len(barcode_db):
        log.info('\t\tupdating {} existing data records'.format(
            len(barcode_db)))
        rows = []
        for barcode in barcode_db:
            for row in barcode2rows[barcode]:
                if 'Radiology' == image_type:
                    rows += [[
                        row[1], row[0], row[1].split('/')[5],
                        image_config['image_tag2data_type'][image_type]
                    ]]
                else:
                    rows += [[
                        row[4], row[0], row[1],
                        image_config['image_tag2data_type'][image_type]
                    ]]
        ISBCGC_database_helper.update(
            config, image_config[image_type]['data_update_template'], log,
            rows)
    else:
        log.info('\t\tno rows to update for data records')