# Module-level imports that these excerpted functions assume. The exact
# module paths for the ISB-CGC helpers are assumptions; adjust them to the
# actual repository layout.
import json
import logging
import sys
from datetime import date, datetime

from isbcgc_cloudsql_model import ISBCGC_database_helper
from gdc.util.gdc_util import populate_sample_availibility
from util import create_log, close_log, query_bq_table, fetch_paged_results

helper = ISBCGC_database_helper  # alias used by the CCLE/TCGA path scripts below


def main(configfilename, baminfo_filename):
    log_dir = str(date.today()).replace('-', '_') + '_gg_update' + '/'
    log_name = create_log(log_dir, 'update_gg_metadata')
    log = logging.getLogger(log_name)
    log.info('begin update gg metadata')

    # column offsets in the tab-separated BAM info file
    idcol = 0
    ggdatasetcol = 7
    ggreadgroupset = 6
    nexist = 'NA'
    updates = []
    count = 0
    with open(baminfo_filename) as baminfo:
        # skip the header row
        baminfo.readline()
        for line in baminfo:
            fields = line.strip().split('\t')
            # rows without a readgroupset have nothing to update
            if fields[ggreadgroupset] == nexist:
                continue
            values = [fields[ggdatasetcol], fields[ggreadgroupset], fields[idcol]]
            if 0 == count % 1000:
                log.info('%s processing row %s--%s' % (datetime.now(), count, ':'.join(values)))
            count += 1
            updates += [values]

    stmt = 'update metadata_data set GG_dataset_id = %s, GG_readgroupset_id = %s where analysis_id = %s'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    ISBCGC_database_helper.update(config, stmt, log, updates, False)

    log.info('finished update gg metadata')
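# A minimal sketch of the row handling main() above assumes: the helper name
# and the sample lines are hypothetical, but the column layout (analysis id in
# column 0, readgroupset id in column 6 with 'NA' when absent, dataset id in
# column 7) matches the offsets used above.
def _parse_baminfo_line(line, idcol=0, ggreadgroupset=6, ggdatasetcol=7, nexist='NA'):
    """Return [dataset_id, readgroupset_id, analysis_id], or None to skip the row."""
    fields = line.strip().split('\t')
    if fields[ggreadgroupset] == nexist:
        return None
    return [fields[ggdatasetcol], fields[ggreadgroupset], fields[idcol]]

assert _parse_baminfo_line('a1\tx\tx\tx\tx\tx\tNA\tNA') is None
assert _parse_baminfo_line('a1\tx\tx\tx\tx\tx\trg1\tds1') == ['ds1', 'rg1', 'a1']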
def process_data_availability_records(config, program, image_config, image_type, rows, log):
    '''
    NOTE: this assumes, at the start of the run, that no sample barcode is
    associated with an Image ISB label. it is possible that a sample_barcode
    is added and then a subsequent batch will contain that barcode associated
    with more files
    '''
    if 'Radiology' == image_type:
        return

    stmt = 'select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where %s = isb_label and %s = genomic_build'
    image_rows = ISBCGC_database_helper.select(config, stmt, log, ["Pathology_Image", "HG19"])
    if 1 != len(image_rows):
        raise ValueError('unexpected number of rows returned for availability id {}'.format(image_rows))
    data_type_availability_id = image_rows[0][0]

    # get the current count for a barcode based on previous runs
    stmt = 'select sample_barcode, count(*) from {}_metadata_sample_data_availability where metadata_data_type_availability_id = %s group by 1'.format(program)
    db_rows = ISBCGC_database_helper.select(config, stmt, log, [data_type_availability_id])
    db_barcode2count = {}
    for db_row in db_rows:
        db_barcode2count[db_row[0]] = db_row[1]
    if len(db_rows) != len(set(db_rows)):
        raise ValueError('sample_barcode should only appear once per isb_label')

    # get the count from the current batch for each sample barcode
    barcode2count = {}
    for row in rows:
        barcode2count[row[0]] = barcode2count.setdefault(row[0], 0) + 1

    # barcodes not seen in previous runs get a new row (note: the comparisons
    # here must be against the barcode keys, not the (barcode, count) tuples
    # in db_rows)
    new_barcodes = set(barcode2count) - set(db_barcode2count)
    new_params = []
    for new_barcode in new_barcodes:
        new_params += [[data_type_availability_id, new_barcode, barcode2count[new_barcode]]]
    ISBCGC_database_helper.column_insert(
        config, new_params, '{}_metadata_sample_data_availability'.format(program),
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    # barcodes already present get the batch count added to their stored count
    update_barcodes = set(barcode2count) & set(db_barcode2count)
    update_params = []
    for update_barcode in update_barcodes:
        update_params += [[barcode2count[update_barcode] + db_barcode2count[update_barcode], data_type_availability_id, update_barcode]]
    stmt = 'update {}_metadata_sample_data_availability set count = %s where metadata_data_type_availability_id = %s and sample_barcode = %s'.format(program)
    ISBCGC_database_helper.update(config, stmt, log, update_params, False)
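# A toy illustration of the count-merge semantics above: barcodes new to the
# batch are inserted with their batch count, barcodes already stored get the
# batch count added to the stored count. The barcode values are hypothetical.
db_counts = {'TCGA-AA-1234-01A': 2}
batch_counts = {'TCGA-AA-1234-01A': 1, 'TCGA-BB-5678-01A': 3}
inserts = dict((b, c) for b, c in batch_counts.items() if b not in db_counts)
updates = dict((b, c + db_counts[b]) for b, c in batch_counts.items() if b in db_counts)
assert inserts == {'TCGA-BB-5678-01A': 3}
assert updates == {'TCGA-AA-1234-01A': 3}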
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)
        log.info('begin updating CCLE paths in production')

        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'
        query_results = query_bq_table(query, True, 'isb-cgc', log)
        _, rows, _ = fetch_paged_results(query_results, 2000, None, None, log)

        log.info('\tcreate map of filename to path')
        name2path = {}
        for row in rows:
            fields = row[1].split('/')
            name2path[fields[-1]] = '/'.join(fields[3:])
        log.info('\tfinished map of filename to path')

        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''
        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))

        # now set up and do the update of paths in cloudsql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params += [[name2path[row[0]], row[0]]]
            else:
                not_matched += [row[0]]
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))

        log.info('finished updating CCLE paths in production')
    except:
        if log:
            log.exception('failed to update ccle GCS filepaths')
    finally:
        if log:
            close_log(log)
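# A sketch of the JSON config these scripts load. 'update_cloudsql',
# 'endpt2genomebuild' and 'postprocess_keypath' are keys dereferenced
# elsewhere in this module; the connection block is an assumption about what
# ISBCGC_database_helper needs, so treat the exact shape as illustrative.
EXAMPLE_CONFIG = {
    'update_cloudsql': True,
    'endpt2genomebuild': {'legacy': 'HG19', 'current': 'HG38'},
    'postprocess_keypath': {},  # see the sketch after update_cloudsql_from_bigquery() below
    # assumed CloudSQL connection settings consumed by the helper:
    'cloudsql': {'host': '127.0.0.1', 'user': 'user', 'password': 'pw', 'db': 'prod'},
}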
def update_cloudsql_from_bigquery(config, postproc_config, project_name, cloudsql_table, log, data_type=None, endpt_type=None):
    update_stmt = 'update %s\nset \n\t%s\nwhere %s = %%s' % (
        cloudsql_table,
        '\n\t'.join('%s = %%s,' % (postproc_config['postproc_columns'][key])
                    for key in postproc_config['postproc_columns'].keys())[:-1],
        postproc_config['postproc_key_column'])
    if project_name:
        if data_type:
            # assumes that endpt_type is also supplied
            query_results = query_bq_table(
                postproc_config['postproc_query'] % (', '.join(postproc_config['postproc_columns'].keys()), endpt_type, project_name, data_type),
                False, postproc_config['postproc_project'], log)
        else:
            query_results = query_bq_table(
                postproc_config['postproc_query'] % (', '.join(postproc_config['postproc_columns'].keys()), project_name),
                False, postproc_config['postproc_project'], log)
    else:
        query_results = query_bq_table(
            postproc_config['postproc_query'] % (', '.join(postproc_config['postproc_columns'].keys())),
            False, postproc_config['postproc_project'], log)
    page_token = None
    log.info('\t\t\tupdate_stmt\n%s' % (update_stmt))
    update_count = 0
    while True:
        total_rows, rows, page_token = fetch_paged_results(query_results, postproc_config['postproc_fetch_count'], project_name, page_token, log)
        if 0 < total_rows:
            log.info('\t\t\ttotal rows: %s\n\t%s\n\t\t...\n\t%s' % (total_rows, str(rows[0]), str(rows[-1])))
        else:
            log.info('\t\t\tno rows')
            return
        if config['update_cloudsql']:
            ISBCGC_database_helper.update(config, update_stmt, log, rows, True)
        update_count += len(rows)
        log.info('\t\t\tupdated %s so far%s' % (update_count, ' for ' + project_name if project_name else ''))
        if not page_token:
            log.info('\t\t\tupdated total of %s rows%s' % (update_count, ' for ' + project_name if project_name else ''))
            return
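# A hypothetical postproc_config block covering the keys dereferenced by
# update_cloudsql_from_bigquery() and set_uploaded_path(); the query text,
# table, and column names are invented for illustration. Note the query must
# return the set-clause columns in order, followed by the key column, since
# the fetched rows are passed directly as update parameters.
EXAMPLE_POSTPROC_CONFIG = {
    # BigQuery column -> CloudSQL column to set
    'postproc_columns': {'file_gcs_url': 'file_name_key'},
    'postproc_key_column': 'file_gdc_id',
    'postproc_project': 'my-project',
    'postproc_fetch_count': 2000,
    'postproc_query': 'select %s, file_gdc_id '
                      'from [my-project:my_dataset.my_table] '
                      'where endpoint_type = "%s" and project_short_name = "%s" and data_type = "%s"',
    # templates used by set_uploaded_path(); %s slots: table, project, data type
    'postproc_file_name_key_null_update':
        'update %s set file_name_key = null where project_short_name = "%s" and data_type = "%s"',
    'postproc_file_uploaded_update':
        'update %s set file_uploaded = "true" where file_name_key is not null',
}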
def set_uploaded_path(config, endpt_type, program_name, project_id, data_type, log):
    log.info('\tbegin set_uploaded_path()')
    postproc_config = config['postprocess_keypath']
    # first set file_name_key to null
    cloudsql_table = '%s_metadata_data_%s' % (program_name, config['endpt2genomebuild'][endpt_type])
    ISBCGC_database_helper.update(
        config, postproc_config['postproc_file_name_key_null_update'] % (cloudsql_table, project_id, data_type), log, [[]])
    # now use the BigQuery table to set file_name_key
    update_cloudsql_from_bigquery(config, postproc_config, project_id, cloudsql_table, log, data_type, endpt_type)
    ISBCGC_database_helper.update(
        config, postproc_config['postproc_file_uploaded_update'] % (cloudsql_table), log, [[]])
    log.info('\tfinished set_uploaded_path()')
# note: this is a unittest method excerpted from its TestCase class
def test_populate_sample_availibility(self):
    bqTable2data_type = {'Somatic_Mutation': 'Masked Somatic Mutation'}
    for bqTable, data_type in bqTable2data_type.iteritems():
        self.log.info('populate_sample_availibility() for %s' % (data_type))

        # remove existing records
        stmt = 'delete from TCGA_metadata_sample_data_availability where metadata_data_type_availability_id = ' \
            '(select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where isb_label = "Somatic_Mutation" and genomic_build = "HG38")'
        ISBCGC_database_helper.update(self.config, stmt, self.log, [[]])

        query_results = query_bq_table(
            'select Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode, fileUUID from [isb-cgc:TCGA_hg38_data_v0.{}] group by 1,2,3'.format(bqTable),
            True, None, self.log)
        page_token = None
        barcode2seen_files = {}
        barcode2infos = {}
        while True:
            # loop through the BigQuery results and get the sample_barcode into
            # the info list in the shape populate_sample_availibility() expects
            total_rows, rows, page_token = fetch_paged_results(query_results, 200000, None, page_token, self.log)
            for row in rows:
                tumor = row[0][:16]
                # normal = row[1][:16]
                files = row[2].split('|')
                for curfile in files:
                    # only count each file once per tumor barcode
                    if tumor in barcode2seen_files:
                        seen_files = barcode2seen_files[tumor]
                        if curfile in seen_files:
                            continue
                        seen_files.add(curfile)
                    else:
                        barcode2seen_files[tumor] = set([curfile])
                    samples_tumor = {'submitter_id': tumor}
                    sample_list = [samples_tumor]

                    info = {'access': 'open'}
                    case_list = info.setdefault('cases', [])
                    case_list += [{'samples': sample_list}]
                    barcode2infos[tumor] = barcode2infos.setdefault(tumor, []) + [info]

                    # samples_normal = {'submitter_id': normal}
                    # sample_list = [samples_normal]
                    #
                    # info = {'access': 'open'}
                    # case_list = info.setdefault('cases', [])
                    # case_list += [{'samples': sample_list}]
                    # barcode2infos[normal] = barcode2infos.setdefault(normal, []) + [info]
            if not page_token:
                self.log.info('\tprocessed total of %s rows for %s' % (total_rows, bqTable))
                break
        # gather the infos once all pages have been processed (gathering inside
        # the paging loop would duplicate earlier pages' entries); these create
        # inserts into the metadata data table for BigQuery rows that don't
        # already have a match there
        infos = [info for curinfos in barcode2infos.itervalues() for info in curinfos]

        populate_sample_availibility(self.config, 'current', 'TCGA', 'all', data_type, infos, self.log)
        self.log.info('finished populate_sample_availibility() for %s' % (data_type))
def process_data_image_records(config, program, image_config, image_type, rows, log):
    '''
    based on either the case_barcode (for radiology images) or the
    sample_barcode (for tissue or diagnostic images), either updates or
    creates a new metadata data record in the config-specified metadata
    data table

    parameters:
        config: configuration mappings
        program: the program the images belong to (e.g. 'TCGA')
        image_config: section of the config file with specific mappings for these image types
        image_type: the image type being processed (radiology, tissue, or diagnostic)
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

    fields of the metadata data records:
        file_gdc_id: ?
        case_gdc_id: ?
        case_barcode: row[2]
        sample_gdc_id: ?
        sample_barcode: row[0]
        sample_type: sample_barcode[13:15]
        aliquot_barcode: row[3]
        aliquot_gdc_id: ?
        project_short_name: row[4].split('/')[6]
        disease_code: project_short_name.split('-')[1]
        program_name: 'TCGA'
        data_type: image_type
        data_category: 'Clinical'
        type: 'file'
        file_name: row[1]
        data_format: 'SVS'
        access: 'open'
        acl: 'open'
        platform: 'Clinical'
        file_name_key: row[4]
        file_uploaded: 'true'
        endpoint_type: 'legacy'
        species: 'Homo sapien'
    '''
    # get the information from the config mapping
    log.info('\tchecking data records')
    barcode2rows = {}
    for row in rows:
        if 'Radiology' == image_type:
            bcrows = barcode2rows.setdefault((row[0], row[1].split('/')[-1]), [])
            bcrows += [row]
        else:
            bcrows = barcode2rows.setdefault((row[0], row[1]), [])
            bcrows += [row]

    data_select_template = image_config[image_type]['data_select_template']
    if 0 == len(data_select_template):
        barcode_db = set()
    else:
        data_rows = ISBCGC_database_helper.select(
            config,
            data_select_template % (','.join('("{}", "{}")'.format(barcode, file_name) for barcode, file_name in barcode2rows)),
            log, [])
        barcode_db = set([(data_row[0], data_row[1]) for data_row in data_rows])

    new_barcodes = set(barcode2rows) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new data records'.format(len(new_barcodes)))
        db_rows = []
        for barcode in new_barcodes:
            for row in barcode2rows[barcode]:
                row_method = image_config['image_tag2row_method'][image_type]
                next_row = getattr(sys.modules[__name__], row_method)(row, program, image_config, image_type)
                if next_row is not None:
                    db_rows += next_row
        ISBCGC_database_helper.column_insert(
            config, db_rows, image_config['data_table'], image_config[image_type]['data_columns'], log)
    else:
        log.info('\t\tno rows to insert for data records')

    if 0 < len(barcode_db):
        log.info('\t\tupdating {} existing data records'.format(len(barcode_db)))
        rows = []
        for barcode in barcode_db:
            for row in barcode2rows[barcode]:
                if 'Radiology' == image_type:
                    rows += [[row[1], row[0], row[1].split('/')[5], image_config['image_tag2data_type'][image_type]]]
                else:
                    rows += [[row[4], row[0], row[1], image_config['image_tag2data_type'][image_type]]]
        ISBCGC_database_helper.update(config, image_config[image_type]['data_update_template'], log, rows)
    else:
        log.info('\t\tno rows to update for data records')
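# A sketch of the per-image-type row builder named by
# image_config['image_tag2row_method'] and resolved via getattr() above. The
# function name is hypothetical; the row indices follow the field mapping in
# the docstring, and it returns a list of rows so that `db_rows += next_row`
# extends by whole rows.
def example_tissue_row_method(row, program, image_config, image_type):
    project_short_name = row[4].split('/')[6]
    return [[
        row[2],                                             # case_barcode
        row[0],                                             # sample_barcode
        row[3],                                             # aliquot_barcode
        project_short_name,                                 # project_short_name
        project_short_name.split('-')[1],                   # disease_code
        program,                                            # program_name
        image_config['image_tag2data_type'][image_type],    # data_type
        row[1],                                             # file_name
        row[4],                                             # file_name_key
    ]]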
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_' + 'tcga/'
        log_name = create_log(log_dir, 'update_tcga_gcs_paths')
        log = logging.getLogger(log_name)
        log.info('begin updating TCGA paths in production')

        # get the db rows from production cloudsql
        log.info('\tselect tcga filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'TCGA\''
        cloudsql_rows = set(row[0] for row in helper.select(config, query, log, []))
        log.info('\tselected %s tcga filenames from cloudsql' % (len(cloudsql_rows)))

        # read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'TCGA\') and 0 < instr(file_gcs_url, \'legacy\') and 0 < instr(file_gcs_url, \'bam\') ' \
            'order by file_gcs_url'
        query_results = query_bq_table(query, True, 'isb-cgc', log)
        total_not_matched = 0
        total_distinct = set()
        page_token = None
        while True:
            total_rows, rows, page_token = fetch_paged_results(query_results, 2000, None, page_token, log)

            log.info('\t\tcreate map of filename to path')
            name2path = {}
            for row in rows:
                fields = row[1].split('/')
                name2path[fields[-1]] = '/'.join(fields[3:])
            log.info('\t\tfinished map of filename to path')

            # now set up and do the update of paths in cloudsql
            log.info('\t\tstart updating paths in cloudsql')
            params = []
            select_params = []
            not_matched = []
            for name, path in name2path.iteritems():
                if name in cloudsql_rows:
                    total_distinct.add(name)
                    params += [[path, name]]
                    select_params += [name]
                else:
                    not_matched += [path]
            update = 'update metadata_data set datafilenamekey = %s, datafileuploaded = \'true\' where datafilename = %s'
            helper.update(config, update, log, params)
            select_in = '%s,' * len(select_params)
            select_in = select_in[:-1]
            select_query = 'select count(*) from metadata_data where datafilename in (%s)' % (select_in)
            count = helper.select(config, select_query, log, select_params)
            log.info('select %s file name matches for %s file names.' % (count[0][0], len(select_params)))

            total_not_matched += len(not_matched)
            if not page_token:
                log.info('\t\tupdated total of %s rows for TCGA with %d distinct file names' % (total_rows - total_not_matched, len(total_distinct)))
                break
            else:
                log.info('\t\tupdated %d rows, did not find matches from BQ in cloudsql for %d:\n\t%s' % (len(params), len(not_matched), '\n\t'.join(not_matched)))

        log.info('\tcompleted update of paths in cloudsql')
        log.info('finished updating TCGA paths in production')
    except:
        if log:
            log.exception('failed to update tcga GCS filepaths')
    finally:
        if log:
            close_log(log)
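# A minimal sketch of how the preceding main() would be invoked from the
# command line; the argument handling is an assumption, not part of the
# original script.
if __name__ == '__main__':
    main(sys.argv[1])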