def process_data_availability_records(config, program, image_config, image_type, rows, log):
    '''
    inserts or updates the per-sample file counts for the 'Pathology_Image'
    availability type in the sample data availability table

    NOTE: this assumes at the start of the run, that no sample barcode is
    associated with an Image ISB label.  it is possible that a sample_barcode
    is added and then a subsequent batch will contain that barcode associated
    with more files.  barcodes that are already in the table therefore have
    the batch count added to the count read from the database rather than
    being inserted a second time

    parameters:
        config: configuration mappings, passed through to the database helper
        program: program name, used as the prefix of the table the existing
            counts are read from
        image_config: not used by this function
        image_type: nothing is done when this is 'Radiology'
        rows: rows of the current batch, row[0] is used as the sample barcode
        log: passed through to the database helper

    raises:
        ValueError: if there is not exactly one availability id for
            'Pathology_Image'/'HG19', or if a sample barcode is returned more
            than once by the query for the existing counts
    '''
    if 'Radiology' == image_type:
        return

    # find the availability row that the per-sample counts are keyed on
    stmt = 'select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where %s = isb_label and %s = genomic_build'
    image_rows = ISBCGC_database_helper.select(config, stmt, log, ["Pathology_Image", "HG19"])
    if 1 != len(image_rows):
        raise ValueError(
            'unexpected number of rows returned for availability id {}'.format(
                image_rows))
    data_type_availability_id = image_rows[0][0]

    # get the current count for a barcode based on previous runs
    # NOTE(review): count(*) is the number of table rows per barcode, not the
    # stored 'count' column -- confirm that this is the value that should be
    # added to the batch count below
    stmt = 'select sample_barcode, count(*) from {}_metadata_sample_data_availability where metadata_data_type_availability_id = %s group by 1'.format(
        program)
    db_rows = ISBCGC_database_helper.select(config, stmt, log, [data_type_availability_id])
    db_barcode2count = {}
    for db_row in db_rows:
        db_barcode2count[db_row[0]] = db_row[1]
    # compare against the distinct barcodes; comparing against the distinct
    # (barcode, count) rows would miss a barcode returned with two counts
    if len(db_rows) != len(db_barcode2count):
        raise ValueError(
            'sample_barcode should only appear once per isb_label')

    # get the current count from the current batch for each sample barcode
    barcode2count = {}
    for row in rows:
        barcode2count[row[0]] = barcode2count.get(row[0], 0) + 1

    # the split between insert and update has to be made on the barcodes
    # themselves.  db_rows holds (barcode, count) tuples, which never compare
    # equal to a bare barcode
    batch_barcodes = set(barcode2count)
    known_barcodes = set(db_barcode2count)

    new_params = [
        [data_type_availability_id, new_barcode, barcode2count[new_barcode]]
        for new_barcode in batch_barcodes - known_barcodes
    ]
    # NOTE(review): the insert and update target the TCGA table regardless of
    # 'program', while the existing counts are read from the program table
    ISBCGC_database_helper.column_insert(
        config, new_params, 'TCGA_metadata_sample_data_availability',
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'],
        log)

    update_params = [
        [
            barcode2count[update_barcode] + db_barcode2count[update_barcode],
            data_type_availability_id, update_barcode
        ]
        for update_barcode in batch_barcodes & known_barcodes
    ]
    stmt = 'update TCGA_metadata_sample_data_availability set count = %s where metadata_data_type_availability_id = %s and sample_barcode = %s'
    ISBCGC_database_helper.update(config, stmt, log, update_params, False)
def process_biospecimen_image_records(config, program, image_config, image_type, rows, log):
    '''
    based on the sample_barcode (for tissue or diagnostic images), creates
    new metadata biospecimen records, where necessary, in the config-specified
    biospecimen table.  a 'legacy' and a 'current' record are created for
    every sample barcode of the batch that the select does not return

    parameters:
        config: configuration mappings
        program: not used by this function, the program name is fixed to 'TCGA'
        image_config: section of the config file with specific mappings for
            these image types
        image_type: key into image_config for the sample barcode index
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

    values of an inserted record, in order:
        endpoint_type: 'legacy' or 'current'
        sample_barcode: row[0]
        sample_type: row[0][13:15]
        case_barcode: row[2]
        program_name: 'TCGA'
        disease_code: project_short_name.split('-')[1]
        project_short_name: row[4].split('/')[6]

    sample_gdc_id and case_gdc_id are not set here
    '''
    log.info('\tchecking biospecimen records')

    # a later row wins when a sample barcode occurs more than once
    barcode2row = {}
    for batch_row in rows:
        barcode2row[batch_row[image_config[image_type]['sample_barcode_index']]] = batch_row
    log.info('\tbacodes--{}:{}'.format(len(set(barcode2row)), len(barcode2row)))

    # ask the database which of the batch barcodes it already has
    quoted_barcodes = "'" + "','".join(barcode2row) + "'"
    select_stmt = image_config['biospecimen_select_template'] % (quoted_barcodes)
    found_rows = ISBCGC_database_helper.select(config, select_stmt, log, [])
    barcode_db = set()
    for found_row in found_rows:
        barcode_db.add(found_row[0])

    new_barcodes = set(barcode2row) - barcode_db
    if not new_barcodes:
        log.info('\t\tno rows to insert for biospecimen records')
        return

    log.info('\t\tinserting {} new biospecimen records'.format(
        len(new_barcodes)))
    insert_rows = []
    for barcode in new_barcodes:
        source = barcode2row[barcode]
        project_short_name = source[4].split('/')[6]
        disease_code = project_short_name.split('-')[1]
        for endpoint_type in ('legacy', 'current'):
            insert_rows.append([
                endpoint_type, source[0], source[0][13:15], source[2],
                'TCGA', disease_code, project_short_name
            ])
    ISBCGC_database_helper.column_insert(
        config, insert_rows, image_config['biospecimen_table'],
        image_config['biospecimen_columns'], log)
def populate_sample_availibility(config, endpt_type, program_name, project_id, data_type, infos, log):
    '''
    counts the files per sample barcode in the gdc info for a data type and
    inserts the counts into the program's sample data availability table

    for the somatic mutation data types files with controlled access are not
    counted, for 'Aligned reads' files with open access are not counted

    parameters:
        config: configuration mappings
        endpt_type: key into config['endpt2genomebuild'] for the genomic build
        program_name: program name, used as key into config and as the
            prefix of the table names
        project_id: only used in the log messages
        data_type: key into config['data_type2isb_label'], also decides which
            files are left out of the counts
        infos: the gdc info records, each is flattened with flatten_map()
        log: where to write progress and other messages

    raises:
        IndexError: if no availability row is returned for the genomic build
            and isb label
    '''
    log.info('\tbegin populate_sample_availibility() for %s:%s' % (project_id, data_type))

    skip_controlled = data_type in ('Simple somatic mutation', 'Masked Somatic Mutation')
    # an equality test on purpose: "in ('Aligned reads')" is a substring test
    # against a plain string, which is also true for 'Aligned', 'reads' or ''
    skip_open = 'Aligned reads' == data_type

    # iterate through the gdc info and put together the counts for the sample barcodes
    mapping = config[program_name]['process_files']['data_table_mapping']
    sample_barcode2count = {}
    for info in infos:
        for record in flatten_map(info, mapping):
            # 'access' is only read for the data types that filter on it.
            # the value with the trailing blank is still accepted since it is
            # what the previous version of this check compared against
            if skip_controlled and record['access'] in ('controlled', 'controlled '):
                continue
            if skip_open and 'open' == record['access']:
                continue
            sample_barcode = record['sample_barcode']
            sample_barcode2count[sample_barcode] = sample_barcode2count.get(sample_barcode, 0) + 1

    # read in the appropriate data availability row to get the foreign key
    isb_label = config['data_type2isb_label'][data_type]
    stmt = 'select metadata_data_type_availability_id from %s_metadata_data_type_availability where genomic_build = %%s and isb_label = %%s' % (
        program_name)
    foreign_key = ISBCGC_database_helper.select(
        config, stmt, log,
        [config['endpt2genomebuild'][endpt_type], isb_label])[0][0]

    # items() instead of iteritems() so this also runs under python 3
    params = [
        [foreign_key, sample_barcode, count]
        for sample_barcode, count in sample_barcode2count.items()
    ]
    ISBCGC_database_helper.column_insert(
        config, params,
        '%s_metadata_sample_data_availability' % (program_name),
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'],
        log)

    log.info('\tfinished populate_sample_availibility() for %s:%s' % (project_id, data_type))
def process_clinical_image_records(config, program, image_config, image_type, rows, log):
    '''
    based on the case_barcode, creates new metadata clinical records, where
    necessary, in the config-specified clinical table.  a 'legacy' and a
    'current' record are created for every case barcode of the batch that
    the select does not return, unless its project short name is 'NA'

    parameters:
        config: configuration mappings
        program: program name that is put into the new records
        image_config: section of the config file with specific mappings for
            these image types
        image_type: key into image_config for the case barcode index, also
            decides where the barcode and project are taken from in a row
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

    values of an inserted record, in order:
        endpoint_type: 'legacy' or 'current'
        case_barcode: row[0] for 'Radiology', otherwise row[2]
        program_name: program
        disease_code: project_short_name.split('-')[1]
        project_short_name: row[1].split('/')[5] for 'Radiology', otherwise
            row[4].split('/')[6]

    case_gdc_id is not set here
    '''
    log.info('\tchecking clinical records.')

    # a later row wins when a case barcode occurs more than once
    barcode2row = {}
    for batch_row in rows:
        barcode2row[batch_row[image_config[image_type]['case_barcode_index']]] = batch_row
    log.info('\tbacodes--{}:{}'.format(len(set(barcode2row)), len(barcode2row)))

    # ask the database which of the batch barcodes it already has
    quoted_barcodes = "'" + "','".join(barcode2row) + "'"
    select_stmt = image_config['clinical_select_template'] % (quoted_barcodes)
    found_rows = ISBCGC_database_helper.select(config, select_stmt, log, [])
    barcode_db = set()
    for found_row in found_rows:
        barcode_db.add(found_row[0])

    new_barcodes = set(barcode2row) - barcode_db
    if not new_barcodes:
        log.info('\t\tno rows to insert for clinical records')
        return

    log.info('\t\tinserting {} new clinical records'.format(
        len(new_barcodes)))
    radiology = 'Radiology' == image_type
    insert_rows = []
    for barcode in new_barcodes:
        source = barcode2row[barcode]
        if radiology:
            case_barcode, project_short_name = source[0], source[1].split('/')[5]
        else:
            case_barcode, project_short_name = source[2], source[4].split('/')[6]
        if 'NA' != project_short_name:
            disease_code = project_short_name.split('-')[1]
            for endpoint_type in ('legacy', 'current'):
                insert_rows.append([
                    endpoint_type, case_barcode, program, disease_code,
                    project_short_name
                ])
    # the insert is issued even if every new barcode was skipped for 'NA'
    ISBCGC_database_helper.column_insert(config, insert_rows,
                                         image_config['clinical_table'],
                                         image_config['clinical_columns'], log)
def process_data_image_records(config, program, image_config, image_type, rows, log):
    '''
    based on the barcode and file name of a row, either updates or creates a
    new metadata data record in the config-specified metadata data table

    rows are grouped by (row[0], file name), where the file name is the last
    path element of row[1] for 'Radiology' and row[1] otherwise.  groups the
    select returns are updated with the image type's update template, all
    other groups are inserted with the rows built by the row method that is
    configured for the image type

    parameters:
        config: configuration mappings
        program: passed on to the row method
        image_config: section of the config file with specific mappings for
            these image types
        image_type: key into image_config for the templates, columns, row
            method and data type
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

    parameters of an update, in order:
        'Radiology': row[1], row[0], row[1].split('/')[5], data type
        otherwise: row[4], row[0], row[1], data type

    column mapping of a data record as noted for this function originally
    (the record itself is built by the configured row method, which is not
    part of this function):
        file_gdc_id: ?
        case_gdc_id: ?
        case_barcode: row[2]
        sample_gdc_id: ?
        sample_barcode: row[0]
        sample_type: sample_barcode[13:15]
        aliquot_barcode: row[3]
        aliquot_gdc_id: ?
        project_short_name: row[4].split('/')[6]
        disease_code: project_short_name.split('-')[1]
        program_name: 'TCGA'
        data_type: image_type
        data_category: 'Clinical'
        type: 'file'
        file_name: row[1]
        data_format: 'SVS'
        access: 'open'
        acl: 'open'
        platform: 'Clinical'
        file_name_key: row[4]
        file_uploaded: 'true'
        endpoint_type: 'legacy'
        species: 'Homo sapien'

    raises:
        KeyError: if the select returns a (barcode, file name) pair that is
            not part of the batch
    '''
    log.info('\tchecking data records')
    radiology = 'Radiology' == image_type

    # group the batch by (barcode, file name)
    barcode2rows = {}
    for row in rows:
        file_name = row[1].split('/')[-1] if radiology else row[1]
        barcode2rows.setdefault((row[0], file_name), []).append(row)

    # find out which of the groups are already in the database.  an empty
    # select template means that nothing is looked up and all is inserted
    data_select_template = image_config[image_type]['data_select_template']
    if 0 == len(data_select_template):
        barcode_db = set()
    else:
        pairs = ','.join(
            '("{}", "{}")'.format(barcode, file_name)
            for barcode, file_name in barcode2rows)
        data_rows = ISBCGC_database_helper.select(
            config, data_select_template % (pairs), log, [])
        barcode_db = set((data_row[0], data_row[1]) for data_row in data_rows)

    new_barcodes = set(barcode2rows) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new data records'.format(len(new_barcodes)))
        # the row method is looked up by its configured name in this module
        row_method = getattr(sys.modules[__name__],
                             image_config['image_tag2row_method'][image_type])
        db_rows = []
        for barcode in new_barcodes:
            for row in barcode2rows[barcode]:
                next_row = row_method(row, program, image_config, image_type)
                # a row method may return None to leave a row out
                if next_row is not None:
                    db_rows += next_row
        ISBCGC_database_helper.column_insert(
            config, db_rows, image_config['data_table'],
            image_config[image_type]['data_columns'], log)
    else:
        log.info('\t\tno rows to insert for data records')

    if 0 < len(barcode_db):
        log.info('\t\tupdating {} existing data records'.format(
            len(barcode_db)))
        data_type = image_config['image_tag2data_type'][image_type]
        update_rows = []
        for barcode in barcode_db:
            for row in barcode2rows[barcode]:
                if radiology:
                    update_rows.append(
                        [row[1], row[0], row[1].split('/')[5], data_type])
                else:
                    update_rows.append([row[4], row[0], row[1], data_type])
        ISBCGC_database_helper.update(
            config, image_config[image_type]['data_update_template'], log,
            update_rows)
    else:
        log.info('\t\tno rows to update for data records')