def add_metadata(self, data_df, info, program_name, project, config):
    """Add metadata info to the dataframe
    """
    metadata_list = flatten_map(
        info, config[program_name]['process_files']['data_table_mapping'])
    metadata = metadata_list[0]
    for next_metadata in metadata_list[1:]:
        metadata.update(next_metadata)

    # the sample type code is sliced out of the sample barcode at the
    # program-specific positions given in the config
    program = project.split('-')[0]
    start_samplecode = config['sample_code_position'][program]['start']
    end_samplecode = config['sample_code_position'][program]['end']
    sample_type_code = metadata['sample_barcode'][start_samplecode:end_samplecode]

    data_df['file_gdc_id'] = metadata['file_gdc_id']
    data_df['aliquot_barcode'] = metadata['aliquot_barcode']
    data_df['sample_gdc_id'] = metadata['sample_gdc_id']
    data_df['sample_barcode'] = metadata['sample_barcode']
    data_df['case_gdc_id'] = metadata['case_gdc_id']
    data_df['case_barcode'] = metadata['case_barcode']
    data_df['program_name'] = metadata['program_name'].upper()
    data_df['project_short_name'] = metadata['project_short_name'].upper()
    data_df['sample_type_letter_code'] = config['sample_code2letter'][sample_type_code]
    data_df['data_type'] = metadata['data_type']
    data_df['experimental_strategy'] = metadata['experimental_strategy']
    return data_df
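# A minimal sketch of the sample-type-code slicing above, assuming the TCGA
# barcode layout; the config values and barcode below are illustrative only.
# In a TCGA barcode the sample type code occupies characters 13-15, so
# 'TCGA-02-0001-01C' yields '01' (primary tumor).
example_config = {'sample_code_position': {'TCGA': {'start': 13, 'end': 15}}}
example_barcode = 'TCGA-02-0001-01C'
positions = example_config['sample_code_position']['TCGA']
print(example_barcode[positions['start']:positions['end']])  # prints: 01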
def __insert_rows(config, endpt_type, tablename, values, mapfilter, log):
    maps = []
    for value in values:
        maps += flatten_map(value, mapfilter)
    print_list_synopsis(maps, '\t\trows to save for %s' % (tablename), log)

    module = import_module(config['database_module'])
    fieldnames = module.ISBCGC_database_helper.field_names(tablename)
    rows = []
    for nextmap in maps:
        rows += __addrow(endpt_type, fieldnames, nextmap, log)

    if config['update_cloudsql']:
        # check whether the first few rows were already saved by a prior run
        # def select(cls, config, stmt, log, params = [], verbose = True):
        wherelist = []
        for fieldname in fieldnames:
            wherelist += ['%s = %%s' % (fieldname)]
        stmt = 'select %s from %s where %s' % (fieldnames[0], tablename, ' and '.join(wherelist))
        count = 0
        for index in range(8):
            if len(rows) == index:
                break
            result = module.ISBCGC_database_helper.select(config, stmt, log, rows[index])
            count += 1 if len(result) > 0 else 0
        if count == min(len(rows), 8):
            log.warning('\n\t====================\n\tfirst %d records already saved for %s, skipping\n\t====================' % (count, tablename))
            return
        elif 0 < count:
            raise ValueError('only some of the first %d records were saved for %s' % (count, tablename))
        module.ISBCGC_database_helper.column_insert(config, rows, tablename, fieldnames, log)
    else:
        log.warning('\n\t====================\n\tnot saving to cloudsql for %s this run!\n\t====================' % (tablename))
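# Hedged illustration of how the duplicate-check statement above is assembled;
# the table and field names are hypothetical, not taken from the real schema.
fieldnames = ['file_gdc_id', 'sample_barcode', 'data_type']
tablename = 'TCGA_metadata_data_hg38'
wherelist = ['%s = %%s' % (fieldname) for fieldname in fieldnames]
stmt = 'select %s from %s where %s' % (fieldnames[0], tablename, ' and '.join(wherelist))
print(stmt)
# select file_gdc_id from TCGA_metadata_data_hg38
#     where file_gdc_id = %s and sample_barcode = %s and data_type = %s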
def populate_sample_availibility(config, endpt_type, program_name, project_id, data_type, infos, log):
    log.info('\tbegin populate_sample_availibility() for %s:%s' % (project_id, data_type))

    # iterate through the gdc info and put together the counts for the sample barcodes
    sample_barcode2count = {}
    for info in infos:
        mapping = config[program_name]['process_files']['data_table_mapping']
        flattened = flatten_map(info, mapping)
        for index in range(len(flattened)):
            # skip controlled-access mutation files and open-access aligned reads
            if (data_type in ('Simple somatic mutation', 'Masked Somatic Mutation') and 'controlled' == flattened[index]['access']) or \
                    (data_type in ('Aligned reads',) and 'open' == flattened[index]['access']):
                continue
            sample_barcode = flattened[index]['sample_barcode']
            count = sample_barcode2count.setdefault(sample_barcode, 0)
            sample_barcode2count[sample_barcode] = count + 1

    # read in the appropriate data availability row to get the foreign key
    isb_label = config['data_type2isb_label'][data_type]
    stmt = 'select metadata_data_type_availability_id from %s_metadata_data_type_availability where genomic_build = %%s and isb_label = %%s' % (program_name)
    foreign_key = ISBCGC_database_helper.select(config, stmt, log, [config['endpt2genomebuild'][endpt_type], isb_label])[0][0]

    params = []
    for sample_barcode, count in sample_barcode2count.items():
        params += [[foreign_key, sample_barcode, count]]
    ISBCGC_database_helper.column_insert(
        config, params, '%s_metadata_sample_data_availability' % (program_name),
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    log.info('\tfinished populate_sample_availibility() for %s:%s' % (project_id, data_type))
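# Minimal sketch of the per-sample counting above, using made-up barcodes;
# the real flattened rows come from flatten_map() over the GDC file info.
flattened = [
    {'sample_barcode': 'TCGA-02-0001-01A', 'access': 'open'},
    {'sample_barcode': 'TCGA-02-0001-01A', 'access': 'open'},
    {'sample_barcode': 'TCGA-02-0002-10A', 'access': 'open'},
]
sample_barcode2count = {}
for row in flattened:
    count = sample_barcode2count.setdefault(row['sample_barcode'], 0)
    sample_barcode2count[row['sample_barcode']] = count + 1
print(sample_barcode2count)  # {'TCGA-02-0001-01A': 2, 'TCGA-02-0002-10A': 1}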
def add_metadata(self, file_df, data_type, info, program_name, project, config):
    """Add metadata info to the dataframe
    """
    metadata_list = flatten_map(
        info, config[program_name]['process_files']['data_table_mapping'])
    metadata = metadata_list[0]
    for next_metadata in metadata_list[1:]:
        metadata.update(next_metadata)

    metadata_columns = config[program_name]['process_files']['datatype2bqscript'][data_type]['add_metadata_columns']
    for metadata_column in metadata_columns:
        if 'sample_type_code' == metadata_column:
            # the sample type code is not in the metadata map; slice it out of
            # the sample barcode at the program-specific positions
            program = project.split('-')[0]
            start_samplecode = config['sample_code_position'][program]['start']
            end_samplecode = config['sample_code_position'][program]['end']
            file_df['sample_type_code'] = metadata['sample_barcode'][start_samplecode:end_samplecode]
        else:
            file_df[metadata_column] = metadata[metadata_column]
    return file_df
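# Hedged sketch of the add_metadata_columns configuration consumed above; the
# data type and column list are illustrative, not the repository's actual config.
example = {'TCGA': {'process_files': {'datatype2bqscript': {
    'Isoform Expression Quantification': {
        'add_metadata_columns': ['sample_type_code', 'aliquot_barcode', 'project_short_name']}}}}}
print(example['TCGA']['process_files']['datatype2bqscript']
      ['Isoform Expression Quantification']['add_metadata_columns'])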
def upload_batch_etl(self, config, outputdir, paths, file2info, program_name, project, data_type, log):
    if not config[program_name]['process_files']['datatype2bqscript']['Isoform Expression Quantification']['only_matrix']:
        super(miRNA_matrix, self).upload_batch_etl(config, outputdir, paths, file2info, program_name, project, data_type, log)
    else:
        log.info('not calling upload_batch_etl() for %s:%s' % (project, data_type))

    # copy files to a common location across all projects, flattening the directory names into the file names
    input_dir = config['download_base_output_dir'] + '%s/%s/' % (project, data_type)
    common_dir = config['download_base_output_dir'] + config[program_name]['process_files']['datatype2bqscript']['Isoform Expression Quantification']['matrix_subdir']
    log.info('\tcopy files for %s:%s for mirna isoform matrix' % (data_type, project))
    contents = listdir(input_dir)
    for content in contents:
        if path.isdir(input_dir + content):
            files = listdir(input_dir + content)
            for file_name in files:
                full_name = content + '_' + file_name
                if path.exists(common_dir + full_name):
                    raise ValueError('file already exists: %s' % (full_name))
                copy(input_dir + content + '/' + file_name, common_dir + full_name)
    log.info('\tcopied files for %s:%s for mirna isoform matrix' % (data_type, project))

    # the first time this is called, save off file2info, transformed into an aliquot-centric map, for use in finalize
    mapfile_name = project + "_aliquotinfo.txt"
    mapfile_path = config['download_base_output_dir'] + config[program_name]['process_files']['datatype2bqscript']['Isoform Expression Quantification']['matrix_persist_subdir'] + mapfile_name
    if not path.exists(mapfile_path):
        log.info('\tcreate metadata file for %s:%s for mirna isoform matrix' % (data_type, project))
        # create the aliquot centric map
        file_name2info = {}
        for value in file2info.values():
            flattened = flatten_map(value, config[program_name]['process_files']['data_table_mapping'])[0]
            info = file_name2info.setdefault('_'.join([flattened['file_gdc_id'], flattened['file_name']]), {})
            info['aliquot_barcode'] = flattened['aliquot_barcode']
            info['project_short_name'] = flattened['project_short_name']
            program_name = flattened['program_name']
            info['program_name'] = program_name
            sample_type_code = flattened['aliquot_barcode'][config['sample_code_position'][program_name]['start']:config['sample_code_position'][program_name]['end']]
            info['sample_type_code'] = sample_type_code
            info['file_name'] = flattened['file_name']
            info['file_gdc_id'] = flattened['file_gdc_id']
            info['case_gdc_id'] = flattened['case_gdc_id']
            info['sample_gdc_id'] = flattened['sample_gdc_id']
            info['aliquot_gdc_id'] = flattened['aliquot_gdc_id']
        with open(mapfile_path, 'wb') as mapfile:
            dump(file_name2info, mapfile, protocol=HIGHEST_PROTOCOL)
        log.info('\tsaved metadata file for %s:%s for mirna isoform matrix' % (data_type, project))
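# Hedged sketch of how the aliquot-info map above is persisted and presumably
# read back later (e.g. in finalize); the path and contents are illustrative only.
from pickle import dump, load, HIGHEST_PROTOCOL
file_name2info = {'0123abcd_isoforms.quantification.txt': {'aliquot_barcode': 'TCGA-02-0001-01C-01D-0182-13'}}
with open('TCGA-BRCA_aliquotinfo.txt', 'wb') as mapfile:
    dump(file_name2info, mapfile, protocol=HIGHEST_PROTOCOL)
with open('TCGA-BRCA_aliquotinfo.txt', 'rb') as mapfile:
    print(load(mapfile))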
def process_files(config, endpt_type, file2info, outputdir, start, end, program_name, project, data_type, etl_class, log):
    try:
        filepath = outputdir + config['download_output_file_template'] % (start, end - 1)
        with tarfile.open(filepath) as tf:
            log.info('\t\textract tar files from %s' % (filepath))
            tf.extractall(outputdir)
            log.info('\t\tdone extract tar files from %s' % (filepath))

        with open(outputdir + 'MANIFEST.txt') as manifest:
            lines = manifest.read().split('\n')
        paths = []
        filenames = set()
        for line in lines[1:]:
            filepath = line.split('\t')[1]
            paths += [filepath]
            filenames.add(filepath.split('/')[1])
        paths.sort(key=lambda path: path.split('/')[1])

        if config['upload_files']:
            for path in paths:
                basefolder = config['buckets']['folders']['base_file_folder']
                metadata = flatten_map(file2info[path], config[program_name]['process_files']['data_table_mapping'])
                # assemble the bucket key from the configured path components;
                # 'alt:<field1>:<field2>' parts fall back to field2 when field1 is missing or empty
                keypath_template = config[program_name]['process_files']['bucket_path_template']
                key_path_components = []
                for part in config[program_name]['process_files']['bucket_path']:
                    fields = part.split(':')
                    if 1 == len(fields):
                        if 'endpoint_type' == part:
                            key_path_components += [endpt_type]
                        else:
                            key_path_components += [metadata[0][part]]
                    elif 'alt' == fields[0]:
                        if fields[1] in metadata[0] and metadata[0][fields[1]]:
                            key_path_components += [metadata[0][fields[1]]]
                        else:
                            key_path_components += [metadata[0][fields[2]]]

                key_name = basefolder + (keypath_template % tuple(key_path_components))
                log.info('\t\tuploading %s' % (key_name))
                upload_file(config, outputdir + path, config['buckets']['open'], key_name, log)
        else:
            log.info('\t\t\tnot uploading files for %s:%s' % (project, data_type))

        etl_uploaded = False
        if config['upload_etl_files'] and data_type in config[program_name]['process_files']['datatype2bqscript'] and etl_class is not None:
            etl_uploaded = etl_class.upload_batch_etl(config, outputdir, paths, file2info, endpt_type, program_name, project, data_type, log)
        else:
            log.warning('\t\tnot processing files for ETL for project %s and datatype %s%s' % (project, data_type, ' because there is no script specified' if config['upload_etl_files'] else ''))
        return etl_uploaded
    except:
        log.exception('problem processing file %s for project %s and data_type %s' % (filepath, project, data_type))
        raise
    finally:
        if 'delete_dir_contents' not in config or config['delete_dir_contents']:
            delete_dir_contents(outputdir)
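# Hedged illustration of the bucket key assembly above; the config entries and
# metadata values are hypothetical, only the 'alt:' fallback behavior mirrors
# the loop in process_files().
endpt_type = 'current'
metadata = [{'program_name': 'TCGA', 'project_short_name': 'TCGA-BRCA',
             'aliquot_barcode': '', 'sample_barcode': 'TCGA-02-0001-01A',
             'file_name': 'isoforms.quantification.txt'}]
bucket_path = ['program_name', 'project_short_name', 'endpoint_type',
               'alt:aliquot_barcode:sample_barcode', 'file_name']
keypath_template = '%s/%s/%s/%s/%s'
key_path_components = []
for part in bucket_path:
    fields = part.split(':')
    if 1 == len(fields):
        key_path_components += [endpt_type if 'endpoint_type' == part else metadata[0][part]]
    elif 'alt' == fields[0]:
        if fields[1] in metadata[0] and metadata[0][fields[1]]:
            key_path_components += [metadata[0][fields[1]]]
        else:
            key_path_components += [metadata[0][fields[2]]]
print(keypath_template % tuple(key_path_components))
# TCGA/TCGA-BRCA/current/TCGA-02-0001-01A/isoforms.quantification.txt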