def load_rnaseq_data(disease_code, with_clinical=False, wait_time=30):
    """Load normalized gene-level RNA-seq results for a disease code.

    The archive returned by ``prefetch_rnaseq_data`` is unpacked next to
    itself under ``<disease_code>/gene_expression``; every
    ``genes.normalized_results`` file listed in ``FILE_SAMPLE_MAP.txt`` is
    read and the per-sample tables are stacked into one dataframe.

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_rnaseq_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, attach the clinical information
    wait_time : int, optional
        Currently unused: it is not forwarded to ``prefetch_rnaseq_data``.

    Returns
    -------
    rna_df : Pandas dataframe
        The per-sample tables with ``TCGA_ID`` and ``gene_name`` columns
        added, merged with the file/sample map. When ``with_clinical`` is
        True this is outer-merged with the clinical data
        (``TCGA_ID`` == ``bcr_patient_barcode``).
    """
    # Fetch RNA data
    archive_path = prefetch_rnaseq_data(disease_code)
    result_dir = os.path.join(
        os.path.dirname(archive_path), disease_code, 'gene_expression')
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    gene_marker = 'genes.normalized_results'
    sample_map_name = 'FILE_SAMPLE_MAP.txt'
    sample_map_path = os.path.join(result_dir, sample_map_name)

    # Unpack tar file; the context manager guarantees the handle is closed.
    with tarfile.open(archive_path) as archive:
        gene_members = [el for el in archive if gene_marker in el.name]
        # Compare on-disk paths, not TarInfo objects against name strings:
        # the latter never compare equal, which forced a full re-extraction
        # on every call.
        needs_extract = any(
            not os.path.exists(os.path.join(result_dir, el.name))
            for el in gene_members)
        if needs_extract:
            archive.extractall(members=gene_members, path=result_dir)
        if needs_extract or not os.path.exists(sample_map_path):
            archive.extract(sample_map_name, path=result_dir)

    # Load map from samples to RNA files
    rna_file_sample_map = pd.read_csv(sample_map_path, sep='\t')
    # Split the barcode from the right into five pieces; the leftmost
    # remainder is kept as the TCGA_ID.
    barcode_parts = rna_file_sample_map['barcode(s)'].str.rsplit(
        '-', n=4, expand=True)
    barcode_parts.columns = [
        'TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID'
    ]
    rna_file_sample_map = rna_file_sample_map.join(barcode_parts)

    # Literal substring match, consistent with the tar member filter above.
    gene_filter = rna_file_sample_map['filename'].str.contains(
        gene_marker, regex=False)
    gene_rna_file_sample_map = rna_file_sample_map[gene_filter]

    rna_dfs = []
    for f, sample in zip(gene_rna_file_sample_map['filename'],
                         gene_rna_file_sample_map['TCGA_ID']):
        sample_rna_df = pd.read_csv(os.path.join(result_dir, f), sep='\t')
        sample_rna_df['TCGA_ID'] = sample
        # gene_name is the part of gene_id before the first '|'.
        sample_rna_df['gene_name'] = sample_rna_df.gene_id.str.split(
            '|').str.get(0)
        rna_dfs.append(sample_rna_df)

    # NOTE(review): this merges on the columns shared by both frames
    # (presumably just TCGA_ID) against the *unfiltered* map, so a patient
    # with several mapped files yields repeated rows -- confirm intended.
    rna_df = pd.concat(rna_dfs, copy=False).merge(rna_file_sample_map)

    if not with_clinical:
        return rna_df

    patient_data_df = load_clinical_data(disease_code)
    return rna_df.merge(patient_data_df,
                        how='outer',
                        left_on='TCGA_ID',
                        right_on='bcr_patient_barcode')
def load_rnaseq_data(disease_code, with_clinical=False, wait_time=30):
    """Load normalized gene-level RNA-seq results for a disease code.

    The archive returned by ``prefetch_rnaseq_data`` is unpacked next to
    itself under ``<disease_code>/gene_expression``; every
    ``genes.normalized_results`` file listed in ``FILE_SAMPLE_MAP.txt`` is
    read and the per-sample tables are stacked into one dataframe.

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_rnaseq_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, attach the clinical information
    wait_time : int, optional
        Currently unused: it is not forwarded to ``prefetch_rnaseq_data``.

    Returns
    -------
    rna_df : Pandas dataframe
        The per-sample tables with ``TCGA_ID`` and ``gene_name`` columns
        added, merged with the file/sample map. When ``with_clinical`` is
        True this is outer-merged with the clinical data
        (``TCGA_ID`` == ``bcr_patient_barcode``).
    """
    # Fetch RNA data
    archive_path = prefetch_rnaseq_data(disease_code)
    result_dir = os.path.join(
        os.path.dirname(archive_path), disease_code, 'gene_expression')
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    gene_marker = 'genes.normalized_results'
    sample_map_name = 'FILE_SAMPLE_MAP.txt'
    sample_map_path = os.path.join(result_dir, sample_map_name)

    # Unpack tar file; the context manager guarantees the handle is closed.
    with tarfile.open(archive_path) as archive:
        gene_members = [el for el in archive if gene_marker in el.name]
        # Compare on-disk paths, not TarInfo objects against name strings:
        # the latter never compare equal, which forced a full re-extraction
        # on every call.
        needs_extract = any(
            not os.path.exists(os.path.join(result_dir, el.name))
            for el in gene_members)
        if needs_extract:
            archive.extractall(members=gene_members, path=result_dir)
        if needs_extract or not os.path.exists(sample_map_path):
            archive.extract(sample_map_name, path=result_dir)

    # Load map from samples to RNA files
    rna_file_sample_map = pd.read_csv(sample_map_path, sep='\t')
    # Split the barcode from the right into five pieces; the leftmost
    # remainder is kept as the TCGA_ID.
    barcode_parts = rna_file_sample_map['barcode(s)'].str.rsplit(
        '-', n=4, expand=True)
    barcode_parts.columns = [
        'TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID'
    ]
    rna_file_sample_map = rna_file_sample_map.join(barcode_parts)

    # Literal substring match, consistent with the tar member filter above.
    gene_filter = rna_file_sample_map['filename'].str.contains(
        gene_marker, regex=False)
    gene_rna_file_sample_map = rna_file_sample_map[gene_filter]

    rna_dfs = []
    for f, sample in zip(gene_rna_file_sample_map['filename'],
                         gene_rna_file_sample_map['TCGA_ID']):
        sample_rna_df = pd.read_csv(os.path.join(result_dir, f), sep='\t')
        sample_rna_df['TCGA_ID'] = sample
        # gene_name is the part of gene_id before the first '|'.
        sample_rna_df['gene_name'] = sample_rna_df.gene_id.str.split(
            '|').str.get(0)
        rna_dfs.append(sample_rna_df)

    # NOTE(review): this merges on the columns shared by both frames
    # (presumably just TCGA_ID) against the *unfiltered* map, so a patient
    # with several mapped files yields repeated rows -- confirm intended.
    rna_df = pd.concat(rna_dfs, copy=False).merge(rna_file_sample_map)

    if not with_clinical:
        return rna_df

    patient_data_df = load_clinical_data(disease_code)
    return rna_df.merge(patient_data_df,
                        how='outer',
                        left_on='TCGA_ID',
                        right_on='bcr_patient_barcode')
def load_mutation_data(disease_code,
                       with_clinical=False,
                       variant_type='all',
                       wait_time=30):
    """Load variants from TCGA

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_mutation_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, attach the clinical information
    variant_type : str, optional
        'all' (the default) disables filtering. 'indel' (any letter case,
        e.g. 'INDEL') keeps rows whose ``Variant_Type`` is 'INS' or 'DEL'.
        Any other value is compared exactly against ``Variant_Type``
        (e.g. 'SNP').
    wait_time : int, optional
        Time to wait for response from TCGA; forwarded to
        ``prefetch_mutation_data``.

    Returns
    -------
    mutations : Pandas dataframe
        A dataframe of mutations. When ``with_clinical`` is True it is
        outer-merged with the clinical data
        (``TCGA_ID`` == ``bcr_patient_barcode``).

    Raises
    ------
    ValueError
        If the extracted archive contains no ``.maf`` file.
    """
    archive_path = prefetch_mutation_data(disease_code,
                                          wait_time=wait_time,
                                          cache=True)

    result_dir = os.path.join(
        os.path.dirname(archive_path), disease_code, 'mutations')
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    # Unpack tar file; the context manager guarantees the handle is closed.
    with tarfile.open(archive_path) as archive:
        archive.extractall(path=result_dir)

    # Filter to MAF files (sorted so the row order is deterministic).
    maf_files = sorted(
        f for f in os.listdir(result_dir) if f.endswith('.maf'))
    if not maf_files:
        raise ValueError(
            "No .maf files found in {}".format(result_dir))

    # ignore_index gives every row a unique label. Without it each file
    # contributes its own 0..n-1 labels and the index-based join below
    # matches duplicate labels many-to-many, inflating the row count.
    mutation_df = pd.concat(
        [
            pd.read_csv(os.path.join(result_dir, maf_file),
                        sep='\t',
                        na_values='[Not Available]')
            for maf_file in maf_files
        ],
        ignore_index=True,
        copy=False)

    # Expand out the TCGA barcode to retrieve the TCGA ID
    tcga_info = mutation_df['Tumor_Sample_Barcode'].str.rsplit(
        '-', n=4, expand=True)
    tcga_info.columns = [
        'TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID'
    ]
    mutations = mutation_df.join(tcga_info, how='left')

    if variant_type != 'all':
        # The docstring advertises 'INDEL', so accept any letter case.
        if str(variant_type).lower() == 'indel':
            keep = mutations['Variant_Type'].isin(['INS', 'DEL'])
        else:
            keep = mutations['Variant_Type'] == variant_type
        mutations = mutations[keep]

    logging.info("Loaded {} mutations for {} tumors from {} patients".format(
        len(mutations),
        mutations['Tumor_Sample_Barcode'].nunique(),
        mutations['TCGA_ID'].nunique()))

    if not with_clinical:
        return mutations

    patient_data_df = load_clinical_data(disease_code)
    merged = mutations.merge(patient_data_df,
                             how='outer',
                             left_on='TCGA_ID',
                             right_on='bcr_patient_barcode')
    # Rows with a null TCGA_ID come only from the clinical side of the outer
    # merge, so they are excluded from the mutation count.
    logging.info("Patients: {}, Tumor Samples: {}, Mutations {}".format(
        merged['bcr_patient_barcode'].nunique(),
        merged['Tumor_Sample_Barcode'].nunique(),
        len(merged[~merged['TCGA_ID'].isnull()])))
    return merged
def load_mutation_data(disease_code,
                       with_clinical=False,
                       variant_type='all',
                       wait_time=30):
    """Load variants from TCGA

    Parameters
    ----------
    disease_code : str
        Passed to ``prefetch_mutation_data`` and used as the name of the
        extraction sub-directory.
    with_clinical : bool, optional
        If True, attach the clinical information
    variant_type : str, optional
        'all' (the default) disables filtering. 'indel' (any letter case,
        e.g. 'INDEL') keeps rows whose ``Variant_Type`` is 'INS' or 'DEL'.
        Any other value is compared exactly against ``Variant_Type``
        (e.g. 'SNP').
    wait_time : int, optional
        Time to wait for response from TCGA; forwarded to
        ``prefetch_mutation_data``.

    Returns
    -------
    mutations : Pandas dataframe
        A dataframe of mutations. When ``with_clinical`` is True it is
        outer-merged with the clinical data
        (``TCGA_ID`` == ``bcr_patient_barcode``).

    Raises
    ------
    ValueError
        If the extracted archive contains no ``.maf`` file.
    """
    archive_path = prefetch_mutation_data(disease_code,
                                          wait_time=wait_time,
                                          cache=True)

    result_dir = os.path.join(
        os.path.dirname(archive_path), disease_code, 'mutations')
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    # Unpack tar file; the context manager guarantees the handle is closed.
    with tarfile.open(archive_path) as archive:
        archive.extractall(path=result_dir)

    # Filter to MAF files (sorted so the row order is deterministic).
    maf_files = sorted(
        f for f in os.listdir(result_dir) if f.endswith('.maf'))
    if not maf_files:
        raise ValueError(
            "No .maf files found in {}".format(result_dir))

    # ignore_index gives every row a unique label. Without it each file
    # contributes its own 0..n-1 labels and the index-based join below
    # matches duplicate labels many-to-many, inflating the row count.
    mutation_df = pd.concat(
        [
            pd.read_csv(os.path.join(result_dir, maf_file),
                        sep='\t',
                        na_values='[Not Available]')
            for maf_file in maf_files
        ],
        ignore_index=True,
        copy=False)

    # Expand out the TCGA barcode to retrieve the TCGA ID
    tcga_info = mutation_df['Tumor_Sample_Barcode'].str.rsplit(
        '-', n=4, expand=True)
    tcga_info.columns = [
        'TCGA_ID', 'SampleID', 'PortionID', 'PlateID', 'CenterID'
    ]
    mutations = mutation_df.join(tcga_info, how='left')

    if variant_type != 'all':
        # The docstring advertises 'INDEL', so accept any letter case.
        if str(variant_type).lower() == 'indel':
            keep = mutations['Variant_Type'].isin(['INS', 'DEL'])
        else:
            keep = mutations['Variant_Type'] == variant_type
        mutations = mutations[keep]

    logging.info("Loaded {} mutations for {} tumors from {} patients".format(
        len(mutations),
        mutations['Tumor_Sample_Barcode'].nunique(),
        mutations['TCGA_ID'].nunique()))

    if not with_clinical:
        return mutations

    patient_data_df = load_clinical_data(disease_code)
    merged = mutations.merge(patient_data_df,
                             how='outer',
                             left_on='TCGA_ID',
                             right_on='bcr_patient_barcode')
    # Rows with a null TCGA_ID come only from the clinical side of the outer
    # merge, so they are excluded from the mutation count.
    logging.info("Patients: {}, Tumor Samples: {}, Mutations {}".format(
        merged['bcr_patient_barcode'].nunique(),
        merged['Tumor_Sample_Barcode'].nunique(),
        len(merged[~merged['TCGA_ID'].isnull()])))
    return merged