def main(args):
    """Print every gene from a gene-list file that has no position in Xena.

    The genes are looked up in the TCGA KIRC PanCan-normalized expression
    dataset on the TCGA Xena hub. A gene is printed when the 'position'
    entry of its result is an empty list (presumably meaning the hub does
    not know the gene -- confirm against the xenaPython documentation).

    Parameters
    ----------
    args
        Command-line argument vector: ``[script_name, gene_list_path]``.
        The gene list is a text file with one gene symbol per line.

    Raises
    ------
    SystemExit
        With a usage message when ``args`` does not hold exactly two items.
    """
    if len(args) != 2:
        sys.exit("USAGE: python checkGeneList.py geneList.txt > outFile")

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA.KIRC.sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. The context manager guarantees the file is closed,
    # and stripping each line (instead of chopping the last character) keeps
    # the final gene intact when the file has no trailing newline. Blank
    # lines are skipped rather than ending the read early.
    with open(args[1]) as gene_file:
        stripped_lines = [line.strip() for line in gene_file]
    genes = [gene for gene in stripped_lines if gene]

    # Report the genes for which the hub returned no position.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        if entry['position'] == []:
            # A single parenthesised argument prints identically on
            # Python 2 and Python 3.
            print(entry['gene'])
def main(args):
    """Print a tab-separated expression table for a list of genes.

    The genes are looked up in the PanCan-normalized expression dataset of
    the requested TCGA cancer type on the TCGA Xena hub. The first output
    line is a header (``Gene`` followed by the sample IDs); each following
    line holds a gene name and the first score row returned for it.

    Parameters
    ----------
    args
        Command-line argument vector:
        ``[script_name, cancer_type, gene_list_path]``. ``cancer_type`` is
        inserted into the dataset name (e.g. ``KIRC``); the gene list is a
        text file with one gene symbol per line.

    Raises
    ------
    SystemExit
        With a usage message when ``args`` does not hold exactly three items.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python getExpressionFromXena.py cancerType[KIRC] geneList.txt > outFile"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. The context manager guarantees the file is closed,
    # and stripping each line (instead of chopping the last character) keeps
    # the final gene intact when the file has no trailing newline. Blank
    # lines are skipped rather than ending the read early.
    with open(args[2]) as gene_file:
        stripped_lines = [line.strip() for line in gene_file]
    genes = [gene for gene in stripped_lines if gene]

    # Header row. A single parenthesised argument prints identically on
    # Python 2 and Python 3.
    print('Gene\t' + '\t'.join(samples))

    # One row per returned gene entry.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        scores = entry['scores'][0]
        print(entry['gene'] + '\t' + '\t'.join(str(x) for x in scores))
def download_data():
    """Download and preprocess the expression and survival data.

    Fetches the sample list of the TCGA-BRCA FPKM-UQ dataset from the GDC
    Xena hub, retrieves expression and survival data for those samples,
    merges the two, filters outliers and hands the result to
    ``record_train_test``. Nothing is returned.
    """
    xena_hub = "https://gdc.xenahubs.net"
    expression_dataset = "TCGA-BRCA.htseq_fpkm-uq.tsv"
    sample_ids = xena.dataset_samples(xena_hub, expression_dataset, None)

    # Transcriptomic data first, then survival data, both for the same
    # hub / dataset / sample list.
    expression = get_expression_data(xena_hub, expression_dataset, sample_ids)
    survival = get_survival_data(xena_hub, expression_dataset, sample_ids)

    # Phenotype data is not fetched for now; it could later be used to add
    # covariates (via get_phenotype_data(hub, samples)).

    merged = merge_data(expression, survival)
    record_train_test(filter_outliers(merged))
def download_gdc_clinicals(
    xena_hub: str,
    dataset: str,
    *,
    rowKey: str = "SampleID",
    headers: List = [
        "sample_type.samples",
        "days_to_death.demographic",
        "days_to_last_follow_up.diagnoses",
        "vital_status.demographic",
    ],
) -> pd.DataFrame:
    """
    Fetch selected clinical columns for every sample of a dataset.

    The values are stored exactly as returned by
    ``xena.dataset_probe_values``; this function performs no decoding.
    NOTE(review): clinical tables contain many factor columns, so the
    returned values for such columns may be factor codes rather than
    category names -- confirm, and decode downstream if needed.

    Parameters
    ----------
    xena_hub
        Url of the data repository hub.
    dataset
        The dataset containing survival information on the repository hub.
    rowKey
        Name given to the column of sample IDs, which becomes the index of
        the returned dataframe.
    headers
        Column names to retrieve. Any iterable of names is accepted; it is
        copied before use, so the caller's object (and the shared default
        list) is never handed to external code.

    Returns
    -------
    Dataframe indexed by ``rowKey`` with one column per retrieved header.
    """
    # Private copy: the default list is shared between calls, so it must
    # never be passed by reference to code that could modify it.
    columns = list(headers)

    samples = xena.dataset_samples(xena_hub, dataset, None)
    # The first element of the result is not needed here.
    _, values = xena.dataset_probe_values(xena_hub, dataset, samples, columns)

    clinicals = {rowKey: samples}
    # Pair each requested column name with its row of values.
    clinicals.update(zip(columns, values))

    return pd.DataFrame(clinicals).set_index(rowKey)
def main(args):
    """Print a gene list with unknown genes replaced by their Xena aliases.

    Each gene is looked up in the PanCan-normalized expression dataset of
    the requested TCGA cancer type on the TCGA Xena hub. A gene whose
    result has a position and whose first score is not ``'NaN'`` is printed
    unchanged. Otherwise ``findAliasInXena`` is asked for an alias: a
    non-empty alias is printed in place of the gene, and a gene without an
    alias is omitted from the output.

    Parameters
    ----------
    args
        Command-line argument vector:
        ``[script_name, cancer_type, gene_list_path]``. ``cancer_type`` is
        inserted into the dataset name (e.g. ``KIRC``); the gene list is a
        text file with one gene symbol per line.

    Raises
    ------
    SystemExit
        With a usage message when ``args`` does not hold exactly three items.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python replaceGeneListWithXenaAliasNames.py cancerType[KIRC] geneList.txt > newGeneList.txt"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. The context manager closes the file even if
    # reading fails, and stripping each line (instead of chopping the last
    # character) keeps the final gene intact when the file has no trailing
    # newline. Blank lines are skipped rather than ending the read early.
    with open(args[2]) as gene_file:
        stripped_lines = [line.strip() for line in gene_file]
    genes = [gene for gene in stripped_lines if gene]

    # If in Xena, print the gene. If not in Xena, determine whether the gene
    # has an alias that is in Xena.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        if entry['position'] == [] or entry['scores'][0][0] == 'NaN':
            alias = findAliasInXena(hub, dataset, samples, entry['gene'])
            if alias != '':
                # A single parenthesised argument prints identically on
                # Python 2 and Python 3.
                print(alias)
        else:
            print(entry['gene'])
def check_allsamples(
    xena_hub: str,
    dataset: str,
    *,
    n: Union[None, int] = None,
) -> List:
    """
    List the samples available in a dataset.

    Parameters
    ----------
    xena_hub
        Url of the data repository hub.
    dataset
        The dataset to inspect.
    n
        Upper limit on the number of samples returned; pass None to get
        all of them.

    Returns
    -------
    The sample names, exactly as returned by ``xena.dataset_samples``.
    """
    sample_names = xena.dataset_samples(xena_hub, dataset, n)
    return sample_names
def get_fields_and_codes(host, dataset, samples, fields):
    """Get fields and resolve NA in the value.

    Calls ``get_fields(host, dataset, samples, fields)`` and passes its
    result, together with ``host``, ``dataset`` and ``fields``, to
    ``get_codes``; whatever ``get_codes`` returns is returned unchanged.
    Both helpers are defined outside this block.
    """
    return get_codes(host, dataset, fields, get_fields(
        host, dataset, samples, fields))

# dictionary with all hub links
# (bare expression: has no effect when run as a script; presumably kept
# from an interactive session where the value is displayed)
xena.PUBLIC_HUBS

# pancanAtlas cohort
cohort = 'TCGA PanCanAtlas'
host = xena.PUBLIC_HUBS['pancanAtlasHub']

# get expression for GENES
expression_dataset = 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'
samples = xena.dataset_samples(host, expression_dataset, None)
# bare expression: first ten sample IDs (no effect outside an interactive session)
samples[0: 10]
expression = get_fields_and_codes(host, expression_dataset, samples, GENES) # list of lists.
expression_by_gene = dict(zip(GENES, expression)) # index by gene.
# bare expression: quick look at the keys, the first gene and its first ten values
[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]
# note that missing data is returned as 'NaN'. One might want to remap this to None or NaN, depending on the later analysis tools.

# get disease type and survival columns
survival_dataset = 'Survival_SupplementalTable_S1_20171025_xena_sp'
fields = ['cancer type abbreviation', 'OS', 'OS.time']
values = get_fields_and_codes(host, survival_dataset,
def get_TCGA_surv(self):
    """Return the survival table of ``self.surv_dataset`` as a DataFrame.

    Fetches the '_EVENT' and '_TIME_TO_EVENT' fields for every sample of
    the dataset on ``self.host``. The returned frame has one row per sample
    and the two columns 'Event' and 'Time_to_event'.
    """
    sample_ids = xena.dataset_samples(self.host, self.surv_dataset, None)
    survival_fields = ['_EVENT', '_TIME_TO_EVENT']
    fetched = xena.dataset_fetch(
        self.host, self.surv_dataset, sample_ids, survival_fields
    )

    # Built field-by-sample first, then flipped so that samples are rows.
    by_field = pd.DataFrame(
        data=fetched,
        index=['Event', 'Time_to_event'],
        columns=sample_ids,
    )
    return by_field.transpose()