Esempio n. 1
0
def main(args):
    """Print every gene from a gene-list file whose Xena lookup has no position.

    The gene list is queried against the TCGA KIRC PanCan-normalized
    expression dataset with ``xena.dataset_gene_probe_avg``; each gene whose
    returned ``'position'`` entry is an empty list is written to stdout
    (presumably these are the genes the hub does not know -- confirm against
    the Xena API documentation).

    Parameters
    ----------
    args
        Command-line argument vector: ``args[0]`` is the script name and
        ``args[1]`` is the path to a text file holding one gene name per line.

    Raises
    ------
    SystemExit
        With a usage message when ``args`` does not hold exactly two items.
    """
    if len(args) != 2:
        sys.exit("USAGE: python checkGeneList.py geneList.txt  > outFile")

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA.KIRC.sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. The context manager closes the file even on error.
    # Only the line terminator is stripped, so a final line without a trailing
    # newline keeps its last character; blank lines are skipped instead of
    # silently ending the list.
    with open(args[1]) as gene_file:
        stripped = (raw.rstrip("\r\n") for raw in gene_file)
        genes = [gene for gene in stripped if gene]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        if entry['position'] == []:
            print(entry['gene'])
Esempio n. 2
0
def main(args):
    """Print a tab-separated expression table for a list of genes.

    The genes are queried against the PanCan-normalized HiSeqV2 dataset of
    the requested TCGA cancer type with ``xena.dataset_gene_probe_avg``.
    Output goes to stdout: a header row (``Gene`` followed by the sample
    IDs), then one row per returned gene with the values of its first
    ``'scores'`` entry.

    Parameters
    ----------
    args
        Command-line argument vector: ``args[0]`` is the script name,
        ``args[1]`` the TCGA cancer type abbreviation (e.g. ``KIRC``) and
        ``args[2]`` the path to a text file holding one gene name per line.

    Raises
    ------
    SystemExit
        With a usage message when ``args`` does not hold exactly three items.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python getExpressionFromXena.py cancerType[KIRC] geneList.txt  > outFile"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. The context manager closes the file even on error.
    # Only the line terminator is stripped, so a final line without a trailing
    # newline keeps its last character; blank lines are skipped instead of
    # silently ending the list.
    with open(args[2]) as gene_file:
        stripped = (raw.rstrip("\r\n") for raw in gene_file)
        genes = [gene for gene in stripped if gene]

    # Header row. Single-argument print(...) works on Python 2 and Python 3.
    print('Gene\t' + '\t'.join(samples))

    # One row per gene returned by the hub.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        scores = entry['scores'][0]
        print(entry['gene'] + '\t' + '\t'.join(str(x) for x in scores))
Esempio n. 3
0
def download_data():
    """Fetch TCGA-BRCA expression and survival data, then merge, filter and record it.

    Sample IDs are taken from the ``TCGA-BRCA.htseq_fpkm-uq.tsv`` dataset on
    the GDC Xena hub. Expression and survival tables are built by the
    module's helper functions, merged, passed through ``filter_outliers`` and
    finally handed to ``record_train_test``. Nothing is returned.
    """
    xena_hub = "https://gdc.xenahubs.net"
    expression_dataset = "TCGA-BRCA.htseq_fpkm-uq.tsv"
    sample_ids = xena.dataset_samples(xena_hub, expression_dataset, None)

    # Transcriptomic data
    expression = get_expression_data(xena_hub, expression_dataset, sample_ids)

    # Survival data
    # NOTE(review): the expression dataset name is what gets passed to the
    # survival helper -- confirm that helper derives the right dataset itself.
    survival = get_survival_data(xena_hub, expression_dataset, sample_ids)

    # Phenotype data is not fetched for now; it could be used later to add
    # covariates (via get_phenotype_data(hub, samples)).

    merged = merge_data(expression, survival)
    record_train_test(filter_outliers(merged))
Esempio n. 4
0
def download_gdc_clinicals(
    xena_hub: str,
    dataset: str,
    *,
    rowKey: str = "SampleID",
    headers: List = [
        "sample_type.samples",
        "days_to_death.demographic",
        "days_to_last_follow_up.diagnoses",
        "vital_status.demographic",
    ],
) -> pd.DataFrame:
    """
    Retrieve selected clinical columns for every sample of a dataset.

    All sample IDs of the dataset are requested first, then the values of the
    requested columns are fetched with ``xena.dataset_probe_values`` and
    assembled into a table indexed by sample ID.

    Note: the values are stored exactly as the hub returns them; this function
    does not itself translate factor codes into category names.

    Parameters
    ----------
    xena_hub
        Url of the data repository hub.
    dataset
        The dataset containing survival information on the repository hub.
    rowKey
        Name given to the identifier column (filled with the sample IDs),
        which becomes the index of the returned dataframe.
    headers
        A list of column names that we want to retrieve.

    Returns
    -------
    Dataframe indexed by ``rowKey`` with one column per retrieved header.
    """

    samples = xena.dataset_samples(xena_hub, dataset, None)
    # The first element of the returned pair is not needed here.
    _, value_rows = xena.dataset_probe_values(xena_hub, dataset, samples, headers)

    columns = {rowKey: samples}
    for idx, row in enumerate(value_rows):
        columns[headers[idx]] = row

    return pd.DataFrame(columns).set_index(rowKey)
def main(args):
    """Print a gene list with unknown genes replaced by their Xena alias.

    Each gene of the list is queried against the PanCan-normalized HiSeqV2
    dataset of the requested TCGA cancer type. A gene whose result has a
    non-empty ``'position'`` and whose first score is not ``'NaN'`` is printed
    unchanged. Otherwise ``findAliasInXena`` is consulted and the alias is
    printed when it is not the empty string; genes without an alias are
    dropped from the output.

    Parameters
    ----------
    args
        Command-line argument vector: ``args[0]`` is the script name,
        ``args[1]`` the TCGA cancer type abbreviation (e.g. ``KIRC``) and
        ``args[2]`` the path to a text file holding one gene name per line.

    Raises
    ------
    SystemExit
        With a usage message when ``args`` does not hold exactly three items.
    """
    if len(args) != 3:
        sys.exit(
            "USAGE: python replaceGeneListWithXenaAliasNames.py cancerType[KIRC] geneList.txt > newGeneList.txt"
        )

    # TCGA hub
    hub = "https://tcga.xenahubs.net"

    # PanCan normalized dataset
    dataset = "TCGA." + str(args[1]) + ".sampleMap/HiSeqV2_PANCAN"

    # get sample IDs
    samples = xena.dataset_samples(hub, dataset, None)

    # Read the gene list. The context manager closes the file even on error.
    # Only the line terminator is stripped, so a final line without a trailing
    # newline keeps its last character; blank lines are skipped instead of
    # silently ending the list.
    with open(args[2]) as gene_file:
        stripped = (raw.rstrip("\r\n") for raw in gene_file)
        genes = [gene for gene in stripped if gene]

    # If in Xena, print the gene. If not, check whether an alias is in Xena.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    results = xena.dataset_gene_probe_avg(hub, dataset, samples, genes)
    for entry in results:
        if entry['position'] == [] or entry['scores'][0][0] == 'NaN':
            # determine if gene has an alias in Xena
            alias = findAliasInXena(hub, dataset, samples, entry['gene'])
            if alias != '':
                print(alias)
        else:
            print(entry['gene'])
Esempio n. 6
0
def check_allsamples(
    xena_hub: str,
    dataset: str,
    *,
    n: Union[None, int] = None,
) -> List:
    """
    List the samples that a dataset on a Xena hub provides.

    This is a thin wrapper that forwards its arguments unchanged to
    ``xena.dataset_samples``.

    Parameters
    ----------
    xena_hub
        Url of the data repository hub.
    dataset
        The dataset whose samples should be listed.
    n
        Upper bound on the number of samples returned; ``None`` requests all
        of them.

    Returns
    -------
    The sample names reported by the hub (at most ``n`` when ``n`` is given).
    """

    sample_names = xena.dataset_samples(xena_hub, dataset, n)
    return sample_names
Esempio n. 7
0

def get_fields_and_codes(host, dataset, samples, fields):
    """Fetch ``fields`` for ``samples`` and post-process them with ``get_codes``.

    The raw values come from ``get_fields``; they are then passed, together
    with the same host, dataset and field names, to ``get_codes``, whose
    result is returned. Per the original author's note this step resolves
    NA entries in the values (see ``get_codes`` for the exact behaviour).
    """
    raw_values = get_fields(host, dataset, samples, fields)
    return get_codes(host, dataset, fields, raw_values)

# Dictionary with all hub links.
# NOTE: the bare expression below has no effect when run as a script; it
# presumably displayed the mapping in an interactive session.
xena.PUBLIC_HUBS  
# pancanAtlas cohort and the hub that serves it
cohort = 'TCGA PanCanAtlas'
host = xena.PUBLIC_HUBS['pancanAtlasHub']

    
# Get expression for GENES (GENES is expected to be defined earlier in the file).
expression_dataset = 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'
# Passing None as the limit requests every sample of the dataset.
samples = xena.dataset_samples(host, expression_dataset, None)
# Bare expression: peek at the first ten sample IDs (no effect in a script).
samples[0: 10]
expression = get_fields_and_codes(host, 
                                  expression_dataset, 
                                  samples, 
                                  GENES) # list of lists.
# Pair each gene name with its list of values, in the order of GENES.
expression_by_gene = dict(zip(GENES, expression))      # index by gene.
# Bare expression: preview of the keys, the first gene name and its first ten
# values (no effect in a script).
[expression_by_gene.keys(), GENES[0], expression_by_gene[GENES[0]][0:10]]
# Note that missing data is returned as 'NaN'. One might want to remap this to
# None or NaN, depending on the later analysis tools.


# Get disease type and survival columns: dataset name and the fields to fetch.
survival_dataset = 'Survival_SupplementalTable_S1_20171025_xena_sp'
fields = ['cancer type abbreviation', 'OS', 'OS.time']
values = get_fields_and_codes(host, 
                              survival_dataset, 
Esempio n. 8
0
	def get_TCGA_surv(self):
		"""Return the survival table of ``self.surv_dataset`` with one row per sample.

		All samples of the dataset on ``self.host`` are requested, then the
		``_EVENT`` and ``_TIME_TO_EVENT`` fields are fetched for them. The
		result is a DataFrame indexed by sample with the columns ``Event`` and
		``Time_to_event``.
		"""
		sample_ids = xena.dataset_samples(self.host, self.surv_dataset, None)
		wanted_fields = ['_EVENT', '_TIME_TO_EVENT']
		raw_values = xena.dataset_fetch(self.host, self.surv_dataset, sample_ids, wanted_fields)

		# The frame is built with one row per field and one column per sample,
		# then transposed so that samples become the rows.
		by_field = pd.DataFrame(
			data=raw_values,
			index=['Event', 'Time_to_event'],
			columns=sample_ids,
		)
		return by_field.T