Esempio n. 1
0
def _ttest_fetch_values(hub, dataset, samples, id_batch, mode):
    """Fetch the per-id value lists for one batch, dispatching on ``mode``.

    Raises:
        ValueError: if ``mode`` is neither "probe" nor "gene".
    """
    if mode == "probe":
        return xenaAPI.Probes_values(hub, dataset, samples, id_batch)
    if mode == "gene":
        # NOTE(review): entries of the Genes_values result are consumed
        # directly as numeric sequences by the caller -- confirm that this
        # matches the shape the API actually returns.
        return xenaAPI.Genes_values(hub, dataset, samples, id_batch)
    raise ValueError("mode must be 'probe' or 'gene', got %r" % (mode,))


def _ttest_finite_floats(raw_values):
    """Convert every value to float and drop the NaN entries."""
    return [x for x in (float(v) for v in raw_values) if not math.isnan(x)]


def ttest_by_ids(hub1, hub2, dataset1, dataset2, ids, samples1, samples2, mode1, mode2, output):
    """Run a per-id t-test between two sample groups and write a TSV report.

    For every id the values of ``samples1`` (from ``hub1``/``dataset1``) and
    ``samples2`` (from ``hub2``/``dataset2``) are fetched, NaNs are dropped and
    ``scipy.stats.ttest_ind(..., equal_var=False)`` is applied.

    Args:
        hub1, hub2: hubs passed through to ``xenaAPI``.
        dataset1, dataset2: dataset names passed through to ``xenaAPI``.
        ids: sliceable sequence of ids (strings); queried 100 at a time.
        samples1, samples2: sample lists of the two groups.
        mode1, mode2: "probe" or "gene"; selects the ``xenaAPI`` query used.
        output: path of the tab-separated file to write. Columns are
            probe, t_stat, test_p, mean1, mean2.

    Raises:
        ValueError: if a mode is not "probe" or "gene" (only detected once
            there is at least one id to query).
    """
    batch_size = 100  # number of ids requested per query

    # The context manager closes the report even if a query or conversion fails.
    with open(output, 'w') as fout:
        fout.write('\t'.join(["probe", "t_stat", "test_p", "mean1", "mean2"]) + '\n')

        for start in range(0, len(ids), batch_size):
            batch = ids[start:start + batch_size]
            values1 = _ttest_fetch_values(hub1, dataset1, samples1, batch, mode1)
            values2 = _ttest_fetch_values(hub2, dataset2, samples2, batch, mode2)

            for j, probe in enumerate(batch):
                v1 = _ttest_finite_floats(values1[j])
                mean1 = numpy.average(v1)

                v2 = _ttest_finite_floats(values2[j])
                mean2 = numpy.average(v2)

                try:
                    tStat, p = scipy.stats.ttest_ind(v1, v2, equal_var=False)
                except Exception:
                    # Best effort: report the id and carry on with the next one.
                    # (The original handler referenced an undefined name here,
                    # which turned every failure into a NameError.)
                    print("%s bad" % probe)
                    continue

                fout.write('\t'.join([probe, str(tStat), str(p), str(mean1), str(mean2)]) + '\n')
def process(obj, IDs, outfile, action):
    """Apply ``action`` to the values of every id and emit the results.

    Ids are queried in batches. For each id the fetched values are passed to
    ``action(values, obj["unit"])`` and the result is handed to
    ``output(id, result, fout, header)``, where ``header`` is True only for
    the very first id.

    Args:
        obj: dict with the keys "hub", "dataset", "samples", "mode"
            ("gene" or "probe") and "unit".
        IDs: sliceable sequence of ids to process.
        outfile: path of the file opened for writing and passed to ``output``.
        action: callable taking ``(values, unit)``.

    Raises:
        ValueError: if ``obj['mode']`` is neither "gene" nor "probe" (only
            detected once there is at least one id to query).
    """
    # Aim for roughly 100000 values per query, clamped to 20..500 ids.
    # max(..., 1) avoids a ZeroDivisionError when there are no samples.
    n = 100000 // max(len(obj["samples"]), 1)
    n = min(max(n, 20), 500)
    print(n)

    # The context manager closes the file even if a query or ``action`` fails.
    with open(outfile, 'w') as fout:
        header = True
        for k in range(0, len(IDs), n):
            ids = IDs[k:k + n]
            mode = obj['mode']
            if mode == "gene":
                values_list = xenaAPI.Genes_values(obj['hub'], obj['dataset'],
                                                   obj['samples'], ids)
            elif mode == "probe":
                values_list = xenaAPI.Probes_values(obj['hub'], obj['dataset'],
                                                    obj['samples'], ids)
            else:
                raise ValueError("obj['mode'] must be 'gene' or 'probe', got %r" % (mode,))

            for i, identifier in enumerate(ids):
                if mode == "gene":
                    # Gene results are read from the first entry of "scores".
                    values = values_list[i]["scores"][0]
                else:
                    values = values_list[i]

                ret = action(values, obj["unit"])
                output(identifier, ret, fout, header)
                header = False
def keepProbes(obj, outputFile, keep_dic=None):
    """Dump the values of selected ids for ``obj['samples']`` as a TSV matrix.

    The first line is ``id`` followed by the sample names; every following
    line is an id followed by its values.

    Args:
        obj: dict with the keys "hub", "dataset", "samples" and "mode"
            ("gene" or "probe").
        outputFile: path of the tab-separated file to write.
        keep_dic: optional dict whose keys are the ids to export. When it is
            None or empty, the ids come from ``xenaAPI.dataset_fields``.

    Raises:
        ValueError: if ``obj['mode']`` is neither "gene" nor "probe" (only
            detected once there is at least one id to query).
    """
    # Aim for roughly 100000 values per query, clamped to 100..500 ids.
    # max(..., 1) avoids a ZeroDivisionError when there are no samples.
    n = 100000 // max(len(obj["samples"]), 1)
    n = min(max(n, 100), 500)
    print(n)

    # The context manager closes the file even if a query fails.
    with open(outputFile, 'w') as fout:
        fout.write("id\t" + '\t'.join(obj['samples']) + '\n')

        # NOTE(review): an empty keep_dic is treated like None (all dataset
        # fields are exported); kept as-is for backward compatibility.
        if keep_dic:
            # list(...) so the ids can be sliced on Python 3 as well.
            IDs = list(keep_dic.keys())
        else:
            IDs = xenaAPI.dataset_fields(obj['hub'], obj['dataset'])

        for k in range(0, len(IDs), n):
            ids = IDs[k:k + n]
            mode = obj['mode']
            if mode == "gene":
                values_list = xenaAPI.Genes_values(obj['hub'], obj['dataset'], obj['samples'], ids)
            elif mode == "probe":
                values_list = xenaAPI.Probes_values(obj['hub'], obj['dataset'], obj['samples'], ids)
            else:
                raise ValueError("obj['mode'] must be 'gene' or 'probe', got %r" % (mode,))

            for i, identifier in enumerate(ids):
                if mode == "gene":
                    # Gene results are read from the first entry of "scores".
                    values = values_list[i]["scores"][0]
                else:
                    values = values_list[i]
                fout.write(identifier + '\t' + '\t'.join(str(x) for x in values) + '\n')
Esempio n. 4
0
def chi2_contingency_by_probes(hub, dataset, probes, samples1, samples2, output, kp=None):
    """Run a G-test of independence per probe between two sample groups.

    For each probe the non-'NaN' values of both groups are tabulated by
    distinct code and passed to ``scipy.stats.chi2_contingency`` with
    ``lambda_="log-likelihood"``. Probes with a single distinct code are
    skipped; probes for which the test fails are reported on stdout.

    Args:
        hub, dataset: passed through to ``xenaAPI.Probes_values``.
        probes: sliceable sequence of probe ids (strings); queried 100 at a time.
        samples1, samples2: sample lists of the two groups.
        output: path of the tab-separated file to write. Columns are
            probe, chi2_stat, Gtest_p, Mutual_information, plus observed and
            expected when ``kp`` is given.
        kp: optional ``(group_index, code_index)`` pair selecting the cell
            whose observed and expected counts are appended to each row.
    """
    columns = ["probe", "chi2_stat", "Gtest_p", "Mutual_information"]
    if kp:
        columns.extend(["observed", "expected"])

    batch_size = 100  # number of probes requested per query

    # The context manager closes the report even if a query fails.
    with open(output, 'w') as fout:
        fout.write('\t'.join(columns) + '\n')

        for start in range(0, len(probes), batch_size):
            batch = probes[start:start + batch_size]
            values1 = xenaAPI.Probes_values(hub, dataset, samples1, batch)
            values2 = xenaAPI.Probes_values(hub, dataset, samples2, batch)

            for j, probe in enumerate(batch):
                v1 = [x for x in values1[j] if x != 'NaN']
                v2 = [x for x in values2[j] if x != 'NaN']

                codes = numpy.unique(v1 + v2)
                if len(codes) == 1:
                    # A single category: there is nothing to test.
                    continue

                # Real lists (not map objects) so they also work on Python 3.
                counts1 = [len([x for x in v1 if x == code]) for code in codes]
                counts2 = [len([x for x in v2 if x == code]) for code in codes]
                # Wrapped in an extra dimension; the kp lookup below indexes
                # [0] first to undo it.
                observed = [[counts1, counts2]]
                try:
                    chi2, p, dof, expected = scipy.stats.chi2_contingency(
                        observed, lambda_="log-likelihood")
                    if len(codes) == 2:
                        # Only reported for a 2 by 2 table. G = 2 * N * MI with
                        # N the total number of observations, hence
                        # MI = G / (2 * N). The original divided by
                        # 2*len(v1) + len(v2) because of missing parentheses.
                        mutual_information = chi2 / (2.0 * (len(v1) + len(v2)))
                        row = [probe, str(chi2), str(p), str(mutual_information)]
                    else:
                        row = [probe, str(chi2), str(p), '']
                    if kp:
                        row.append(str(observed[0][kp[0]][kp[1]]))
                        row.append(str(expected[0][kp[0]][kp[1]]))
                except Exception:
                    # Best effort: report the probe and carry on.
                    print("%s bad" % probe)
                else:
                    fout.write('\t'.join(row) + '\n')
Esempio n. 5
0
def process(hub, dataset, samples, mode, genes, outputMatrix_T):
    """Write a sample-by-probe (transposed) TSV matrix for a dataset.

    The first line is ``sample`` followed by the probe/gene names; every
    following line is a sample followed by its value for each probe. Data is
    fetched in blocks of 100 samples by 500 probes.

    Args:
        hub, dataset: passed through to ``xenaAPI``.
        samples: sliceable sequence of sample names (strings).
        mode: "probe" uses ``xenaAPI.Probes_values``; anything else uses
            ``xenaAPI.Genes_values``.
        genes: ids to export; when empty (or None) all fields returned by
            ``xenaAPI.dataset_fields`` except "sampleID" are exported.
        outputMatrix_T: path of the tab-separated file to write.
    """
    gN = 500  # probes per query
    sN = 100  # samples per query

    if not genes:
        probes = xenaAPI.dataset_fields(hub, dataset)
        # Only drop the sample-id pseudo field when it is actually present;
        # list.remove raises ValueError otherwise.
        if "sampleID" in probes:
            probes.remove("sampleID")
    else:
        probes = genes

    # The context manager closes the file even if a query fails.
    with open(outputMatrix_T, 'w') as fout_T:
        fout_T.write('sample\t' + '\t'.join(probes) + '\n')

        for k in range(0, len(samples), sN):
            sList = samples[k:k + sN]
            sample_values = []  # one list per probe, each with one value per sample
            for i in range(0, len(probes), gN):
                pList = probes[i:i + gN]
                if mode == "probe":
                    values = xenaAPI.Probes_values(hub, dataset, sList, pList)
                else:
                    # NOTE(review): entries of the Genes_values result are
                    # used directly as per-sample sequences -- confirm that
                    # matches the shape the API returns.
                    values = xenaAPI.Genes_values(hub, dataset, sList, pList)
                for m in range(len(values)):
                    if len(values[m]) == 0:
                        # No data for this probe: pad with blanks so the
                        # transpose below keeps one cell per sample.
                        values[m] = [''] * len(sList)
                sample_values.extend(values)
                print(i)

            # Transpose probe-major data into one row per sample. list(...)
            # keeps the rows indexable on Python 3.
            rows = list(zip(*sample_values))
            for j, sample in enumerate(sList):
                # With no probes at all there is nothing to transpose; emit
                # the sample name with an empty value list instead of failing.
                row = rows[j] if sample_values else ()
                fout_T.write(sample + '\t' + '\t'.join(str(x) for x in row) + '\n')
def _itomic_write_data_header(foutdata, samples):
    """Write the two header rows of the pure data file.

    Row one repeats the sample names three times; row two labels the three
    column groups as "RANK %", "Log2TPM" and "TPM".
    """
    foutdata.write("gene")
    for _ in range(3):
        foutdata.write('\t' + '\t'.join(samples))
    foutdata.write('\n')

    foutdata.write("gene")
    for column_label in ("RANK %", "Log2TPM", "TPM"):
        foutdata.write('\t' + '\t'.join([column_label] * len(samples)))
    foutdata.write('\n')


def _itomic_fill_adjusted_p(tmpfile, outputfile, pCorDic):
    """Copy the temporary report to ``outputfile``, filling in adjusted p values.

    The first two lines are copied verbatim. Data rows are copied up to (not
    including) the first blank line, with column 3 replaced by the adjusted p
    value of the gene named in column 1.
    """
    with open(tmpfile, 'r') as fin, open(outputfile, 'w') as fout:
        fout.write(fin.readline())
        fout.write(fin.readline())
        while True:
            line = fin.readline()
            # NOTE(review): copying stops at the first blank line, so the
            # "Rank %" footer written to the temporary file never reaches
            # the final report. Kept as-is; confirm whether that is intended.
            if line == '' or line == '\n':
                break
            data = line.split('\t')
            # Rows are [label, gene, p, adjusted p, ...] and the p values are
            # keyed by the mapped gene name, i.e. column 1. The original
            # looked up column 0 (the label) and therefore missed every row
            # whose label had been mapped to a different gene name.
            if len(data) > 3 and data[1] in pCorDic:
                data[3] = str(pCorDic[data[1]])
            fout.write('\t'.join(data))


def itomic_Nof1(Nof1_item, original_labels, geneMappping, comparison_item, outputfile):
    """Compare the samples of interest against a comparison cohort, gene by gene.

    Two files are produced:

    * ``outputfile``: the header written by ``file_header`` followed by one
      row per gene: label, gene, t-test p value, FDR-adjusted p value, t
      statistic, the two variances, the two means, mean rank %, rank % SD and
      the rank % of every sample in ``Nof1_item["samples"]``.
    * ``outputfile + "_data"``: per gene the rank %, Log2TPM and TPM of every
      sample in ``Nof1_item["samples"]``.

    Genes without comparison data, or for which the t-test fails, are skipped.

    Args:
        Nof1_item: dict with the keys "hub", "dataset", "samples" and
            "log2Theta".
        original_labels: sliceable sequence of gene labels to report.
        geneMappping: mapping from label to the gene name used for queries;
            unmapped labels are used unchanged.
        comparison_item: dict with the keys "hub", "dataset", "samples" and
            "mode" ("gene" or "probe"), and optionally "file". When "file" is
            present its matrix (via ``getMatrixData``) is the comparison data.
        outputfile: path of the main report.

    Raises:
        ValueError: if no comparison file is used and the mode is neither
            "gene" nor "probe".
    """
    itomic_samples = xenaAPI.dataset_samples(Nof1_item["hub"], Nof1_item["dataset"])

    # Rows are first written to a temporary file because the adjusted p
    # values can only be computed once every gene has been tested.
    tmpfile = str(uuid.uuid4())
    pDic = {}  # gene -> raw p value, input of the multiple-testing adjustment

    try:
        with open(tmpfile, 'w') as fout, open(outputfile + "_data", 'w') as foutdata:
            # full file header output
            file_header(comparison_item, Nof1_item, fout)

            # data file header
            _itomic_write_data_header(foutdata, Nof1_item["samples"])

            # comparison data
            if "file" in comparison_item:
                cData = getMatrixData(comparison_item["file"])
            else:
                cData = None

            hub = comparison_item["hub"]
            dataset = comparison_item["dataset"]
            samples = comparison_item["samples"]
            mode = comparison_item["mode"]

            if cData:
                n = 2000
            else:
                # Aim for roughly 100000 values per query, at least 100 genes.
                # max(..., 1) avoids a ZeroDivisionError without samples.
                n = max(100000 // max(len(samples), 1), 100)
                print(n)

            for k in range(0, len(original_labels), n):
                labels = original_labels[k:k + n]
                genes = [geneMappping[label] if (label in geneMappping) else label
                         for label in labels]

                print("%s ..." % (genes[:10],))

                # list(...) keeps the result indexable whatever iterable the
                # helper returns.
                all_data_list = list(get_itomic_Data(
                    genes, Nof1_item["hub"], Nof1_item["dataset"], itomic_samples))

                if cData:
                    compare_data_list = None
                elif mode == "gene":
                    compare_data_list = xenaAPI.Genes_values(hub, dataset, samples, genes)
                elif mode == "probe":
                    compare_data_list = xenaAPI.Probes_values(hub, dataset, samples, genes)
                else:
                    raise ValueError(
                        "comparison_item['mode'] must be 'gene' or 'probe', got %r" % (mode,))

                for m, gene in enumerate(genes):
                    label = labels[m]
                    outputList = [label, gene]
                    data_outputList = [gene]
                    all_Data = all_data_list[m]  # sample -> value for this gene

                    # Cohort data. Starting from an empty list guarantees a
                    # gene never inherits the previous gene's values.
                    values = []
                    if cData:
                        if gene in cData:
                            values = cData[gene]
                    if compare_data_list:
                        # NOTE(review): ['scores'][0] is applied for both
                        # "gene" and "probe" mode -- confirm that the probe
                        # query really returns the same structure.
                        values = compare_data_list[m]['scores'][0]

                    h_l_values = clean(values)

                    if len(h_l_values) == 0:  # no comparison data
                        continue

                    # t-test p value
                    try:
                        # A real list: scipy/numpy cannot consume a dict view.
                        allsample_values = list(all_Data.values())
                        tStat, p = scipy.stats.ttest_ind(
                            allsample_values, h_l_values, equal_var=False)
                        mean1 = numpy.mean(allsample_values)
                        mean2 = numpy.mean(h_l_values)
                        var1 = numpy.var(allsample_values)
                        var2 = numpy.var(h_l_values)
                    except Exception:
                        # Best effort: skip genes whose statistics fail.
                        print("bad")
                        continue

                    outputList.extend([
                        str(p),      # t-test p value
                        '',          # adjusted p value (filled in later)
                        str(tStat),  # t-test t
                        str(var1),
                        str(var2),
                        str(mean1),
                        str(mean2),
                    ])
                    pDic[gene] = p

                    # rank statistics, SD
                    all_r_and_p_values = [rank_and_percentage(x, h_l_values)
                                          for x in allsample_values]
                    r_list = [r_and_p[1] for r_and_p in all_r_and_p_values]
                    outputList.append(str(numpy.mean(r_list)))
                    outputList.append('{:.2f}'.format(numpy.std(r_list)))  # rank SD

                    # per sample data output
                    itomic_values = [all_Data[sample] for sample in Nof1_item["samples"]]
                    r_and_p_values = [rank_and_percentage(x, h_l_values)
                                      for x in itomic_values]
                    rank_percent = ['{:.2f}%'.format(r_and_p[1]) for r_and_p in r_and_p_values]
                    outputList.extend(rank_percent)  # rank %

                    data_outputList.extend(rank_percent)  # rank %
                    data_outputList.extend(
                        '{:.2f}'.format(x) for x in itomic_values)  # Log2TPM
                    data_outputList.extend(
                        '{:.2f}'.format(revert_Log2_theta(x, Nof1_item["log2Theta"]))
                        for x in itomic_values)  # TPM

                    fout.write('\t'.join(outputList) + '\n')
                    foutdata.write('\t'.join(data_outputList) + '\n')

            fout.write("\n")
            fout.write("Rank % : percentile of samples with lower expression than sample of interest.\n")
            fout.write("Higher Rank %  means higher expression.\n")

        # add multiple hypo adjusted p values to file
        # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.fdrcorrection0.html
        pCorDic = {}
        genes = list(pDic.keys())
        if genes:  # nothing to adjust when no gene produced a p value
            rejected, pvalue_corrected = statsmodels.sandbox.stats.multicomp.fdrcorrection0(
                [pDic[gene] for gene in genes],
                alpha=0.05, method='indep', is_sorted=False)
            for gene, corrected in zip(genes, pvalue_corrected):
                pCorDic[gene] = corrected

        _itomic_fill_adjusted_p(tmpfile, outputfile, pCorDic)
    finally:
        # Remove the temporary file on success and on failure, without
        # going through a shell.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def get_itomic_Data(genes, hub, dataset, samples):
    """Fetch the values of ``genes`` and key them by sample.

    Values are queried with ``xenaAPI.Probes_values``.

    Returns:
        A list with one dict per entry of the query result (in result
        order), each mapping sample -> value. A real list is returned so
        callers can index it on Python 3 as well.
    """
    values_list = xenaAPI.Probes_values(hub, dataset, samples, genes)
    return [dict(zip(samples, values)) for values in values_list]