def ttest_by_ids(hub1, hub2, dataset1, dataset2, ids, samples1, samples2, mode1, mode2, output):
    """Compare two sample groups id-by-id with a t-test and write a TSV report.

    For each entry of ``ids`` the values of ``samples1`` (``hub1``/``dataset1``)
    and ``samples2`` (``hub2``/``dataset2``) are fetched through ``xenaAPI``,
    converted to float, stripped of NaN, and compared with
    ``scipy.stats.ttest_ind(..., equal_var=False)``.

    Args:
        hub1, hub2: hub arguments forwarded to ``xenaAPI`` for group 1 / group 2.
        dataset1, dataset2: dataset arguments forwarded to ``xenaAPI``.
        ids: sequence of identifiers; it is queried in slices of 100.
        samples1, samples2: sample lists for group 1 / group 2.
        mode1, mode2: ``"probe"`` (uses ``xenaAPI.Probes_values``) or
            ``"gene"`` (uses ``xenaAPI.Genes_values``) for the respective group.
        output: path of the tab-separated file to write. Columns are
            ``probe, t_stat, test_p, mean1, mean2``.

    Raises:
        ValueError: if a mode is neither ``"probe"`` nor ``"gene"``; this is
            only checked when there is at least one id to fetch.
    """
    batch_size = 100

    def fetch(mode, hub, dataset, samples, batch):
        # NOTE(review): gene-mode results are consumed below as plain numeric
        # rows, while other functions in this file unwrap gene results with
        # ['scores'][0] -- confirm which shape Genes_values returns.
        if mode == "probe":
            return xenaAPI.Probes_values(hub, dataset, samples, batch)
        if mode == "gene":
            return xenaAPI.Genes_values(hub, dataset, samples, batch)
        raise ValueError("mode must be 'probe' or 'gene', got %r" % (mode,))

    def finite_floats(raw):
        numbers = [float(x) for x in raw]
        return [x for x in numbers if not math.isnan(x)]

    # The context manager closes the report even when a fetch or conversion fails.
    with open(output, 'w') as fout:
        fout.write('\t'.join(["probe", "t_stat", "test_p", "mean1", "mean2"]) + '\n')
        for start in range(0, len(ids), batch_size):
            pList = ids[start:start + batch_size]
            values1 = fetch(mode1, hub1, dataset1, samples1, pList)
            values2 = fetch(mode2, hub2, dataset2, samples2, pList)
            for j, probe in enumerate(pList):
                v1 = finite_floats(values1[j])
                mean1 = numpy.average(v1)
                v2 = finite_floats(values2[j])
                mean2 = numpy.average(v2)
                try:
                    tStat, p = scipy.stats.ttest_ind(v1, v2, equal_var=False)
                    fout.write('\t'.join([probe, str(tStat), str(p), str(mean1), str(mean2)]) + '\n')
                except Exception:
                    # Best effort per row: report the id and keep going. The
                    # original handler referenced an undefined name here and
                    # raised NameError instead of printing.
                    print("%s bad" % (probe,))
def process(obj, IDs, outfile, action):
    """Apply ``action`` to the values of every id and emit the results to ``outfile``.

    The ids are queried in batches through ``xenaAPI``; for each id the value
    row is passed to ``action(values, obj["unit"])`` and the result is handed
    to ``output(id, result, fout, header)``, where ``header`` is True only for
    the first emitted id.

    Args:
        obj: dict read for the keys ``"samples"``, ``"mode"`` (``"gene"`` or
            ``"probe"``), ``"hub"``, ``"dataset"`` and ``"unit"``.
        IDs: sequence of gene/probe identifiers.
        outfile: path of the file to write.
        action: callable taking ``(values, unit)``.

    Raises:
        ValueError: if ``obj["mode"]`` is neither ``"gene"`` nor ``"probe"``
            (only checked when there is at least one id to fetch).

    NOTE(review): a second function named ``process`` with a different
    signature appears later in this file; if both live in the same module the
    later definition shadows this one.
    """
    sample_count = len(obj["samples"])
    # Batch size shrinks as the sample count grows, clamped to [20, 500].
    # An empty sample list used to raise ZeroDivisionError; use the largest batch.
    n = 100000 // sample_count if sample_count else 500
    n = max(20, min(500, n))
    print(n)

    header = True
    # The context manager closes the file even if a fetch or the action fails.
    with open(outfile, 'w') as fout:
        for k in range(0, len(IDs), n):
            ids = IDs[k:k + n]
            mode = obj['mode']
            if mode == "gene":
                values_list = xenaAPI.Genes_values(obj['hub'], obj['dataset'], obj['samples'], ids)
            elif mode == "probe":
                values_list = xenaAPI.Probes_values(obj['hub'], obj['dataset'], obj['samples'], ids)
            else:
                raise ValueError("mode must be 'gene' or 'probe', got %r" % (mode,))

            for i, identifier in enumerate(ids):
                if mode == "gene":
                    # Gene results are unwrapped: first row of the "scores" entry.
                    values = values_list[i]["scores"][0]
                else:
                    values = values_list[i]
                ret = action(values, obj["unit"])
                output(identifier, ret, fout, header)
                header = False
def keepProbes(obj, outputFile, keep_dic=None):
    """Write a tab-separated id-by-sample matrix for selected ids of a dataset.

    The first line is ``id`` followed by ``obj['samples']``; every following
    line is an id and its values, one per sample, as returned by ``xenaAPI``.

    Args:
        obj: dict read for the keys ``"samples"``, ``"mode"`` (``"gene"`` or
            ``"probe"``), ``"hub"`` and ``"dataset"``.
        outputFile: path of the file to write.
        keep_dic: optional dict whose keys are the ids to export. When it is
            None or empty, the ids come from
            ``xenaAPI.dataset_fields(obj['hub'], obj['dataset'])``.

    Raises:
        ValueError: if ``obj['mode']`` is neither ``"gene"`` nor ``"probe"``
            (only checked when there is at least one id to fetch).
    """
    sample_count = len(obj["samples"])
    # Batch size shrinks as the sample count grows, clamped to [100, 500].
    # An empty sample list used to raise ZeroDivisionError; use the largest batch.
    n = 100000 // sample_count if sample_count else 500
    n = max(100, min(500, n))
    print(n)

    # The context manager closes the file even when a query fails midway.
    with open(outputFile, 'w') as fout:
        fout.write("id\t" + '\t'.join(obj['samples']) + '\n')

        if keep_dic:
            # list() keeps the ids sliceable on Python 3 as well.
            IDs = list(keep_dic.keys())
        else:
            IDs = xenaAPI.dataset_fields(obj['hub'], obj['dataset'])

        for k in range(0, len(IDs), n):
            ids = IDs[k:k + n]
            mode = obj['mode']
            if mode == "gene":
                values_list = xenaAPI.Genes_values(obj['hub'], obj['dataset'], obj['samples'], ids)
            elif mode == "probe":
                values_list = xenaAPI.Probes_values(obj['hub'], obj['dataset'], obj['samples'], ids)
            else:
                raise ValueError("mode must be 'gene' or 'probe', got %r" % (mode,))

            for i, identifier in enumerate(ids):
                if mode == "gene":
                    # Gene results are unwrapped: first row of the "scores" entry.
                    values = values_list[i]["scores"][0]
                else:
                    values = values_list[i]
                fout.write(identifier + '\t' + '\t'.join([str(x) for x in values]) + '\n')
def chi2_contingency_by_probes(hub, dataset, probes, samples1, samples2, output, kp=None):
    """Run a G-test of independence per probe between two sample groups.

    For each probe the non-'NaN' values of ``samples1`` and ``samples2`` are
    tallied per distinct code and passed to
    ``scipy.stats.chi2_contingency(..., lambda_="log-likelihood")``. Probes
    with exactly one distinct code are skipped; probes whose test raises are
    reported on stdout as ``<probe> bad`` and skipped.

    Args:
        hub, dataset: forwarded to ``xenaAPI.Probes_values``.
        probes: sequence of probe ids; it is queried in slices of 100.
        samples1, samples2: sample lists of the two groups.
        output: path of the tab-separated file to write. Columns are
            ``probe, chi2_stat, Gtest_p, Mutual_information`` and, when ``kp``
            is given, ``observed`` and ``expected``.
        kp: optional pair ``(group_index, code_index)`` selecting one cell of
            the contingency table whose observed and expected counts are
            appended to each row.
    """
    columns = ["probe", "chi2_stat", "Gtest_p", "Mutual_information"]
    if kp:
        columns.extend(["observed", "expected"])
    batch_size = 100

    # The context manager closes the report even when a query fails midway.
    with open(output, 'w') as fout:
        fout.write('\t'.join(columns) + '\n')
        for start in range(0, len(probes), batch_size):
            pList = probes[start:start + batch_size]
            values1 = xenaAPI.Probes_values(hub, dataset, samples1, pList)
            values2 = xenaAPI.Probes_values(hub, dataset, samples2, pList)
            for j, probe in enumerate(pList):
                v1 = [x for x in values1[j] if x != 'NaN']
                v2 = [x for x in values2[j] if x != 'NaN']
                codes = numpy.unique(v1 + v2)
                if len(codes) == 1:
                    # A single category carries no information to test.
                    continue
                array1 = [sum(1 for x in v1 if x == code) for code in codes]
                array2 = [sum(1 for x in v2 if x == code) for code in codes]
                # Kept as a 1 x 2 x k table so that kp indexes [0][group][code].
                observed = [[array1, array2]]
                try:
                    chi2, p, dof, expected = scipy.stats.chi2_contingency(
                        observed, lambda_="log-likelihood")
                    row = [probe, str(chi2), str(p)]
                    if len(codes) == 2:
                        # Only reported for a 2 by 2 table. G = 2 * N * MI with
                        # N the total number of observations, so divide by
                        # 2 * (n1 + n2); the previous 2*n1 + n2 was a
                        # precedence slip.
                        MI = chi2 / (2.0 * (len(v1) + len(v2)))
                        row.append(str(MI))
                    else:
                        row.append('')
                    if kp:
                        row.append(str(observed[0][kp[0]][kp[1]]))
                        row.append(str(expected[0][kp[0]][kp[1]]))
                    fout.write('\t'.join(row) + '\n')
                except Exception:
                    # Best effort per probe: report it and continue.
                    print("%s bad" % (probe,))
def process(hub, dataset, samples, mode, genes, outputMatrix_T):
    """Write a sample-by-field (transposed) matrix of a dataset as TSV.

    The header line is ``sample`` followed by the field names; each following
    line is one sample and its values. Samples are processed 100 at a time and
    fields are fetched 500 at a time; the start offset of every field batch is
    printed as progress output.

    Args:
        hub, dataset: forwarded to ``xenaAPI``.
        samples: sequence of sample ids, one output row each.
        mode: ``"probe"`` selects ``xenaAPI.Probes_values``; any other value
            selects ``xenaAPI.Genes_values``.
        genes: fields to export. When empty, all fields returned by
            ``xenaAPI.dataset_fields`` are used, minus ``"sampleID"``.
        outputMatrix_T: path of the file to write.
    """
    gN = 500  # fields per query
    sN = 100  # samples per query

    # The context manager closes the file even when a query fails midway.
    with open(outputMatrix_T, 'w') as fout_T:
        if len(genes) == 0:
            probes = xenaAPI.dataset_fields(hub, dataset)
            # Tolerate field lists that do not contain the sample-id column.
            if "sampleID" in probes:
                probes.remove("sampleID")
        else:
            probes = genes

        fout_T.write('sample\t' + '\t'.join(probes) + '\n')

        for k in range(0, len(samples), sN):
            sList = samples[k:k + sN]
            sample_values = []
            for i in range(0, len(probes), gN):
                pList = probes[i:i + gN]
                if mode == "probe":
                    values = xenaAPI.Probes_values(hub, dataset, sList, pList)
                else:
                    # NOTE(review): results are used as plain per-field rows
                    # here, while other functions in this file unwrap gene
                    # results with ['scores'][0] -- confirm the returned shape.
                    values = xenaAPI.Genes_values(hub, dataset, sList, pList)
                for m in range(len(values)):
                    if len(values[m]) == 0:
                        # Pad fields without data so the transpose stays rectangular.
                        values[m] = [''] * len(sList)
                sample_values.extend(values)
                print(i)

            # Transpose field-major rows into one tuple per sample.
            per_sample = list(zip(*sample_values))
            for j, sample in enumerate(sList):
                row = per_sample[j]
                fout_T.write(sample + '\t')
                fout_T.write('\t'.join([str(x) for x in row]) + '\n')
def _write_nof1_data_header(foutdata, samples):
    """Write the two header rows of the pure data file (sample ids, then column kinds)."""
    foutdata.write("gene")
    for _ in range(3):
        foutdata.write('\t' + '\t'.join(samples))
    foutdata.write('\n')
    foutdata.write("gene")
    for kind in ("RANK %", "Log2TPM", "TPM"):
        foutdata.write('\t' + '\t'.join([kind] * len(samples)))
    foutdata.write('\n')


def _copy_with_adjusted_p(tmpfile, outputfile, pCorDic):
    """Copy the temporary report to ``outputfile``, filling in the adjusted p-value column."""
    with open(tmpfile, 'r') as fin, open(outputfile, 'w') as fout:
        # The first two lines are copied verbatim.
        # NOTE(review): presumably the two lines written by file_header -- confirm.
        fout.write(fin.readline())
        fout.write(fin.readline())
        # NOTE(review): copying stops at the first blank line, so the
        # "Rank % ..." footer written to the temporary file is not carried
        # over to the final output -- confirm this is intended.
        for line in iter(fin.readline, ''):
            if line == '\n':
                break
            data = line.split('\t')
            # Rows are laid out as: label, gene, p, adjusted p, ... and the
            # p-values are keyed by gene (column 1), not by label (column 0).
            if len(data) > 3 and data[1] in pCorDic:
                data[3] = str(pCorDic[data[1]])
            fout.write('\t'.join(data))


def itomic_Nof1(Nof1_item, original_labels, geneMappping, comparison_item, outputfile):
    """Compare N-of-1 samples against a comparison cohort gene by gene.

    Two files are produced: ``outputfile`` (full report) and
    ``outputfile + "_data"`` (per-sample rank %, Log2TPM and TPM only). For
    each label the mapped gene's values over all samples of the N-of-1
    dataset are t-tested (``equal_var=False``) against the cleaned cohort
    values, rank statistics are computed with ``rank_and_percentage``, and
    the t-test p-values are adjusted with ``fdrcorrection0`` in a second pass.

    Args:
        Nof1_item: dict read for ``"hub"``, ``"dataset"``, ``"samples"`` and
            ``"log2Theta"``.
        original_labels: sequence of labels, one candidate output row each.
        geneMappping: dict mapping a label to the gene name to query; labels
            that are not keys are used as-is.
        comparison_item: dict with either ``"file"`` (loaded through
            ``getMatrixData``) or ``"hub"``, ``"dataset"``, ``"samples"`` and
            ``"mode"`` (``"gene"`` or ``"probe"``).
        outputfile: path of the full report.

    Raises:
        ValueError: if the cohort is fetched from a hub and
            ``comparison_item["mode"]`` is neither ``"gene"`` nor ``"probe"``.
    """
    itomic_samples = xenaAPI.dataset_samples(Nof1_item["hub"], Nof1_item["dataset"])
    nof1_samples = Nof1_item["samples"]

    # The report is staged in a uniquely named scratch file in the working
    # directory, because adjusted p-values are only known after all genes ran.
    tmpfile = str(uuid.uuid4())
    try:
        pDic = {}  # gene -> raw t-test p-value, input of the multiple-testing adjustment
        with open(tmpfile, 'w') as fout, open(outputfile + "_data", 'w') as foutdata:
            file_header(comparison_item, Nof1_item, fout)
            _write_nof1_data_header(foutdata, nof1_samples)

            # comparison data
            if "file" in comparison_item:
                cData = getMatrixData(comparison_item["file"])
            else:
                cData = None
                hub = comparison_item["hub"]
                dataset = comparison_item["dataset"]
                samples = comparison_item["samples"]
                mode = comparison_item["mode"]

            if cData:
                n = 2000
            else:
                n = int(100000 / len(samples))
                if n < 100:
                    n = 100
            print(n)

            for k in range(0, len(original_labels), n):
                labels = original_labels[k:k + n]
                genes = [geneMappping.get(label, label) for label in labels]
                print("%s ..." % (genes[:10],))

                # values of every sample in the N-of-1 dataset, one dict per gene
                all_data_list = get_itomic_Data(genes, Nof1_item["hub"], Nof1_item["dataset"], itomic_samples)

                if cData:
                    compare_data_list = None
                elif mode == "gene":
                    compare_data_list = xenaAPI.Genes_values(hub, dataset, samples, genes)
                elif mode == "probe":
                    compare_data_list = xenaAPI.Probes_values(hub, dataset, samples, genes)
                else:
                    raise ValueError("mode must be 'gene' or 'probe', got %r" % (mode,))

                for m, gene in enumerate(genes):
                    label = labels[m]
                    outputList = [label, gene]
                    data_outputList = [gene]
                    all_Data = all_data_list[m]

                    # cohort data; default to "no data" so nothing leaks over
                    # from the previous gene
                    values = []
                    if cData:
                        if gene in cData:
                            values = cData[gene]
                    if compare_data_list:
                        entry = compare_data_list[m]
                        # Gene results are wrapped in a "scores" entry, probe
                        # results are plain rows (as elsewhere in this file).
                        values = entry['scores'][0] if mode == "gene" else entry

                    h_l_values = clean(values)
                    if len(h_l_values) == 0:  # no comparison data
                        continue

                    # t-test p value
                    try:
                        cohort = list(all_Data.values())
                        tStat, p = scipy.stats.ttest_ind(cohort, h_l_values, equal_var=False)
                        mean1 = numpy.mean(cohort)
                        mean2 = numpy.mean(h_l_values)
                        var1 = numpy.var(cohort)
                        var2 = numpy.var(h_l_values)
                    except Exception:
                        print("bad")
                        continue
                    # '' is the slot of the adjusted p-value, filled in the second pass
                    outputList.extend([str(p), '', str(tStat), str(var1), str(var2),
                                       str(mean1), str(mean2)])
                    pDic[gene] = p

                    # rank statistics, SD
                    r_list = [rank_and_percentage(x, h_l_values)[1] for x in cohort]
                    outputList.append(str(numpy.mean(r_list)))
                    outputList.append('{:.2f}'.format(numpy.std(r_list)))  # rank SD

                    # per sample data output
                    itomic_values = [all_Data[sample] for sample in nof1_samples]
                    rank_cells = ['{:.2f}%'.format(rank_and_percentage(x, h_l_values)[1])
                                  for x in itomic_values]
                    outputList.extend(rank_cells)  # rank %
                    data_outputList.extend(rank_cells)  # rank %
                    data_outputList.extend(['{:.2f}'.format(x) for x in itomic_values])  # Log2TPM
                    data_outputList.extend(
                        ['{:.2f}'.format(revert_Log2_theta(x, Nof1_item["log2Theta"]))
                         for x in itomic_values])  # TPM

                    fout.write('\t'.join(outputList) + '\n')
                    foutdata.write('\t'.join(data_outputList) + '\n')

            fout.write("\n")
            fout.write("Rank % : percentile of samples with lower expression than sample of interest.\n")
            fout.write("Higher Rank % means higher expression.\n")

        # add multiple hypo adjusted p values to file
        # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.fdrcorrection0.html
        pCorDic = {}
        tested_genes = list(pDic)
        if tested_genes:  # nothing to adjust when no gene produced a p-value
            rejected, pvalue_corrected = statsmodels.sandbox.stats.multicomp.fdrcorrection0(
                [pDic[g] for g in tested_genes], alpha=0.05, method='indep', is_sorted=False)
            pCorDic = dict(zip(tested_genes, pvalue_corrected))

        _copy_with_adjusted_p(tmpfile, outputfile, pCorDic)
    finally:
        # Remove the scratch file on success and on failure alike.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def get_itomic_Data(genes, hub, dataset, samples):
    """Fetch per-gene values and key each row by sample.

    Queries ``xenaAPI.Probes_values(hub, dataset, samples, genes)`` and pairs
    every returned row with ``samples`` position by position.

    Returns:
        A list of dicts (sample -> value), in the order of the returned rows
        (presumably the order of ``genes`` -- verify against xenaAPI).
    """
    rows = xenaAPI.Probes_values(hub, dataset, samples, genes)
    return [dict(zip(samples, row)) for row in rows]