chicken_all_vals) = plot_fore_control(outfile, protein, chicken_fore, chicken_control, chicken_all, chicken_host_freqs, human_host_freqs) human_control_pval = utils.my_wil_rank_sum_gtr(human_fore_vals, human_control_vals, human_all_vals) chicken_control_pval = utils.my_wil_rank_sum_gtr(chicken_fore_vals, chicken_control_vals, chicken_all_vals) if protein in ('polymerase PB1', 'neuraminidase', 'polymerase PB2', 'nonstructural protein 2', 'polymerase PA', 'matrix protein 2'): human_pval = 'fore gtr ' + str(len(human_fore_vals)) + ' ' + str(len(human_control_vals) )+ ' ' + str(utils_stats.wilcox_gtr(human_fore_vals, human_control_vals)) p = utils_stats.wilcox_gtr(chicken_fore_vals, chicken_control_vals) print 'pval', p, 'yo' chicken_pval = 'fore gtr ' + str(len(chicken_fore_vals)) + ' ' + str(len(chicken_control_vals)) + ' ' + str(p) else: human_pval = 'ctrl gtr ' + str(len(human_control_vals)) + ' ' + str(len(human_fore_vals)) + ' ' + str(utils_stats.wilcox_gtr(human_control_vals, human_fore_vals)) chicken_pval = 'ctrl gtr ' + str(len(chicken_control_vals)) + ' ' + str(len(chicken_fore_vals)) + ' ' + str(utils_stats.wilcox_gtr(chicken_control_vals, chicken_fore_vals)) print 'human pval', protein, human_pval print 'chicken pval', protein, chicken_pval # print(protein, len(human_fore), len(human_control), # len(chicken_fore), len(chicken_control), # len(set(human_fore.keys()) & set(chicken_fore.keys()))) masters_human.append((human_fore, human_control)) masters_chicken.append((chicken_fore, chicken_control)) # for p1,p2 in itertools.combinations(masters_human, 2): # print len(utils_graph.intersectLists([p1[0],p2[1]])), len(utils_graph.intersectLists([p2[0],p1[1]]))
# Split the LIG_SH2_STAT5 rows of the Gallus gallus ELM table into two samples
# of host fractions, driven by the counts listed in elm_ls, and print the
# result of utils_stats.wilcox_gtr(virus, nonvirus).
import utils_stats

# [count, sequence] pairs.  A sequence whose count exceeds `cut` goes to the
# `virus` sample; every other sequence goes to `nonvirus`.
# NOTE(review): the counts presumably come from a viral data set -- confirm
# against whatever produced this table.
elm_ls = [
    [1, 'YIIK'], [1, 'YIVK'], [1, 'YLDK'], [1, 'YTIR'],
    [2, 'YLMA'], [3, 'YLLV'], [5, 'YIEG'], [10, 'YVNT'],
    [15, 'YTID'], [28, 'YVSM'], [129, 'YLLA'], [143, 'YLLT'],
    [260, 'YTLD'], [266, 'YINT'], [271, 'YVRT'], [273, 'YCVL'],
    [274, 'YLEK'], [275, 'YFTA'], [275, 'YIMK'], [277, 'YVDG'],
]
cut = 200
virus = []
nonvirus = []
found_seqs = {}

# Sequence -> count lookup.  setdefault keeps the first entry for a sequence,
# which is what a top-to-bottom scan of elm_ls with an early break would hit.
count_by_seq = {}
for listed_count, listed_seq in elm_ls:
    count_by_seq.setdefault(listed_seq, listed_count)

with open('results/elmdict_Gallus_gallus.txt') as handle:
    for row in handle:
        # Four tab-separated columns per row; the third one is not used.
        elm_name, host_seq, _unused_count, frac_text = row.strip().split('\t')
        if elm_name != 'LIG_SH2_STAT5':
            continue
        host_frac = float(frac_text)
        listed = count_by_seq.get(host_seq)
        if listed is not None:
            # Remember listed sequences that do occur in the host table.
            found_seqs[host_seq] = True
        if listed is not None and listed > cut:
            virus.append(host_frac)
        else:
            # Unlisted sequences and listed ones at or below the cutoff.
            nonvirus.append(host_frac)

# Above-cutoff sequences that never showed up in the host table enter the
# virus sample with a fraction of zero.
for listed_count, listed_seq in elm_ls:
    if listed_count > cut and listed_seq not in found_seqs:
        virus.append(float(0))

print(utils_stats.wilcox_gtr(virus, nonvirus))
float(0.1)) virus_like = [] non_virus = [] host = 'H_sapiens' virus = 'HIV' for elm in species2dict[host]: if elm in flu2dict[virus]: for seq in species2dict[host][elm]: if seq in flu2dict[virus][elm]: #if flu2dict[virus][elm][seq] > float(.05): # virus_like.append([elm+':'+seq,species2dict[host][elm][seq]]) virus_like.append(species2dict[host][elm][seq]) #else: # non_virus.append(species2dict[host][elm][seq]) else: non_virus.append(species2dict[host][elm][seq]) else: for seq in species2dict[host][elm]: non_virus.append(species2dict[host][elm][seq]) print utils_stats.wilcox_gtr(virus_like, non_virus) with open('virus', 'w') as f: for item in virus_like: f.write('blank\t' + str(item) + '\n') with open('nonvirus', 'w') as f: for item in non_virus: f.write('blank\t' + str(item) + '\n') print len(virus_like), len(non_virus)
# Excerpt from inside a larger routine: `protein`, `elm`, `elm_counts`,
# `protein_count`, `host_freqs`, `virus_freqs`, `non_virus_freqs`, `lines` and
# `ofile` are all bound before this point.
# NOTE(review): the enclosing loops are not visible here, so the nesting of the
# final file write relative to them could not be recovered -- confirm before
# re-indenting this fragment into place.
found_seqs = {}
seq_counts = elm_counts[protein][elm]
for seq in seq_counts:
    # A sequence whose count is more than 90% of protein_count feeds the
    # virus sample; every other sequence feeds the non-virus sample.
    if float(seq_counts[seq]) / protein_count > float(.9):
        sample = virus_freqs
    else:
        sample = non_virus_freqs
    if seq in host_freqs[elm]:
        sample.append(host_freqs[elm][seq])
        found_seqs[seq] = True
    else:
        # Absent from the host table: record a frequency of zero.
        sample.append(float(0))

# Host sequences that were not matched above also join the non-virus sample.
for seq in host_freqs[elm]:
    if seq not in found_seqs:
        non_virus_freqs.append(host_freqs[elm][seq])

# A row is emitted only when both samples hold more than two values; smaller
# samples produce no output row at all.
if len(virus_freqs) > 2 and len(non_virus_freqs) > 2:
    lines += '\t'.join([
        protein,
        elm,
        str(utils_stats.wilcox_gtr(virus_freqs, non_virus_freqs)),
        str(utils_stats.wilcox_less(virus_freqs, non_virus_freqs)),
    ]) + '\n'

with open(ofile, 'w') as out:
    out.write(lines)
virus_freqs.append(host_freqs[elm][seq]) found_seqs[seq] = True else: virus_freqs.append(float(0)) else: if seq in host_freqs[elm]: non_virus_freqs.append(host_freqs[elm][seq]) found_seqs[seq] = True else: non_virus_freqs.append(float(0)) for seq in host_freqs[elm]: if not seq in found_seqs: non_virus_freqs.append(host_freqs[elm][seq]) # #line = '' if len(virus_freqs) > 2 and len(non_virus_freqs) > 2: lines += ( protein + "\t" + elm + "\t" + str(utils_stats.wilcox_gtr(virus_freqs, non_virus_freqs)) + "\t" + str(utils_stats.wilcox_less(virus_freqs, non_virus_freqs)) + "\n" ) # #else: # # line = protein + '\t'+ elm + '\t'+ 'NO_DATA(' + str(len(virus_freqs)) + ',' + str(len(non_virus_freqs)) + ')' # #lines += line + '\n' with open(ofile, "w") as f: f.write(lines)