def calc_HF_pos(cis_dict, trans_dict, tc_dict):
    """Count cis/trans direction combinations for ids whose ortholog is in tc_dict.

    Only ids that are keys of both ``cis_dict`` and ``trans_dict`` are
    considered.  Each id is mapped through ``module.ENSid_Human_Dict()``; an
    id with no entry there is given the empty string as its ortholog.  An id
    is counted only when its ortholog is a key of ``tc_dict``.

    A value is treated as "up" when it is strictly greater than 0 and as
    "down" otherwise (so a value of exactly 0 counts as down).

    Args:
        cis_dict: mapping id -> cis value compared against 0.
        trans_dict: mapping id -> trans value compared against 0.
        tc_dict: container tested with ``in`` for ortholog membership.

    Returns:
        Tuple ``(up_up, up_down, down_up, down_down)`` where the first word
        is the cis direction and the second the trans direction.
    """
    hnames = module.ENSid_Human_Dict()
    shared_ids = set(cis_dict).intersection(trans_dict)

    # counts the cis_trans changes (up or down, binary)
    up_up = 0
    up_down = 0
    down_up = 0
    down_down = 0
    for ensid in shared_ids:
        hortho = hnames[ensid] if ensid in hnames else ''
        if hortho not in tc_dict:
            continue
        cis_up = cis_dict[ensid] > 0
        trans_up = trans_dict[ensid] > 0
        if cis_up and trans_up:
            up_up += 1
        elif cis_up:
            up_down += 1
        elif trans_up:
            down_up += 1
        else:
            down_down += 1
    return up_up, up_down, down_up, down_down
def write_diff_bed(
        score_dict,
        outbed,
        TSS_PATH='/home/james/Dropbox/Miller/data/BED/ensGene_collapsed_TSS.bed',
        IN_DIFF='/home/james/Dropbox/Miller/data/RNA_seq_exp/CERC_RABS_PAXB_LITC_gene_exp.diff',
        NAMES=True,
        USE_SYN=True,
        TRACK='',
        BGA=False):
    """Write one tab-separated line per scored id, positioned at its TSS entry.

    Positions come from ``load_TSS(fpath=TSS_PATH)`` (strings of the form
    ``chrom:start-stop``), names from ``module.ENSidDict()`` and synonyms
    from ``load_xloc_synonyms(IN_DIFF)``.

    Args:
        score_dict: mapping id -> score; the score is written via ``str()``.
        outbed: path of the output file (overwritten).
        TSS_PATH: path handed to ``load_TSS``.
        IN_DIFF: path handed to ``load_xloc_synonyms``.
        NAMES: when false, the id itself is written in the name column.
        USE_SYN: when true, every synonym listed for the id is tried and the
            last synonym found in the TSS table supplies position and name;
            when false, the id itself is looked up.
        TRACK: optional first line written verbatim before the records.
        BGA: when true, write 4 columns (chrom, start, stop, score) instead
            of 5 (chrom, start, stop, name, score).

    Raises:
        KeyError: if ``USE_SYN`` is true and an id has no synonym entry, or
            if a located id is missing from the name table.
    """
    tss_chrpos = load_TSS(fpath=TSS_PATH)
    snames = module.ENSidDict()
    syn_dict = load_xloc_synonyms(IN_DIFF)

    # The context manager closes the output even if a lookup below raises.
    with open(outbed, 'w') as w:
        if TRACK:
            w.write(TRACK + '\n')
        for ensid in score_dict:
            chrpos = ''
            name = ''
            if USE_SYN:
                # use all ensids at each recorded transcribed locus
                for cur_syn in syn_dict[ensid]:
                    if cur_syn in tss_chrpos:
                        chrpos = tss_chrpos[cur_syn]
                        name = snames[cur_syn]
            elif ensid in tss_chrpos:
                # Guarded lookup: a missing id is skipped below instead of
                # raising KeyError.
                chrpos = tss_chrpos[ensid]
                name = snames[ensid]

            # skip ensids not found in the collapsed TSS
            if not chrpos:
                continue

            pieces = chrpos.split(':')
            chrome = pieces[0]
            coords = pieces[1].split('-')
            start = coords[0]
            stop = coords[1]

            if not NAMES:
                name = ensid
            score = str(score_dict[ensid])
            if BGA:
                fields = [chrome, start, stop, score]
            else:
                fields = [chrome, start, stop, name, score]
            w.write('\t'.join(fields) + '\n')
def writeASE_names(ase_file, names_file):
    """Rewrite a two-column ASE table with ids replaced by looked-up names.

    Each non-blank line of ``ase_file`` is split on tabs; the first column is
    looked up in ``module.ENSidDict()`` and written to ``names_file`` followed
    by a tab and the second column, unchanged.

    Args:
        ase_file: path of the tab-separated input (id, value, ...).
        names_file: path of the output file (overwritten).

    Raises:
        KeyError: if an id is missing from the name table.
        IndexError: if a non-blank line has fewer than two columns.
    """
    names = module.ENSidDict()
    # Both handles are closed (and the output flushed) on exit, including
    # when a lookup raises part-way through.
    with open(ase_file, 'r') as f, open(names_file, 'w') as w:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            split = line.split('\t')
            gname = names[split[0]]
            ase = split[1]
            w.write(gname + '\t' + ase + '\n')
def plot_key_ase_dict(keys, dict_list, LABELS='', COLORS=''): keys_ase = [] key_names = [] plt.figure() snames = module.ENSidDict() for key in keys: cur_name = snames[key] key_names.append(cur_name) gene_ase = [] for ind, ase_dict in enumerate(dict_list): cur_counts = np.array(ase_dict[key]) print cur_name print cur_counts if np.count_nonzero(cur_counts[:, 0]) < 3 or np.count_nonzero( cur_counts[:, 0]) < 3: gene_ase.append(np.array([0])) continue cur_ase_vals = cur_counts[:, 0] / cur_counts[:, 1] cur_ase_vals = np.log2(cur_ase_vals) cur_ase_vals = cur_ase_vals[~np.isnan(cur_ase_vals)] gene_ase.append(cur_ase_vals) keys_ase.append(gene_ase) for ind_gene, cur_gene in enumerate(keys_ase): for ind_pop, cur_pop_ase in enumerate(cur_gene): if len(cur_pop_ase) < 3: continue cur_index = ind_gene * 3 + ind_pop xs = np.zeros(len(cur_pop_ase)) + cur_index plt.scatter(xs, cur_pop_ase, marker='.', color='black') plt.errorbar(cur_index, cur_pop_ase.mean(), yerr=stats.sem(cur_pop_ase), fmt='o', ecolor=COLORS[ind_pop], color=COLORS[ind_pop]) zero_xs = np.arange(len(keys) + 1) * 3 zeros = np.zeros(len(zero_xs)) plt.plot(zero_xs, zeros, color='grey', ls='dashed') plt.xlim([-.5, (len(keys) * 3) - .5]) x_ticks = np.arange(len(keys)) * 3 plt.xticks(zero_xs, key_names, rotation=45) plt.ylabel('log2(F/M)') plt.tight_layout() #xs = np.zeros(len(cur_ase_vals)) + ind #plt.scatter(xs,cur_ase_vals,marker='.',color='black') #plt.errorbar(ind,cur_ase_vals.mean(),yerr=stats.sem(cur_ase_vals),fmt='o',ecolor=COLORS[ind]) '''
def loadASE_names(ase_file):
    """Load a two-column ASE table keyed by upper-cased looked-up name.

    Each non-blank line is split on tabs.  The first column is looked up in
    ``module.ENSidDict()`` and upper-cased; the second column is parsed as a
    float.  If two ids map to the same upper-cased name, the later line wins.

    Args:
        ase_file: path of the tab-separated input (id, value, ...).

    Returns:
        dict mapping upper-cased name -> float value.

    Raises:
        KeyError: if an id is missing from the name table.
        ValueError: if the second column is not a valid float.
    """
    toret = {}
    names = module.ENSidDict()
    with open(ase_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            split = line.split('\t')
            gname = names[split[0]].upper()
            toret[gname] = float(split[1])
    return toret
def gtfToGnames(ingtf):
    """Print the distinct names of the ids referenced in a GTF file.

    For every record the id is taken as the first double-quoted value in the
    9th tab-separated column and translated through ``module.ENSidDict()``.
    Each distinct name is printed once, in set iteration order.  Nothing is
    returned.

    Args:
        ingtf: path of the GTF file to read.

    Raises:
        KeyError: if an id is missing from the name table.
    """
    gene_names = set()
    ensidNames = module.ENSidDict()
    with open(ingtf, 'r') as f:
        for line in f:
            line = line.strip()
            # Blank lines and '#' header/comment lines carry no attribute
            # column and would otherwise raise IndexError.
            if not line or line.startswith('#'):
                continue
            split = line.split('\t')
            cur_ensid = split[8].split('"')[1]
            gene_names.add(ensidNames[cur_ensid])
    for name in gene_names:
        print(name)
def plot_key_expDiff_dict(keys,
                          exp_dict,
                          test_indexes,
                          ref_index,
                          LABELS='',
                          COLORS=''):
    """Plot per-key log2 expression ratios of test groups against a reference.

    A new figure is opened.  For each key, the values in
    ``exp_dict[key][i]`` for every ``i`` in ``test_indexes`` are divided by
    the mean of ``exp_dict[key][ref_index]`` and log2-transformed.
    Individual values are drawn as black dots and their mean +/- standard
    error as a coloured marker.

    X positions are ``gene_index * 3 + group_index``, so more than three
    entries in ``test_indexes`` would run into the next key's positions.

    Args:
        keys: ids to plot; each must be a key of ``module.ENSidDict()`` and
            of ``exp_dict``.
        exp_dict: mapping id -> indexable collection of value sequences.
        test_indexes: indexes into ``exp_dict[key]`` for the plotted groups.
        ref_index: index into ``exp_dict[key]`` for the reference group.
        LABELS: accepted for interface compatibility; not used.
        COLORS: indexed by position in ``test_indexes`` to colour the mean
            marker; must therefore hold one entry per test group.
    """
    plt.figure()
    keys_exp_diff = []
    key_names = []
    snames = module.ENSidDict()

    for key in keys:
        key_names.append(snames[key])
        # The reference mean is the same for every test group of this key.
        cur_ref_mean = np.array(exp_dict[key][ref_index]).mean()
        gene_exp_diff = []
        for cur_pop in test_indexes:
            cur_exp = np.array(exp_dict[key][cur_pop])
            gene_exp_diff.append(np.log2(cur_exp / cur_ref_mean))
        keys_exp_diff.append(gene_exp_diff)

    for ind_gene, cur_gene in enumerate(keys_exp_diff):
        for ind_pop, cur_pop_diff in enumerate(cur_gene):
            cur_index = ind_gene * 3 + ind_pop
            xs = np.zeros(len(cur_pop_diff)) + cur_index
            plt.scatter(xs, cur_pop_diff, marker='.', color='black')
            plt.errorbar(cur_index,
                         cur_pop_diff.mean(),
                         yerr=stats.sem(cur_pop_diff),
                         fmt='o',
                         ecolor=COLORS[ind_pop],
                         color=COLORS[ind_pop])

    # dashed reference line at y = 0 spanning all key groups
    zero_xs = np.arange(len(keys) + 1) * 3
    zeros = np.zeros(len(zero_xs))
    plt.plot(zero_xs, zeros, color='grey', ls='dashed')
    plt.xlim([-.5, (len(keys) * 3) - .5])
    # One tick per key: the tick positions must match key_names in length.
    x_ticks = np.arange(len(keys)) * 3
    plt.xticks(x_ticks, key_names, rotation=45)
    # NOTE(review): the label says F/M although the plotted values are
    # test/reference ratios - confirm the intended axis label.
    plt.ylabel('log2(F/M)')
    plt.tight_layout()
def load_TSS(
        fpath='/home/james/Dropbox/Miller/data/BED/ensGene_collapsed_TSS.bed'):
    """Load a tab-separated position file into an id -> ``chrom:start-stop`` map.

    Each non-blank line is stripped and split on tabs; columns 1-3 are joined
    as ``chrom:start-stop`` and stored under the id found in column 4.  When
    an id occurs more than once, the last line wins.

    Args:
        fpath: path of the file to read.

    Returns:
        dict mapping id -> position string.

    Raises:
        IndexError: if a non-blank line has fewer than four columns.
    """
    chrpos_dict = {}
    with open(fpath, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            split = line.split('\t')
            chrome, start, stop, ensid = split[0], split[1], split[2], split[3]
            chrpos_dict[ensid] = chrome + ':' + start + '-' + stop
    return chrpos_dict
def sort_CountDict(sam_count_dict,
                   GTF_file='/data/James/BED/ensGene.gtf',
                   CHR=''):
    """Return the values of ``sam_count_dict`` ordered by genomic start position.

    Start positions come from ``loadGTFStart(GTF_file)``: for each id, the
    first element is a ``chrom:position`` string.  The sort key is the
    integer position plus the per-chromosome offset from
    ``module.chrpostoDict()``, which places chromosomes one after another on
    a single axis.  The number of ids is printed before filtering and, when
    ``CHR`` is given, again after filtering.

    Args:
        sam_count_dict: mapping id -> value to be returned in sorted order.
        GTF_file: path handed to ``loadGTFStart``.
        CHR: when non-empty, only ids whose start lies on this chromosome
            are kept.

    Returns:
        list of the selected values of ``sam_count_dict`` in position order.

    Raises:
        KeyError: if an id is missing from the start table, or its
            chromosome from the offset table.
    """
    gtfstarts = loadGTFStart(GTF_file)
    ensids = list(sam_count_dict)
    # keeps track of the amount of bases in all prev chromosomes - chrI has
    # no offset, chrII offset is len(chrI), etc
    chr_offset = module.chrpostoDict()

    print(len(ensids))
    if CHR:
        ensids = [
            ensid for ensid in ensids
            if gtfstarts[ensid][0].split(':')[0] == CHR
        ]
        print(len(ensids))

    def genome_position(ensid):
        # absolute coordinate = position on chromosome + chromosome offset
        chrome, pos = gtfstarts[ensid][0].split(':')[:2]
        return int(pos) + chr_offset[chrome]

    return [sam_count_dict[gene]
            for gene in sorted(ensids, key=genome_position)]
def ase_concordance(ase_file_1, ase_file_2, log_vals=False):
    """Scatter-plot two ASE tables against each other and return Pearson's r.

    Both files are read with ``loadASE_list``; only ids present in both are
    used.  Unless ``log_vals`` is true, each value ``v`` is transformed to
    ``log2((v + 1e-30) / (1 - v + 1e-30))``.  Ids for which either
    (transformed) value has absolute value above 5 are dropped.  The number
    of ids in each sign quadrant (up/up, up/down, down/up, down/down, where
    "up" means > 0) is printed, then a new 10x10 figure is drawn with the
    scatter, the overlay from ``plotPCA`` and fixed axis labels and title.

    Args:
        ase_file_1: path of the table plotted on the x axis.
        ase_file_2: path of the table plotted on the y axis.
        log_vals: set to true when the tables already hold log values.

    Returns:
        Pearson correlation coefficient of the retained value pairs.

    Raises:
        ValueError: from ``math.log`` if a transformed ratio is not positive.
    """
    ase_dict_1 = loadASE_list(ase_file_1)
    ase_dict_2 = loadASE_list(ase_file_2)
    # get a list of genes shared between the two
    genes = set(ase_dict_1).intersection(ase_dict_2)

    def to_plot_value(val):
        # log2 the ratio value; the 1e-30 terms avoid log(0) and division by 0
        if log_vals:
            return val
        return math.log((val + 1e-30) / (1 - val + 1e-30), 2)

    ase_list_1 = []
    ase_list_2 = []
    up_up = 0
    down_down = 0
    up_down = 0
    down_up = 0
    for gene in genes:
        ase_val_1 = to_plot_value(ase_dict_1[gene])
        ase_val_2 = to_plot_value(ase_dict_2[gene])
        # drop extreme values that fall outside the plotted [-5, 5] window
        if math.fabs(ase_val_1) > 5 or math.fabs(ase_val_2) > 5:
            continue
        ase_list_1.append(ase_val_1)
        ase_list_2.append(ase_val_2)
        if ase_val_1 > 0:
            if ase_val_2 > 0:
                up_up += 1
            else:
                up_down += 1
        elif ase_val_2 > 0:
            down_up += 1
        else:
            down_down += 1
    print('%s %s %s %s' % (up_up, up_down, down_up, down_down))

    plt.figure(figsize=(10, 10))
    plt.scatter(ase_list_1, ase_list_2, alpha=.5, marker='o')
    plotPCA(ase_list_1, ase_list_2)
    r, pval = stats.pearsonr(ase_list_1, ase_list_2)
    plt.xlabel('PAXB (FW) Allelic Bias\nr=%f' % r, fontsize=22)
    plt.ylabel('CERC (FW) Allelic Bias', fontsize=22)
    plt.title('Allelic Bias in PAXB (FW) vs CERC (FW)', fontsize=30)
    plt.ylim([-5, 5])
    plt.xlim([-5, 5])
    plt.tight_layout()
    return r
def getSig_ASE(in_dict, FDR=.01, COV_CUT=20, OUTFILE=''):
    """Select ids with significant allelic imbalance by a binomial test and FDR cutoff.

    For each id, ``in_dict[id]`` is unpacked as ``(cur_F, cur_M)``, two
    equal-length count sequences.  Ids whose summed counts are below
    ``COV_CUT`` are ignored.  The summed F count is tested against the summed
    total with ``stats.binom_test`` (SciPy's default success probability),
    and the allelic imbalance (AIB) is the mean over elements of
    ``F / (F + M)``.

    Ids are ranked by p-value and accepted in order while
    ``p <= rank / n_tests * FDR``; selection stops at the first p-value that
    exceeds its threshold (that id is NOT included), and the number accepted
    is printed when that happens.

    Side effects:
        Always overwrites
        ``/home/james/Dropbox/Miller/data/ASE/PAXB_VTP_ASEpval.l`` with the
        names (from ``module.ENSidDict()``) of all tested ids in p-value
        order.  When ``OUTFILE`` is given, writes a tab-separated table of
        the accepted ids: name, F counts, M counts, p-value, AIB.

    Args:
        in_dict: mapping id -> (F counts, M counts).
        FDR: false discovery rate used to scale the rank thresholds.
        COV_CUT: minimum summed F + M count for an id to be tested.
        OUTFILE: optional path of the result table.

    Returns:
        dict mapping accepted id -> ``(AIB, p-value)``.
    """
    sigDict = {}
    stickleNames = module.ENSidDict()
    pval_list = []
    ensid_list = []
    AIB_list = []
    for ensid in in_dict:
        cur_F, cur_M = in_dict[ensid]
        total_F = np.sum(cur_F)
        total_M = np.sum(cur_M)
        if (total_F + total_M) < COV_CUT:
            continue
        total_N = float(total_F + total_M)
        binom_p = stats.binom_test(total_F, n=total_N)
        # Float arrays so integer counts are not floor-divided on Python 2.
        F_vals = np.asarray(cur_F, dtype=float)
        N_vals = F_vals + np.asarray(cur_M, dtype=float)
        AIB = (F_vals / N_vals).mean()
        pval_list.append(binom_p)
        ensid_list.append(ensid)
        AIB_list.append(AIB)

    # sort everything by ascending p-value (one argsort shared by all arrays)
    pval_array = np.array(pval_list)
    order = np.argsort(pval_array)
    ensid_array = np.array(ensid_list)[order]
    AIB_array = np.array(AIB_list)[order]
    pval_array = pval_array[order]

    with open('/home/james/Dropbox/Miller/data/ASE/PAXB_VTP_ASEpval.l',
              'w') as w:
        for ensid in ensid_array:
            w.write(stickleNames[ensid] + '\n')

    # Rank thresholds in the style of B-H; stops at the first failure.
    # n_sig counts only the ids that passed, so the first failing id is no
    # longer included in the results.
    n_tests = len(pval_array)
    n_sig = 0
    for rank, cur_pval in enumerate(pval_array, 1):
        cur_fdr = (float(rank) / n_tests) * FDR
        if cur_fdr < cur_pval:
            print(n_sig)
            break
        n_sig = rank

    if OUTFILE:
        n_reps = len(next(iter(in_dict.values()))[0]) if in_dict else 0
        header = ['ensid']
        header.extend('F_%d' % i for i in range(n_reps))
        header.extend('M_%d' % i for i in range(n_reps))
        header.append('pval')
        header.append('AIB')
        with open(OUTFILE, 'w') as w:
            w.write('\t'.join(header) + '\n')
            for i in range(n_sig):
                cur_ensid = ensid_array[i]
                cur_F, cur_M = in_dict[cur_ensid]
                row = ([stickleNames[cur_ensid]] + list(cur_F) + list(cur_M) +
                       [pval_array[i], AIB_array[i]])
                w.write('\t'.join([str(x) for x in row]) + '\n')

    # make the AIB,pval dict
    for i in range(n_sig):
        sigDict[ensid_array[i]] = (AIB_array[i], pval_array[i])
    return sigDict
plt.imshow(ase_diff) plt.colorbar(orientation='vertical') plt.title('real') plt.figure() plt.imshow(rand_ase_diff) plt.colorbar(orientation='vertical') plt.title('random') plt.show() write_expDiff_mean( '/home/james/Dropbox/Miller/data/RNA_seq_exp/CERC_RABS_PAXB_LITC_ids.fpkm_table', '/home/james/Dropbox/Miller/data/ASE/CRPL_meanExp.l') ensNames = module.ENSidDict() start_dict = load_starts() PAXB_VTP_ase_dict = load_ase_dict( '/home/james/Dropbox/Miller/data/ASE/PAXB_VTP_AI_all.tsv') PAXB_VTP_M_F_dict = M_F_counts_reps(PAXB_VTP_ase_dict) CERC_VTP_ase_dict = load_ase_dict( '/home/james/Dropbox/Miller/data/ASE/CERC_VTP_AI_all.tsv') CERC_VTP_M_F_dict = M_F_counts_reps(CERC_VTP_ase_dict) ''' write_expDiff_mean('/home/james/Dropbox/Miller/data/RNA_seq_exp/CERC_RABS_PAXB_LITC_ids.fpkm_table','/home/james/Dropbox/Miller/data/ASE/CRPL_meanExp.l') ensNames = module.ENSidDict() start_dict = load_starts() PAXB_VTP_ase_dict = load_ase_dict('/home/james/Dropbox/Miller/data/ASE/PAXB_VTP_AI_all.tsv') PAXB_VTP_M_F_dict = M_F_counts_reps(PAXB_VTP_ase_dict)