def calc_HF_pos(cis_dict, trans_dict, tc_dict):
    """Tally cis/trans change directions for genes whose human ortholog is in tc_dict.

    Only ids present in both ``cis_dict`` and ``trans_dict`` are considered.
    Each id is translated through ``module.ENSid_Human_Dict()``; ids without
    an entry there are treated as having the ortholog ``''`` and are therefore
    counted only if ``''`` itself is a key of ``tc_dict``.

    Args:
        cis_dict: mapping id -> cis value (compared against 0).
        trans_dict: mapping id -> trans value (compared against 0).
        tc_dict: container of human ortholog names to keep (membership test only).

    Returns:
        Tuple ``(up_up, up_down, down_up, down_down)``; the first word is the
        cis direction and the second the trans direction.  A value ``> 0`` is
        "up"; anything else (including exactly 0) is "down".
    """
    hnames = module.ENSid_Human_Dict()
    shared_ids = set(cis_dict.keys()).intersection(set(trans_dict.keys()))

    ##counts the cis_trans changes (up or down, binary)
    up_up = 0
    up_down = 0
    down_up = 0
    down_down = 0
    for ensid in shared_ids:
        hortho = hnames[ensid] if ensid in hnames else ''
        if hortho not in tc_dict:
            continue

        cis_up = cis_dict[ensid] > 0
        trans_up = trans_dict[ensid] > 0
        if cis_up and trans_up:
            up_up += 1
        elif cis_up:
            up_down += 1
        elif trans_up:
            down_up += 1
        else:
            down_down += 1

    return up_up, up_down, down_up, down_down
Example #2
0
def write_diff_bed(
        score_dict,
        outbed,
        TSS_PATH='/home/james/Dropbox/Miller/data/BED/ensGene_collapsed_TSS.bed',
        IN_DIFF='/home/james/Dropbox/Miller/data/RNA_seq_exp/CERC_RABS_PAXB_LITC_gene_exp.diff',
        NAMES=True,
        USE_SYN=True,
        TRACK='',
        BGA=False):
    """Write one tab-separated line per scored id, positioned at its TSS.

    Args:
        score_dict: mapping id -> score; the score is written via ``str()``.
        outbed: path of the output file (overwritten).
        TSS_PATH: file passed to ``load_TSS`` to obtain id -> 'chr:start-stop'.
        IN_DIFF: file passed to ``load_xloc_synonyms`` to obtain id -> synonyms.
        NAMES: if true, write the name from ``module.ENSidDict()``; otherwise
            write the id from ``score_dict``.
        USE_SYN: if true, look the position up through every synonym of the
            id (the last synonym found in the TSS table wins); otherwise look
            the id up directly.
        TRACK: optional first line written verbatim before the records.
        BGA: if true, write 4 columns (chrom, start, stop, score) and omit the
            name column.

    Ids with no position in the TSS table are skipped.
    """
    tss_chrpos = load_TSS(fpath=TSS_PATH)
    snames = module.ENSidDict()
    syn_dict = load_xloc_synonyms(IN_DIFF)

    # Context manager so the output is flushed/closed even if a lookup raises.
    with open(outbed, 'w') as w:
        if TRACK:
            w.write(TRACK + '\n')

        for ensid, raw_score in score_dict.items():
            chrpos = ''
            name = ''
            if USE_SYN:
                ##use all ensids at each recorded transcribed locus
                for cur_syn in syn_dict[ensid]:
                    if cur_syn in tss_chrpos:
                        chrpos = tss_chrpos[cur_syn]
                        name = snames[cur_syn]
            elif ensid in tss_chrpos:
                # Previously a missing id raised KeyError here instead of
                # being skipped like in the synonym branch.
                chrpos = tss_chrpos[ensid]
                name = snames[ensid]

            ##skip ensids not found in the collapsed TSS
            if not chrpos:
                continue

            chrpos_fields = chrpos.split(':')
            chrome = chrpos_fields[0]
            bounds = chrpos_fields[1].split('-')
            start = bounds[0]
            stop = bounds[1]

            if not NAMES:
                name = ensid

            score = str(raw_score)

            if BGA:
                columns = [chrome, start, stop, score]
            else:
                columns = [chrome, start, stop, name, score]
            w.write('\t'.join(columns) + '\n')
Example #3
0
def writeASE_names(ase_file,names_file):
    """Copy a two-column ASE table, replacing the id column with its name.

    Each input line is tab-separated: column 0 is an id that is looked up in
    ``module.ENSidDict()``, column 1 is copied through unchanged.

    Args:
        ase_file: path of the input table.
        names_file: path of the output table (overwritten).

    Raises:
        KeyError: if an id is missing from the name dictionary.
        IndexError: if a line has fewer than two tab-separated columns.
    """
    names = module.ENSidDict()

    # Both handles were previously left open, so buffered output could be lost.
    with open(ase_file, 'r') as f, open(names_file, 'w') as w:
        for line in f:
            split = line.strip().split('\t')
            ensid = split[0]
            gname = names[ensid]
            ase = split[1]
            w.write(gname + '\t' + ase + '\n')
Example #4
0
def plot_key_ase_dict(keys, dict_list, LABELS='', COLORS=''):
    """Plot per-gene log2(column0 / column1) count ratios on a new figure.

    For every key and every dict in ``dict_list`` the entry ``ase_dict[key]``
    is read as a 2-D array of counts (rows indexed first, two columns used).
    Groups with fewer than three non-zero counts in either column, or fewer
    than three finite log-ratios, are not drawn.

    Args:
        keys: ids to plot; each is translated with ``module.ENSidDict()`` for
            the x tick labels.
        dict_list: sequence of mappings id -> count table.  Genes are spaced
            3 x-units apart, so more than 3 dicts will overlap the next gene.
        LABELS: accepted for interface compatibility; not used.
        COLORS: indexable by position in ``dict_list``; gives the error-bar
            colour for that group.
    """
    keys_ase = []
    key_names = []
    plt.figure()
    snames = module.ENSidDict()
    for key in keys:
        cur_name = snames[key]
        key_names.append(cur_name)
        gene_ase = []
        for ase_dict in dict_list:
            cur_counts = np.array(ase_dict[key])
            print(cur_name)
            print(cur_counts)
            # Cast to float so the ratio is a true division even when the
            # counts are integers under Python 2.
            numer = cur_counts[:, 0].astype(float)
            denom = cur_counts[:, 1].astype(float)
            # Require enough non-zero observations in BOTH columns (the
            # original tested column 0 twice).
            if np.count_nonzero(numer) < 3 or np.count_nonzero(denom) < 3:
                gene_ase.append(np.array([0]))
                continue
            cur_ase_vals = np.log2(numer / denom)
            # Drop nan (0/0) and +/-inf (a zero in one column only); infinite
            # values would otherwise poison the mean and standard error.
            cur_ase_vals = cur_ase_vals[np.isfinite(cur_ase_vals)]
            gene_ase.append(cur_ase_vals)
        keys_ase.append(gene_ase)

    for ind_gene, cur_gene in enumerate(keys_ase):
        for ind_pop, cur_pop_ase in enumerate(cur_gene):
            if len(cur_pop_ase) < 3:
                continue
            cur_index = ind_gene * 3 + ind_pop
            xs = np.zeros(len(cur_pop_ase)) + cur_index
            plt.scatter(xs, cur_pop_ase, marker='.', color='black')
            plt.errorbar(cur_index,
                         cur_pop_ase.mean(),
                         yerr=stats.sem(cur_pop_ase),
                         fmt='o',
                         ecolor=COLORS[ind_pop],
                         color=COLORS[ind_pop])

    # Dashed reference line at y == 0 spanning all gene slots.
    zero_xs = np.arange(len(keys) + 1) * 3
    zeros = np.zeros(len(zero_xs))
    plt.plot(zero_xs, zeros, color='grey', ls='dashed')
    plt.xlim([-.5, (len(keys) * 3) - .5])
    # One tick per gene; zero_xs has one more entry than there are labels.
    x_ticks = np.arange(len(keys)) * 3
    plt.xticks(x_ticks, key_names, rotation=45)
    plt.ylabel('log2(F/M)')
    plt.tight_layout()
    '''
Example #5
0
def loadASE_names(ase_file):
    """Load a two-column ASE table keyed by upper-cased gene name.

    Each line is tab-separated: column 0 is an id looked up in
    ``module.ENSidDict()``, column 1 is parsed as a float.  If several ids
    map to the same upper-cased name, the last line wins.

    Args:
        ase_file: path of the input table.

    Returns:
        dict mapping upper-cased name -> float value.

    Raises:
        KeyError: if an id is missing from the name dictionary.
        ValueError: if column 1 is not a valid float.
    """
    toret = {}
    names = module.ENSidDict()

    # Context manager: the handle used to be left open.
    with open(ase_file, 'r') as f:
        for line in f:
            split = line.strip().split('\t')
            ensid = split[0]
            gname = names[ensid].upper()
            toret[gname] = float(split[1])

    return toret
Example #6
0
def gtfToGnames(ingtf):
    """Print the distinct gene names referenced by a GTF-style file.

    For each tab-separated line, the first double-quoted value in column 9
    (index 8) is taken as an id and translated with ``module.ENSidDict()``.
    Each distinct name is printed once, in arbitrary (set) order.

    Args:
        ingtf: path of the input file.

    Returns:
        None; the names are written to stdout.

    Raises:
        KeyError: if an id is missing from the name dictionary.
        IndexError: if a line has fewer than 9 columns or no quoted value.
    """
    gene_names = set()
    ensidNames = module.ENSidDict()

    # Context manager: the handle used to be left open.
    with open(ingtf, 'r') as f:
        for line in f:
            split = line.strip().split('\t')
            cur_ensid = split[8].split('"')[1]
            gene_names.add(ensidNames[cur_ensid])

    for name in gene_names:
        # Single-argument form prints identically under Python 2 and 3.
        print(name)
Example #7
0
def plot_key_expDiff_dict(keys,
                          exp_dict,
                          test_indexes,
                          ref_index,
                          LABELS='',
                          COLORS=''):
    """Plot per-gene log2 expression ratios against a reference group.

    For every key and every index in ``test_indexes``, the values
    ``exp_dict[key][index]`` are divided by the mean of
    ``exp_dict[key][ref_index]`` and log2-transformed.  Individual values are
    drawn as black dots with a mean +/- SEM marker on a new figure.

    Args:
        keys: ids to plot; each is translated with ``module.ENSidDict()`` for
            the x tick labels.
        exp_dict: mapping id -> sequence of per-group value sequences.
        test_indexes: group indexes to plot.  Genes are spaced 3 x-units
            apart, so more than 3 indexes will overlap the next gene.
        ref_index: group index used as the denominator (its mean).
        LABELS: accepted for interface compatibility; not used.
        COLORS: indexable by position in ``test_indexes``; gives the marker
            and error-bar colour for that group.
    """
    plt.figure()
    keys_exp_diff = []
    key_names = []
    snames = module.ENSidDict()
    for key in keys:
        key_names.append(snames[key])
        gene_exp_diff = []
        for cur_pop in test_indexes:
            cur_exp = np.array(exp_dict[key][cur_pop])
            cur_ref_mean = np.array(exp_dict[key][ref_index]).mean()
            gene_exp_diff.append(np.log2(cur_exp / cur_ref_mean))
        keys_exp_diff.append(gene_exp_diff)

    for ind_gene, cur_gene in enumerate(keys_exp_diff):
        for ind_pop, cur_pop_diff in enumerate(cur_gene):
            cur_index = ind_gene * 3 + ind_pop
            xs = np.zeros(len(cur_pop_diff)) + cur_index
            plt.scatter(xs, cur_pop_diff, marker='.', color='black')
            plt.errorbar(cur_index,
                         cur_pop_diff.mean(),
                         yerr=stats.sem(cur_pop_diff),
                         fmt='o',
                         ecolor=COLORS[ind_pop],
                         color=COLORS[ind_pop])

    # Dashed reference line at y == 0 spanning all gene slots.
    zero_xs = np.arange(len(keys) + 1) * 3
    zeros = np.zeros(len(zero_xs))
    plt.plot(zero_xs, zeros, color='grey', ls='dashed')
    plt.xlim([-.5, (len(keys) * 3) - .5])
    # One tick per gene; zero_xs has one more entry than there are labels.
    x_ticks = np.arange(len(keys)) * 3
    plt.xticks(x_ticks, key_names, rotation=45)
    plt.ylabel('log2(F/M)')
    plt.tight_layout()
Example #8
0
def load_TSS(
        fpath='/home/james/Dropbox/Miller/data/BED/ensGene_collapsed_TSS.bed'):
    """Load a tab-separated position file into an id -> 'chr:start-stop' dict.

    Columns 0-2 are joined as ``chrom:start-stop`` and column 3 is used as
    the key.  If an id occurs more than once, the last line wins.  Blank
    lines are ignored.

    Args:
        fpath: path of the input file.

    Returns:
        dict mapping column-3 id -> 'chrom:start-stop' string.

    Raises:
        IndexError: if a non-blank line has fewer than four columns.
    """
    chrpos_dict = {}

    # Context manager: the handle used to be left open.
    with open(fpath, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            split = line.split('\t')
            chrome = split[0]
            start = split[1]
            stop = split[2]
            ensid = split[3]
            chrpos_dict[ensid] = chrome + ':' + start + '-' + stop

    return chrpos_dict
Example #9
0
def sort_CountDict(sam_count_dict,
                   GTF_file='/data/James/BED/ensGene.gtf',
                   CHR=''):
    """Return the values of sam_count_dict ordered by genomic start position.

    The start of each id is the first element of ``loadGTFStart(GTF_file)[id]``,
    a 'chrom:pos' string.  Ids are sorted by ``pos`` plus the per-chromosome
    offset from ``module.chrpostoDict()``.  The number of ids before and after
    the optional chromosome filter is printed.

    Args:
        sam_count_dict: mapping id -> value to be ordered.
        GTF_file: path handed to ``loadGTFStart``.
        CHR: if non-empty, keep only ids whose start is on this chromosome.

    Returns:
        list of ``sam_count_dict`` values in genomic order.
    """
    gtfstarts = loadGTFStart(GTF_file)

    ##keeps track of the amount of bases in all prev chromosomes - chrI has no offset, chrII offset is len(chrI), ect
    chr_offset = module.chrpostoDict()

    ensids = sam_count_dict.keys()
    print(len(ensids))

    if CHR:
        ensids = [
            ensid for ensid in ensids
            if gtfstarts[ensid][0].split(':')[0] == CHR
        ]

    print(len(ensids))

    def genome_position(ensid):
        # Split the 'chrom:pos' string once instead of once per field.
        fields = gtfstarts[ensid][0].split(':')
        return int(fields[1]) + chr_offset[fields[0]]

    return [
        sam_count_dict[gene]
        for gene in sorted(ensids, key=genome_position)
    ]
Example #10
0
def ase_concordance(ase_file_1,ase_file_2,log_vals=False):
    """Scatter-plot two ASE tables against each other and return Pearson's r.

    Both files are read with ``loadASE_list``; only genes present in both are
    used.  Unless ``log_vals`` is true, each value ``p`` is converted to
    ``log2((p + 1e-30) / (1 - p + 1e-30))``.  Genes whose absolute value
    exceeds 5 in either table are dropped.  The counts of genes in each sign
    quadrant are printed as ``up_up up_down down_up down_down`` (``> 0`` is
    "up", everything else "down"; first word refers to file 1).

    Args:
        ase_file_1: path of the first table (x axis).
        ase_file_2: path of the second table (y axis).
        log_vals: set to true if the tables already hold log ratios.

    Returns:
        Pearson correlation coefficient of the retained value pairs.
    """
    ase_dict_1 = loadASE_list(ase_file_1)
    ase_dict_2 = loadASE_list(ase_file_2)

    ##get a list of genes shared between the two
    genes = set(ase_dict_1.keys()).intersection(ase_dict_2.keys())

    def _log2_odds(fraction):
        # The 1e-30 terms keep the ratio finite for fractions of exactly 0 or 1.
        return math.log((fraction + 1e-30) / (1 - fraction + 1e-30), 2)

    ase_list_1 = []
    ase_list_2 = []
    up_up = 0
    down_down = 0
    up_down = 0
    down_up = 0

    for gene in genes:
        if log_vals:
            ase_val_1 = ase_dict_1[gene]
            ase_val_2 = ase_dict_2[gene]
        else:
            ase_val_1 = _log2_odds(ase_dict_1[gene])
            ase_val_2 = _log2_odds(ase_dict_2[gene])

        # Drop extreme ratios; they fall outside the plotted [-5, 5] window.
        if math.fabs(ase_val_1) > 5 or math.fabs(ase_val_2) > 5:
            continue

        ase_list_1.append(ase_val_1)
        ase_list_2.append(ase_val_2)

        if ase_val_1 > 0:
            if ase_val_2 > 0:
                up_up += 1
            else:
                up_down += 1
        elif ase_val_2 > 0:
            down_up += 1
        else:
            down_down += 1

    # Formatted so the output is the same under Python 2 and Python 3.
    print('%s %s %s %s' % (up_up, up_down, down_up, down_down))

    plt.figure(figsize=(10,10))
    plt.scatter(ase_list_1,ase_list_2,alpha=.5,marker='o')
    plotPCA(ase_list_1,ase_list_2)
    r,pval =  stats.pearsonr(ase_list_1,ase_list_2)
    plt.xlabel('PAXB (FW) Allelic Bias\nr=%f' % r,fontsize=22)
    plt.ylabel('CERC (FW) Allelic Bias',fontsize=22)
    plt.title('Allelic Bias in PAXB (FW) vs CERC (FW)',fontsize=30)
    plt.ylim([-5,5])
    plt.xlim([-5,5])
    plt.tight_layout()
    return r
Example #11
0
def getSig_ASE(in_dict, FDR=.01, COV_CUT=20, OUTFILE='',
               RANKED_NAMES_FILE='/home/james/Dropbox/Miller/data/ASE/PAXB_VTP_ASEpval.l'):
    """Select ids with significant allelic imbalance by a binomial test + B-H FDR.

    For each id, ``in_dict[id]`` is unpacked as ``(F_counts, M_counts)``, one
    count per replicate.  Ids whose summed coverage is below ``COV_CUT`` are
    ignored.  The remaining ids get a two-sided binomial p-value (summed F
    counts out of summed F+M, via ``stats.binom_test`` with its default
    p) and an AIB value, the mean over replicates of F / (F + M).

    Significance uses the Benjamini-Hochberg step-up rule: with p-values
    sorted ascending, the largest rank k with ``p_k <= k / m * FDR`` is found
    and ranks 1..k are reported.  (The previous loop stopped at the first
    failing rank and also included that failing id.)  The number of
    significant ids is printed.

    Args:
        in_dict: mapping id -> (F_counts, M_counts).
        FDR: false discovery rate level.
        COV_CUT: minimum summed F+M coverage for an id to be tested.
        OUTFILE: if non-empty, path of a tab-separated table of significant
            ids (name, per-replicate F and M counts, p-value, AIB).
        RANKED_NAMES_FILE: path receiving the names of all tested ids in
            ascending p-value order; pass '' to skip.  Defaults to the path
            that was previously hard-coded.

    Returns:
        dict mapping significant id -> (AIB, p-value).
    """
    sigDict = {}

    stickleNames = module.ENSidDict()

    pval_list = []
    ensid_list = []
    AIB_list = []

    for ensid in in_dict.keys():
        cur_F, cur_M = in_dict[ensid]

        total_F = np.sum(cur_F)
        total_M = np.sum(cur_M)
        if (total_F + total_M) < COV_CUT:
            continue
        total_N = float(total_F + total_M)
        binom_p = stats.binom_test(total_F, n=total_N)

        # Float arrays give an elementwise sum and a true division even for
        # integer counts under Python 2 (where `/` would floor to 0 or 1).
        rep_F = np.asarray(cur_F, dtype=float)
        rep_M = np.asarray(cur_M, dtype=float)
        AIB = (rep_F / (rep_F + rep_M)).mean()

        pval_list.append(binom_p)
        ensid_list.append(ensid)
        AIB_list.append(AIB)

    # Sort all three parallel arrays with a single argsort.
    pval_array = np.array(pval_list)
    order = np.argsort(pval_array)
    ensid_array = np.array(ensid_list)[order]
    AIB_array = np.array(AIB_list)[order]
    pval_array = pval_array[order]

    if RANKED_NAMES_FILE:
        with open(RANKED_NAMES_FILE, 'w') as w:
            for ensid in ensid_array:
                w.write(stickleNames[ensid] + '\n')

    ##FDR by B-H
    n_tests = len(pval_array)
    n_sig = 0
    if n_tests:
        thresholds = (np.arange(1, n_tests + 1) / float(n_tests)) * FDR
        passing = np.nonzero(pval_array <= thresholds)[0]
        if passing.size:
            n_sig = int(passing[-1]) + 1
    print(n_sig)

    if OUTFILE:
        # Replicate counts for the header come from an arbitrary first entry.
        first_entry = next(iter(in_dict.values()), None)
        n_F = len(first_entry[0]) if first_entry is not None else 0
        n_M = len(first_entry[1]) if first_entry is not None else 0

        header = ['ensid']
        header.extend('F_%d' % i for i in range(n_F))
        header.extend('M_%d' % i for i in range(n_M))
        header.append('pval')
        header.append('AIB')

        with open(OUTFILE, 'w') as w:
            w.write('\t'.join(header) + '\n')
            for i in range(n_sig):
                cur_ensid = ensid_array[i]
                cur_F, cur_M = in_dict[cur_ensid]
                printlist = ([stickleNames[cur_ensid]] + list(cur_F) +
                             list(cur_M) + [pval_array[i], AIB_array[i]])
                w.write('\t'.join([str(x) for x in printlist]) + '\n')

    ##make the AIB,pval dict
    for i in range(n_sig):
        sigDict[ensid_array[i]] = (AIB_array[i], pval_array[i])

    return sigDict
Example #12
0
    plt.imshow(ase_diff)
    plt.colorbar(orientation='vertical')
    plt.title('real')

    plt.figure()
    plt.imshow(rand_ase_diff)
    plt.colorbar(orientation='vertical')
    plt.title('random')
    plt.show()


##Top-level driver statements: these run at import/execution time and use
##hard-coded paths under /home/james/Dropbox/Miller/data.

##Called with an FPKM table path and a '.l' output path; presumably reads the
##first and writes the second -- confirm against write_expDiff_mean.
write_expDiff_mean(
    '/home/james/Dropbox/Miller/data/RNA_seq_exp/CERC_RABS_PAXB_LITC_ids.fpkm_table',
    '/home/james/Dropbox/Miller/data/ASE/CRPL_meanExp.l')

##Module-level lookups kept as globals for the code that follows.
ensNames = module.ENSidDict()
start_dict = load_starts()

##PAXB: load the ASE table, then derive the M/F per-replicate counts from it.
PAXB_VTP_ase_dict = load_ase_dict(
    '/home/james/Dropbox/Miller/data/ASE/PAXB_VTP_AI_all.tsv')
PAXB_VTP_M_F_dict = M_F_counts_reps(PAXB_VTP_ase_dict)

##CERC: same two steps on the CERC table.
CERC_VTP_ase_dict = load_ase_dict(
    '/home/james/Dropbox/Miller/data/ASE/CERC_VTP_AI_all.tsv')
CERC_VTP_M_F_dict = M_F_counts_reps(CERC_VTP_ase_dict)
'''
write_expDiff_mean('/home/james/Dropbox/Miller/data/RNA_seq_exp/CERC_RABS_PAXB_LITC_ids.fpkm_table','/home/james/Dropbox/Miller/data/ASE/CRPL_meanExp.l')

ensNames = module.ENSidDict()
start_dict = load_starts()
PAXB_VTP_ase_dict = load_ase_dict('/home/james/Dropbox/Miller/data/ASE/PAXB_VTP_AI_all.tsv')
PAXB_VTP_M_F_dict = M_F_counts_reps(PAXB_VTP_ase_dict)