Exemple #1
0
def _bigwig_extractor(datafile, intervals, **kwargs):
    """Read bigWig values for a batch of intervals into one array.

    Args:
        datafile: Path of the bigWig file, handed to ``wWigIO``.
        intervals: Non-empty sequence of objects exposing ``chrom``,
            ``start`` and ``stop``. The width of the first interval sizes
            the output, so every interval is expected to have that width.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A zero-initialised ``numpy`` array of shape
        ``(len(intervals), 1, 1, width)``; row ``i`` is the buffer passed to
        ``wWigIO.getData`` for ``intervals[i]``.

    Raises:
        IndexError: If ``intervals`` is empty.
    """
    width = intervals[0].stop - intervals[0].start
    data = np.zeros((len(intervals), 1, 1, width))

    wWigIO.open(datafile)
    try:
        for index, interval in enumerate(intervals):
            # The return value is ignored: getData is relied on to fill the
            # slice (a view into `data`) in place.
            wWigIO.getData(datafile, interval.chrom, interval.start,
                           interval.stop, data[index, 0, 0, :])
    finally:
        # Release the handle even when a lookup fails part-way through.
        wWigIO.close(datafile)

    return data
Exemple #2
0
def extract_bigwig_to_npy(bigwig, output_dir, dtype=np.float32):
    """Dump every chromosome of a bigWig file to ``<output_dir>/<chrom>.npy``.

    A ``metadata.json`` file describing the saved arrays is written next to
    them once all chromosomes have been exported.

    Args:
        bigwig: Path of the bigWig file, handed to ``wWigIO``.
        output_dir: Existing directory that receives the ``.npy`` files and
            ``metadata.json``.
        dtype: dtype the per-chromosome array is cast to before saving.
    """
    file_shapes = {}

    wWigIO.open(bigwig)
    try:
        chrom_sizes = wWigIO.getChromSize(bigwig)
        # getChromSize is unpacked as parallel (names, sizes) sequences.
        for chrom, size in zip(*chrom_sizes):
            data = np.empty(size)
            # getData's return value is ignored; it is relied on to fill
            # `data` in place.
            wWigIO.getData(bigwig, chrom, 0, size, data)
            np.save(os.path.join(output_dir, '{}.npy'.format(chrom)),
                    data.astype(dtype))
            file_shapes[chrom] = data.shape
    finally:
        # Release the handle even if reading or saving a chromosome fails.
        wWigIO.close(bigwig)

    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(
            {
                'file_shapes': file_shapes,
                'type': 'array',
                'source': bigwig
            }, fp)
Exemple #3
0
 def getChromSizesWigIO(self):
     """Return a ``{chromosome name: size}`` dict for the file at ``self.path``.

     The file is opened through ``wWigIO`` for the duration of the call and
     closed again even if reading the sizes fails.
     """
     wWigIO.open(self.path)
     try:
         out = wWigIO.getChromSize(self.path)
     finally:
         wWigIO.close(self.path)
     # `out` is indexed as (names, sizes); pair the two sequences up.
     return dict(zip(out[0], out[1]))
Exemple #4
0
# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__=="__main__":
    # get information from bigwig file
    wWigIO.open('test.bw')
    try:
        chroms = wWigIO.getChromSize('test.bw')
        wigs = wWigIO.getIntervals('test.bw', 'chr1', 10, 200)
    finally:
        # close the handle even if one of the queries fails
        wWigIO.close('test.bw')
    print(wigs)

    # bigwig -> wig
    wWigIO.bigWigToWig('test.bw','test.wig')

    # write the chrom sizes into test.sizes, one "name<TAB>size" line each
    # NOTE(review): assumes getChromSize returns parallel (names, sizes)
    # sequences rather than a mapping, so they are paired with zip.
    with open('test.sizes','w') as fh:
        for chrom, size in zip(*chroms):
            fh.write("%s\t%s\n" % (chrom, size))

    # wig -> bigwig
    wWigIO.wigToBigWig('test.wig','test.sizes','test2.bw')
def main(row_ME, phylop_primates):
    """Open the files named by ``phylop_vertebrates`` and ``phylop_primates``.

    NOTE(review): ``phylop_vertebrates`` is not a parameter, so it has to
    exist as a module-level name -- confirm. ``row_ME`` is not used in the
    visible body.
    """
    for bigwig_path in (phylop_vertebrates, phylop_primates):
        wWigIO.open(bigwig_path)
def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates, exon_scores):
    """Score the micro-exons (MEs) named in ``sim_fastq`` header rows.

    One space-separated line is printed to stdout per distinct ME:
    chromosome, exon start, exon end, strand, U2 score, U2 percentile, mean
    vertebrate conservation, its percentile, mean primate conservation, its
    percentile, ``P_ME`` and ``overall_score``. Percentiles are taken
    against the background values read from ``exon_scores``.

    Args:
        sim_fastq: Tab-delimited file. Only rows whose first field starts
            with ``@`` are used; that field must split on ``_`` into
            ``SJ, ME_seq, estart, eend, total_coverage, n``.
        U2_GTAG_5_file: Passed to ``PWM_to_dict``; the result is indexed as
            ``matrix[base][position]`` over 13 positions.
        U2_GTAG_3_file: Same, indexed over 17 positions.
        phylop_vertebrates: File opened with ``wWigIO`` and queried with
            ``getIntervals``.
        phylop_primates: Same, for the second conservation track.
        exon_scores: Space-delimited file with exactly seven columns per
            row; the last three (U2 score, mean vertebrate conservation,
            mean primate conservation) form the background distributions.

    Notes:
        Uses the module-level names ``Genome``, ``PWM_to_dict``, ``percent``,
        ``stats``, ``csv``, ``re`` and ``wWigIO``. The two ``wWigIO`` files
        are opened here and left open.
    """

    def mean_score(intervals):
        """Average element [2] of each interval; 0 when there are none."""
        total = 0
        for interval in intervals:
            total += interval[2]
        if len(intervals) == 0:
            return total
        return total / len(intervals)

    MEs = set()
    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

    # Best achievable score of each matrix: sum of the per-position maxima.
    U2_GTAG_5_max_score = 0
    for index in range(13):
        U2_GTAG_5_max_score += max(U2_GTAG_5['A'][index],
                                   U2_GTAG_5['C'][index],
                                   U2_GTAG_5['T'][index],
                                   U2_GTAG_5['G'][index])

    U2_GTAG_3_max_score = 0
    for index in range(17):
        U2_GTAG_3_max_score += max(U2_GTAG_3['A'][index],
                                   U2_GTAG_3['C'][index],
                                   U2_GTAG_3['T'][index],
                                   U2_GTAG_3['G'][index])

    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    # Background distributions used for the percentile ranks below.
    gencode_U2_scores = []
    gencode_mean_conservation_vertebrates = []
    gencode_mean_conservation_primates = []

    with open(exon_scores) as handle:
        for row in csv.reader(handle, delimiter=' '):
            # Unpacking enforces the seven-column layout.
            (_chrom, _estart, _eend, _strand, background_U2,
             background_vertebrates, background_primates) = row

            gencode_U2_scores.append(float(background_U2))
            gencode_mean_conservation_vertebrates.append(
                float(background_vertebrates))
            gencode_mean_conservation_primates.append(
                float(background_primates))

    with open(sim_fastq) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            # Skip blank rows and everything that is not an "@..." header.
            if not row or not row[0].startswith("@"):
                continue

            SJ, ME_seq, estart, eend, total_coverage, n = row[0].split("_")

            len_ME = len(ME_seq)

            SJ = SJ[1:]  # drop the leading "@"
            SJ_chr, SJ_istart, SJ_iend = re.findall(r"[\w']+", SJ)

            SJ_len = int(SJ_iend) - int(SJ_istart)
            Kmer = SJ_len - (len_ME + 1)
            P_ME = 1 - (1 - (float(1) / float(4**len_ME + 4)))**Kmer

            strand = "-" if "-" in SJ else "+"

            # The set collapses repeated headers for the same ME.
            MEs.add((SJ_chr, strand, int(estart), int(eend), P_ME))

    for chrom, strand, estart, eend, P_ME in MEs:

        estart, eend = sorted([estart, eend])

        # E5 is the 17-base window scored with U2_GTAG_3, E3 the 13-base
        # window scored with U2_GTAG_5; on "-" both are reverse-complemented.
        if strand == "-":
            E5 = str(Genome[chrom][eend - 3:eend +
                                   14].reverse_complement()).upper()
            E3 = str(Genome[chrom][estart - 10:estart +
                                   3].reverse_complement()).upper()
        else:
            E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
            E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

        U2_score = 0

        # "N" bases are skipped without advancing the matrix position.
        position = 0
        for base in E5:
            if base != "N":
                U2_score += U2_GTAG_3[base][position]
                position += 1

        position = 0
        for base in E3:
            if base != "N":
                U2_score += U2_GTAG_5[base][position]
                position += 1

        U2_score = percent(U2_score, TOTAL_U2_max_score)

        mean_conservation_vertebrates = mean_score(
            wWigIO.getIntervals(phylop_vertebrates, chrom, estart - 2,
                                eend + 2))
        mean_conservation_primates = mean_score(
            wWigIO.getIntervals(phylop_primates, chrom, estart - 2,
                                eend + 2))

        ME_percentil_U2_score = stats.percentileofscore(
            gencode_U2_scores, U2_score)
        # Each mean is ranked against the background of the SAME track
        # (the two score arguments used to be swapped).
        ME_percentil_mean_conservation_vertebrates = stats.percentileofscore(
            gencode_mean_conservation_vertebrates,
            mean_conservation_vertebrates)
        ME_percentil_mean_conservation_primates = stats.percentileofscore(
            gencode_mean_conservation_primates, mean_conservation_primates)

        # Use whichever conservation percentile is higher; ties and
        # incomparable values keep the vertebrate one.
        conservation_percentile = ME_percentil_mean_conservation_vertebrates
        if ME_percentil_mean_conservation_primates > ME_percentil_mean_conservation_vertebrates:
            conservation_percentile = ME_percentil_mean_conservation_primates

        overall_score = P_ME * (1 - ME_percentil_U2_score / 100) * (
            1 - conservation_percentile / 100)

        fields = (chrom, estart, eend, strand, U2_score,
                  ME_percentil_U2_score, mean_conservation_vertebrates,
                  ME_percentil_mean_conservation_vertebrates,
                  mean_conservation_primates,
                  ME_percentil_mean_conservation_primates, P_ME,
                  overall_score)
        print(" ".join(str(field) for field in fields))
def main(gencode_bed, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 and conservation scores for internal exons of a BED file.

    For every internal block (first and last blocks are skipped) whose
    windows carry ``AG`` at ``E5[-5:-3]`` and ``GT`` at ``E3[3:5]``, one
    space-separated line is printed to stdout: chromosome, exon start, exon
    end, strand, U2 score, mean vertebrate conservation, mean primate
    conservation. Identical exons are reported once.

    Args:
        gencode_bed: Tab-delimited file; columns 0, 1, 5, 10 and 11 are read
            as chromosome, start, strand, comma-separated block sizes and
            comma-separated block starts.
        U2_GTAG_5_file: Passed to ``PWM_to_dict``; the result is indexed as
            ``matrix[base][position]`` over 13 positions.
        U2_GTAG_3_file: Same, indexed over 17 positions.
        phylop_vertebrates: File opened with ``wWigIO`` and queried with
            ``getIntervals``.
        phylop_primates: Same, for the second conservation track.

    Notes:
        Uses the module-level names ``Genome``, ``PWM_to_dict``, ``percent``,
        ``csv`` and ``wWigIO``. The two ``wWigIO`` files are opened here and
        left open.
    """

    def mean_score(intervals):
        """Average element [2] of each interval; 0 when there are none."""
        total = 0
        for interval in intervals:
            total += interval[2]
        if len(intervals) == 0:
            return total
        return total / len(intervals)

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

    # Best achievable score of each matrix: sum of the per-position maxima.
    U2_GTAG_5_max_score = 0
    for index in range(13):
        U2_GTAG_5_max_score += max(U2_GTAG_5['A'][index],
                                   U2_GTAG_5['C'][index],
                                   U2_GTAG_5['T'][index],
                                   U2_GTAG_5['G'][index])

    U2_GTAG_3_max_score = 0
    for index in range(17):
        U2_GTAG_3_max_score += max(U2_GTAG_3['A'][index],
                                   U2_GTAG_3['C'][index],
                                   U2_GTAG_3['T'][index],
                                   U2_GTAG_3['G'][index])

    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    exons = set()

    # Raise the field-size cap BEFORE the reader parses anything, so the
    # first row benefits from it as well.
    csv.field_size_limit(1000000000)

    with open(gencode_bed) as handle:
        for row in csv.reader(handle, delimiter='\t'):

            # Real lists (not lazy map objects) because they are sliced below.
            qstarts = [int(value) for value in row[11].strip(",").split(",")]
            blocksizes = [int(value)
                          for value in row[10].strip(",").split(",")]

            start = int(row[1])
            strand = row[5]
            chrom = row[0]

            # [1:-1] drops the first and last block of each record.
            for q1, b in zip(qstarts[1:-1], blocksizes[1:-1]):
                estart = start + q1
                eend = start + q1 + b

                # E5 is the 17-base window scored with U2_GTAG_3, E3 the
                # 13-base window scored with U2_GTAG_5.
                if strand == "-":
                    E5 = str(Genome[chrom][eend - 3:eend +
                                           14].reverse_complement()).upper()
                    E3 = str(Genome[chrom][estart - 10:estart +
                                           3].reverse_complement()).upper()
                else:
                    E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                    E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                U2_score = 0

                for position, base in enumerate(E5):
                    U2_score += U2_GTAG_3[base][position]

                for position, base in enumerate(E3):
                    U2_score += U2_GTAG_5[base][position]

                U2_score = percent(U2_score, TOTAL_U2_max_score)

                if E5[-5:-3] == "AG" and E3[3:5] == "GT":
                    exons.add((chrom, estart, eend, strand, U2_score))

    for chrom, estart, eend, strand, U2_score in exons:

        mean_conservation_vertebrates = mean_score(
            wWigIO.getIntervals(phylop_vertebrates, chrom, estart - 2,
                                eend + 2))
        mean_conservation_primates = mean_score(
            wWigIO.getIntervals(phylop_primates, chrom, estart - 2,
                                eend + 2))

        fields = (chrom, estart, eend, strand, U2_score,
                  mean_conservation_vertebrates, mean_conservation_primates)
        print(" ".join(str(field) for field in fields))
Exemple #8
0
 def __init__(self, fname):
     """Store the BigWig path on the instance and open that file via wWigIO."""
     self.fname = fname
     wWigIO.open(fname)
Exemple #9
0
import os, sys
import numpy as np
import scipy.stats
import tabix

import wWigIO
import bw_bin


def show_help():
    """Write a one-line usage example for this script to standard error.

    The text is the same as the former ``print >> sys.stderr`` statement
    produced (items joined by single spaces, newline-terminated), but it is
    emitted with ``sys.stderr.write`` so it also works on Python 3.
    """
    example_args = " /datd/huboqiang/test_NOM/02.SingleC/mESC_gF28_1/singleC/chr10.ACG.TCG.bed.gz COUNT_U COUNT_M 15"
    sys.stderr.write(
        " ".join(["\n\tpython ", sys.argv[0], example_args]) + "\n")


# Hard-coded path of the bigWig file used by this module.
in_bw = "/datd/huboqiang/test_NOM/mESC_nuc.sort.bw"
# Module-level side effect: the file is opened as soon as this module is
# imported or run, and it is not closed in the visible code.
wWigIO.open(in_bw)


class NDR(object):
    """Holder for the chromosome name and bin length of one run.

    Only ``chrom`` and ``bin_len`` are stored by the visible constructor;
    the remaining arguments are accepted but not used in the visible code.
    """

    def __init__(self, chrom, count_u, count_m, tb_file, cutoff=1e-5,
                 depth=3, bin_len=40, step_len=20, dist_len=140):
        """Record ``chrom`` and ``bin_len`` on the instance."""
        self.chrom, self.bin_len = chrom, bin_len
Exemple #10
0
 def __init__(self, fname):
     """Store the BigWig path on the instance and open that file via wWigIO."""
     self.fname = fname
     wWigIO.open(fname)
Exemple #11
0
# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__ == "__main__":
    # get information from bigwig file
    wWigIO.open('test.bw')
    try:
        chroms = wWigIO.getChromSize('test.bw')
        wigs = wWigIO.getIntervals('test.bw', 'chr1', 10, 200)
    finally:
        # close the handle even if one of the queries fails
        wWigIO.close('test.bw')
    print(wigs)

    # bigwig -> wig
    wWigIO.bigWigToWig('test.bw', 'test.wig')

    # write the chrom sizes into test.sizes, one "name<TAB>size" line each
    # NOTE(review): assumes getChromSize returns parallel (names, sizes)
    # sequences rather than a mapping, so they are paired with zip.
    with open('test.sizes', 'w') as fh:
        for chrom, size in zip(*chroms):
            fh.write("%s\t%s\n" % (chrom, size))

    # wig -> bigwig
    wWigIO.wigToBigWig('test.wig', 'test.sizes', 'test2.bw')
Exemple #12
0
def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 and conservation scores for every exon row of ``sim_fastq``.

    Before scoring, the dinucleotides at ``E5[-5:-3]`` and ``E3[3:5]`` are
    overwritten with ``AG`` and ``GT``. One space-separated line is printed
    to stdout per input row: chromosome, exon start, exon end, strand,
    combined U2 score, E5-side score, E3-side score, mean vertebrate
    conservation, mean primate conservation.

    Args:
        sim_fastq: Tab-delimited file with exactly six columns per row:
            chromosome, exon start, exon end, exon, exon length, strand.
        U2_GTAG_5_file: Passed to ``PWM_to_dict``; the result is indexed as
            ``matrix[base][position]`` over 13 positions.
        U2_GTAG_3_file: Same, indexed over 17 positions.
        phylop_vertebrates: File opened with ``wWigIO`` and queried with
            ``getIntervals``.
        phylop_primates: Same, for the second conservation track.

    Notes:
        Uses the module-level names ``Genome``, ``PWM_to_dict``, ``percent``,
        ``csv`` and ``wWigIO``. The two ``wWigIO`` files are opened here and
        left open.
    """

    def mean_score(intervals):
        """Average element [2] of each interval; 0 when there are none."""
        total = 0
        for interval in intervals:
            total += interval[2]
        if len(intervals) == 0:
            return total
        return total / len(intervals)

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

    # Best achievable score of each matrix: sum of the per-position maxima.
    U2_GTAG_5_max_score = 0
    for index in range(13):
        U2_GTAG_5_max_score += max(U2_GTAG_5['A'][index],
                                   U2_GTAG_5['C'][index],
                                   U2_GTAG_5['T'][index],
                                   U2_GTAG_5['G'][index])

    U2_GTAG_3_max_score = 0
    for index in range(17):
        U2_GTAG_3_max_score += max(U2_GTAG_3['A'][index],
                                   U2_GTAG_3['C'][index],
                                   U2_GTAG_3['T'][index],
                                   U2_GTAG_3['G'][index])

    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    with open(sim_fastq) as handle:
        for row in csv.reader(handle, delimiter='\t'):

            # Unpacking enforces the six-column layout.
            chrom, estart, eend, exon, exon_len, strand = row

            estart = int(estart)
            eend = int(eend)

            # E5 is the 17-base window scored with U2_GTAG_3, E3 the 13-base
            # window scored with U2_GTAG_5; on "-" both are
            # reverse-complemented.
            if strand == "-":
                E5 = str(Genome[chrom][eend - 3:eend +
                                       14].reverse_complement()).upper()
                E3 = str(Genome[chrom][estart - 10:estart +
                                       3].reverse_complement()).upper()
            else:
                E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

            # Overwrite the dinucleotides with AG / GT before scoring.
            E5 = E5[:-5] + "AG" + E5[-3:]
            E3 = E3[:3] + "GT" + E3[5:]

            U2_score = 0
            ME5_U2_score = 0
            ME3_U2_score = 0

            # "N" bases are skipped without advancing the matrix position.
            position = 0
            for base in E5:
                if base != "N":
                    U2_score += U2_GTAG_3[base][position]
                    ME5_U2_score += U2_GTAG_3[base][position]
                    position += 1

            position = 0
            for base in E3:
                if base != "N":
                    U2_score += U2_GTAG_5[base][position]
                    ME3_U2_score += U2_GTAG_5[base][position]
                    position += 1

            ME3_U2_score = percent(ME3_U2_score, U2_GTAG_5_max_score)
            ME5_U2_score = percent(ME5_U2_score, U2_GTAG_3_max_score)

            U2_score = percent(U2_score, TOTAL_U2_max_score)

            mean_conservation_vertebrates = mean_score(
                wWigIO.getIntervals(phylop_vertebrates, chrom, estart - 2,
                                    eend + 2))
            mean_conservation_primates = mean_score(
                wWigIO.getIntervals(phylop_primates, chrom, estart - 2,
                                    eend + 2))

            fields = (chrom, estart, eend, strand, U2_score, ME5_U2_score,
                      ME3_U2_score, mean_conservation_vertebrates,
                      mean_conservation_primates)
            print(" ".join(str(field) for field in fields))
Exemple #13
0
def main(row_ME, phylop_primates):
    """Open the files named by ``phylop_vertebrates`` and ``phylop_primates``.

    NOTE(review): ``phylop_vertebrates`` is not a parameter, so it has to
    exist as a module-level name -- confirm. ``row_ME`` is not used in the
    visible body.
    """
    for bigwig_path in (phylop_vertebrates, phylop_primates):
        wWigIO.open(bigwig_path)
Exemple #14
0
def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 and conservation scores for every exon row of ``sim_fastq``.

    Before scoring, the dinucleotides at ``E5[-5:-3]`` and ``E3[3:5]`` are
    overwritten with ``AG`` and ``GT``. One space-separated line is printed
    to stdout per input row: chromosome, exon start, exon end, strand,
    combined U2 score, E5-side score, E3-side score, mean vertebrate
    conservation, mean primate conservation.

    Args:
        sim_fastq: Tab-delimited file with exactly six columns per row:
            chromosome, exon start, exon end, exon, exon length, strand.
        U2_GTAG_5_file: Passed to ``PWM_to_dict``; the result is indexed as
            ``matrix[base][position]`` over 13 positions.
        U2_GTAG_3_file: Same, indexed over 17 positions.
        phylop_vertebrates: File opened with ``wWigIO`` and queried with
            ``getIntervals``.
        phylop_primates: Same, for the second conservation track.

    Notes:
        Uses the module-level names ``Genome``, ``PWM_to_dict``, ``percent``,
        ``csv`` and ``wWigIO``. The two ``wWigIO`` files are opened here and
        left open.
    """

    def mean_score(intervals):
        """Average element [2] of each interval; 0 when there are none."""
        total = 0
        for interval in intervals:
            total += interval[2]
        if len(intervals) == 0:
            return total
        return total / len(intervals)

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

    # Best achievable score of each matrix: sum of the per-position maxima.
    U2_GTAG_5_max_score = 0
    for index in range(13):
        U2_GTAG_5_max_score += max(U2_GTAG_5['A'][index],
                                   U2_GTAG_5['C'][index],
                                   U2_GTAG_5['T'][index],
                                   U2_GTAG_5['G'][index])

    U2_GTAG_3_max_score = 0
    for index in range(17):
        U2_GTAG_3_max_score += max(U2_GTAG_3['A'][index],
                                   U2_GTAG_3['C'][index],
                                   U2_GTAG_3['T'][index],
                                   U2_GTAG_3['G'][index])

    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    with open(sim_fastq) as handle:
        for row in csv.reader(handle, delimiter='\t'):

            # Unpacking enforces the six-column layout.
            chrom, estart, eend, exon, exon_len, strand = row

            estart = int(estart)
            eend = int(eend)

            # E5 is the 17-base window scored with U2_GTAG_3, E3 the 13-base
            # window scored with U2_GTAG_5; on "-" both are
            # reverse-complemented.
            if strand == "-":
                E5 = str(Genome[chrom][eend - 3:eend +
                                       14].reverse_complement()).upper()
                E3 = str(Genome[chrom][estart - 10:estart +
                                       3].reverse_complement()).upper()
            else:
                E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

            # Overwrite the dinucleotides with AG / GT before scoring.
            E5 = E5[:-5] + "AG" + E5[-3:]
            E3 = E3[:3] + "GT" + E3[5:]

            U2_score = 0
            ME5_U2_score = 0
            ME3_U2_score = 0

            # "N" bases are skipped without advancing the matrix position.
            position = 0
            for base in E5:
                if base != "N":
                    U2_score += U2_GTAG_3[base][position]
                    ME5_U2_score += U2_GTAG_3[base][position]
                    position += 1

            position = 0
            for base in E3:
                if base != "N":
                    U2_score += U2_GTAG_5[base][position]
                    ME3_U2_score += U2_GTAG_5[base][position]
                    position += 1

            ME3_U2_score = percent(ME3_U2_score, U2_GTAG_5_max_score)
            ME5_U2_score = percent(ME5_U2_score, U2_GTAG_3_max_score)

            U2_score = percent(U2_score, TOTAL_U2_max_score)

            mean_conservation_vertebrates = mean_score(
                wWigIO.getIntervals(phylop_vertebrates, chrom, estart - 2,
                                    eend + 2))
            mean_conservation_primates = mean_score(
                wWigIO.getIntervals(phylop_primates, chrom, estart - 2,
                                    eend + 2))

            fields = (chrom, estart, eend, strand, U2_score, ME5_U2_score,
                      ME3_U2_score, mean_conservation_vertebrates,
                      mean_conservation_primates)
            print(" ".join(str(field) for field in fields))
def main(gencode_bed, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 and conservation scores for internal exons of a BED file.

    For every internal block (first and last blocks are skipped) whose
    windows carry ``AG`` at ``E5[-5:-3]`` and ``GT`` at ``E3[3:5]``, one
    space-separated line is printed to stdout: chromosome, exon start, exon
    end, strand, U2 score, mean vertebrate conservation, mean primate
    conservation. Identical exons are reported once.

    Args:
        gencode_bed: Tab-delimited file; columns 0, 1, 5, 10 and 11 are read
            as chromosome, start, strand, comma-separated block sizes and
            comma-separated block starts.
        U2_GTAG_5_file: Passed to ``PWM_to_dict``; the result is indexed as
            ``matrix[base][position]`` over 13 positions.
        U2_GTAG_3_file: Same, indexed over 17 positions.
        phylop_vertebrates: File opened with ``wWigIO`` and queried with
            ``getIntervals``.
        phylop_primates: Same, for the second conservation track.

    Notes:
        Uses the module-level names ``Genome``, ``PWM_to_dict``, ``percent``,
        ``csv`` and ``wWigIO``. The two ``wWigIO`` files are opened here and
        left open.
    """

    def mean_score(intervals):
        """Average element [2] of each interval; 0 when there are none."""
        total = 0
        for interval in intervals:
            total += interval[2]
        if len(intervals) == 0:
            return total
        return total / len(intervals)

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

    # Best achievable score of each matrix: sum of the per-position maxima.
    U2_GTAG_5_max_score = 0
    for index in range(13):
        U2_GTAG_5_max_score += max(U2_GTAG_5['A'][index],
                                   U2_GTAG_5['C'][index],
                                   U2_GTAG_5['T'][index],
                                   U2_GTAG_5['G'][index])

    U2_GTAG_3_max_score = 0
    for index in range(17):
        U2_GTAG_3_max_score += max(U2_GTAG_3['A'][index],
                                   U2_GTAG_3['C'][index],
                                   U2_GTAG_3['T'][index],
                                   U2_GTAG_3['G'][index])

    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    exons = set()

    # Raise the field-size cap BEFORE the reader parses anything, so the
    # first row benefits from it as well.
    csv.field_size_limit(1000000000)

    with open(gencode_bed) as handle:
        for row in csv.reader(handle, delimiter='\t'):

            # Real lists (not lazy map objects) because they are sliced below.
            qstarts = [int(value) for value in row[11].strip(",").split(",")]
            blocksizes = [int(value)
                          for value in row[10].strip(",").split(",")]

            start = int(row[1])
            strand = row[5]
            chrom = row[0]

            # [1:-1] drops the first and last block of each record.
            for q1, b in zip(qstarts[1:-1], blocksizes[1:-1]):
                estart = start + q1
                eend = start + q1 + b

                # E5 is the 17-base window scored with U2_GTAG_3, E3 the
                # 13-base window scored with U2_GTAG_5.
                if strand == "-":
                    E5 = str(Genome[chrom][eend - 3:eend +
                                           14].reverse_complement()).upper()
                    E3 = str(Genome[chrom][estart - 10:estart +
                                           3].reverse_complement()).upper()
                else:
                    E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                    E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                U2_score = 0

                for position, base in enumerate(E5):
                    U2_score += U2_GTAG_3[base][position]

                for position, base in enumerate(E3):
                    U2_score += U2_GTAG_5[base][position]

                U2_score = percent(U2_score, TOTAL_U2_max_score)

                if E5[-5:-3] == "AG" and E3[3:5] == "GT":
                    exons.add((chrom, estart, eend, strand, U2_score))

    for chrom, estart, eend, strand, U2_score in exons:

        mean_conservation_vertebrates = mean_score(
            wWigIO.getIntervals(phylop_vertebrates, chrom, estart - 2,
                                eend + 2))
        mean_conservation_primates = mean_score(
            wWigIO.getIntervals(phylop_primates, chrom, estart - 2,
                                eend + 2))

        fields = (chrom, estart, eend, strand, U2_score,
                  mean_conservation_vertebrates, mean_conservation_primates)
        print(" ".join(str(field) for field in fields))