Example #1
def join_lists(mylist_1, mylist_2):
    """
	input: lists of (key, annotation)
	output: list of (key, annotation)
	
	"""
    if Utility_extended.is_listT_sorted(mylist_1) != 1:
        mylist_1 = sorted(mylist_1, key=itemgetter(0))
    mylist_1 = remove_redundancy(mylist_1)
    #print len(mylist_1)
    if Utility_extended.is_listT_sorted(mylist_2) != 1:
        mylist_2 = sorted(mylist_2, key=itemgetter(0))
    mylist_2 = remove_redundancy(mylist_2)
    #print len(mylist_2)
    mylist_1_IDs = [i[0] for i in mylist_1]
    mylist_2_IDs = [i[0] for i in mylist_2]

    outlist = []
    if len(mylist_1) <= len(mylist_2):
        for item in mylist_1:
            ID = item[0]
            if ID in mylist_2_IDs:
                index = mylist_2_IDs.index(ID)
                out = item[1] + mylist_2[index][1]
                outlist.append((ID, out))
    else:
        for item in mylist_2:
            ID = item[0]
            if ID in mylist_1_IDs:
                index = mylist_1_IDs.index(ID)
                out = mylist_1[index][1] + item[1]
                outlist.append((ID, out))
    return outlist
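join_lists is effectively an inner join on key, with remove_redundancy keeping only the first annotation per key. A self-contained sketch of the same semantics (names are illustrative, not part of Utility_extended) that replaces the O(n*m) list.index lookups with dictionaries:

from operator import itemgetter

def dedup_by_key(pairs):
    # sort by key and keep the first annotation seen for each key
    first = {}
    for key, annotation in sorted(pairs, key=itemgetter(0)):
        first.setdefault(key, annotation)
    return first

def join_lists_sketch(pairs_1, pairs_2):
    d1 = dedup_by_key(pairs_1)
    d2 = dedup_by_key(pairs_2)
    return [(key, d1[key] + d2[key]) for key in sorted(set(d1) & set(d2))]

print(join_lists_sketch([(1, ['a']), (2, ['b'])], [(2, ['c']), (3, ['d'])]))
# [(2, ['b', 'c'])]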
Example #2
def get_iri_by_gene(rc_on_shared_introns,
                    rc_on_shared_exons,
                    rc_threshold,
                    id_set=None):
    """
	normalize the intron read density by the average exon read density of that gene
	
	rc_on_shared_introns: {entrezID:[((start, end), rc)]}
	rc_on_shared_exons: {entrezID:[((start, end), rc)]}
	
	expression cutoff is implemented by id_set and rc_threshold
	returns {entrezID:[((start, end), iri)]}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rc_on_shared_introns,
                                                     id_set)
    myids = Utility_extended.get_subset_ids_from_dic(rc_on_shared_exons, myids)

    iri_by_gene = {}  #{entrez_id:[((start, end), iri)]}
    for entrez_id in myids:
        exons_rc = get_trons_rc(rc_on_shared_exons[entrez_id])
        if exons_rc > rc_threshold:
            iri_by_gene[entrez_id] = []
            exons_length = get_trons_length(rc_on_shared_exons[entrez_id])
            rc_density_on_exons = float(exons_rc) / exons_length
            for intron in rc_on_shared_introns[entrez_id]:
                intron_rc = intron[1]
                intron_coordinate = intron[0]
                intron_length = intron_coordinate[1] - intron_coordinate[0] + 1
                rc_density_on_intron = float(intron_rc) / intron_length
                iri = rc_density_on_intron / rc_density_on_exons
                iri_by_gene[entrez_id].append((intron_coordinate, iri))

    return iri_by_gene
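The iri computed above is a ratio of read densities: intron density over the gene's average exon density. A standalone worked example with made-up numbers:

def intron_retention_index(intron_rc, intron_length, exons_rc, exons_length):
    # IRI = (intron reads / intron length) / (total exon reads / total exon length)
    intron_density = float(intron_rc) / intron_length
    exon_density = float(exons_rc) / exons_length
    return intron_density / exon_density

# a 1 kb intron carrying 50 reads, in a gene whose shared exons total 2 kb with 400 reads
print(intron_retention_index(50, 1000, 400, 2000))  # 0.25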
Example #3
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms,  fragment_size, totalcount, out_file):
	lib_name = (bedfile).split('/')[-1] # remove directory
	suffix = lib_name.split('.')[-1] # txt
	lib_name = lib_name.split('.')[0] 
	extension = "-" + lib_name +'.' + suffix +"1"
	if Utility_extended.fileExists(bedfile):
		if Utility_extended.chrom_files_exist(chroms, extension) != 1:
			# Separate by chrom and sort by start
			print chroms, extension, " files do not exist, separate by chroms. "
			Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension, [column_index])
	else:
		print bedfile, " is not found"
		sys.exit(1)
	
	all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]}
	all_summary = {}
	
	for chrom in chroms:
		chrombed = chrom + extension
		entrez_genes_by_chrom =  Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
		(reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) =  calculateExonIntrons_by_chrom (entrez_genes_by_chrom, chrombed, fragment_size, totalcount, out_file)
		#if chrom == chroms[0]:
			#myid = reads_on_shared_exons.keys()[0]
			#test(entrez_genes_by_chrom, reads_on_shared_introns, myid)
		all_reads_on_shared_exons.update(reads_on_shared_exons)
		all_reads_on_shared_introns.update(reads_on_shared_introns)
		all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
		all_summary.update(summary)
		
	SeparateByChrom.cleanup(chroms, extension)
	return (all_reads_on_shared_exons, all_reads_on_shared_introns, all_reads_on_merged_transcripts, all_summary)
Example #4
def main(argv):
    parser = OptionParser()
    (opt, args) = parser.parse_args(argv)

    A = [(1, 2.5), (3.5, 15), (45, 71), (74, 93)]
    B = [(1.2, 2.5), (2, 7), (2, 2), (57, 84)]
    print A
    print B
    print Utility_extended.intersect(A, B, 0.0001)
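Utility_extended.intersect itself is not shown in these examples; a plausible standalone sketch (assuming both inputs are sorted and internally non-overlapping, which the toy B above is not) is a two-pointer sweep that keeps overlaps wider than min_width:

def intersect_intervals(a, b, min_width):
    # sweep two sorted interval lists; keep overlaps wider than min_width
    out = []
    i, j = 0, 0
    while i < len(a) and j < len(b):
        lo = max(a[i][0], b[j][0])
        hi = min(a[i][1], b[j][1])
        if hi - lo > min_width:
            out.append((lo, hi))
        if a[i][1] < b[j][1]:
            i += 1
        else:
            j += 1
    return out

print(intersect_intervals([(1, 2.5), (3.5, 15)], [(1.2, 2.5), (2.6, 7)], 0.0001))
# [(1.2, 2.5), (3.5, 7)]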
Example #5
def main(argv):
	parser = OptionParser()
	parser.add_option("-i", "--inputfile", action="store", type="string", dest="infile", metavar="<file>", help="name for input file, which is a multicolumn text file")
	parser.add_option("-c", "--column", action="store", type="int", dest="c", metavar="<int>", help="the index of the column to be rescaled, 1-based")
	parser.add_option("-r", "--rescale_factor", action="store", type="float", dest="rescale_factor", metavar="<float>", help="the rescale factor that will be multiplied to the numbers in column c")
	parser.add_option("-o", "--outputfile", action="store", type="string", dest="outfile", metavar="<file>", help="name for output file")
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 8:
		parser.print_help()
		sys.exit(1)
	
	Utility_extended.rescale_a_column(opt.infile, opt.c-1, opt.rescale_factor, opt.outfile)
Example #6
def rank_iri_by_intron_length(iri, id_set=None):
    """
	Test whether long introns tend to have higher intron retention.
	Uses the gene-based (iri) normalization of the intron read densities.
	
	iri: {entrezID:[((start, end), iri)]}
	returns [(id, length, iri_value)]
	"""
    #print "3"
    myids = Utility_extended.get_subset_ids_from_dic(iri, id_set)
    #print len(myids)

    ranked_list = []
    for myid in myids:
        #print "The id is ", myid
        introns = iri[myid]
        for item in introns:
            start = item[0][0]
            end = item[0][1]
            length = end - start + 1
            iri_value = item[1]
            ranked_list.append((myid, length, iri_value))
    ranked_list.sort(key=itemgetter(1))

    return ranked_list
Example #7
def getShared5UTR(transcripts, min_width=5):
    my_5UTRs = []
    for transcript in transcripts:
        my_5UTR = transcript.get5UTR(0, 0)
        if len(my_5UTR) > 0:
            my_5UTRs.append(my_5UTR)
    return Utility_extended.shared(my_5UTRs)
Example #8
def get_tron_rpkm_histogram(rpkm_distribution, mylabel, mytitle, id_set=None):
    """
	A significant fraction (35%) of intronic regions are free of reads. Is that
	because of mappability? The simplest check is to output these regions and
	inspect them on a genome browser. Another is to compute read counts from an
	unrelated ChIP-Seq (K36me3) or RNA-Seq library on the same regions and use a
	scatter plot to assess mappability.

	rpkm_distribution: {entrezID:[((start, end), rpkm)]}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rpkm_distribution, id_set)
    myids.sort()  # enable predictable behavior of entrez_id order

    all_list = []  # one entry per tronic region

    for entrez_id in myids:
        rpkms = [item[1] for item in (rpkm_distribution)[entrez_id]]
        all_list.extend(rpkms)

    plt.clf()
    plt.hist(all_list, bins=40, color='r', normed=True, log=True)
    # mytitle = "Intron read density (normalized by average exon read density of respective gene) histogram"
    plt.title(mytitle)
    plt.xlabel(mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")
    return all_list
Example #9
def getMergedExonicRegions(transcripts):
    # Return the merged exons in the format of list of (start, end)
    all_exons = []
    for transcript in transcripts:
        all_exons += transcript.getExons()
    all_exons = sorted(all_exons, key=itemgetter(0))
    return Utility_extended.union(all_exons)
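Utility_extended.union merges a start-sorted interval list into disjoint regions; a minimal standalone sketch of that operation:

def union_intervals(intervals):
    # merge start-sorted (start, end) intervals into disjoint regions
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

print(union_intervals([(1, 5), (4, 9), (12, 15)]))  # [(1, 9), (12, 15)]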
Example #10
def assign_islands_to_REs(re_file_dir, re_file_name, chrom, chrom_length, island_list, upstream, downstream, min_re_length=0):
	"""
	islands are non-overlapping [start, end]
	returns {(start,end): [id]}
	
	"""
	currentdir = os.getcwd()
	os.chdir(re_file_dir)
	#{id:RepElement}
	#
	known_repelements = RepElements.KnownRepElements.initiate_from_file([chrom], re_file_name)
	#print len(known_repelements.rep_elements.keys())
	island_flags = [0 for island in island_list]
	for myid in known_repelements.rep_elements.keys():
		myelement = known_repelements.rep_elements[myid]
		#No matter whether positive or negative, genoStart < genoEnd
		if plus.match(myelement.strand):
			start = max(myelement.genoStart - upstream, 0)
			end = min(myelement.genoEnd + downstream, chrom_length)
		elif minus.match(myelement.strand):
			start = max(myelement.genoStart - downstream, 0)
			end = min(myelement.genoEnd + upstream, chrom_length)
		else:
			print myelement
			print "strand not recognized"
			exit(1)
		
		region = (start, end)
		(start_index, end_index) = Utility_extended.find_islands_overlapping_with_region(region, island_list) #returns [island]
		for index in range(start_index, end_index):
			island_flags[index] = 1 # These islands overlap with REs
		#print re_file_name, region, start_index, end_index
	os.chdir(currentdir)
	return island_flags
Example #11
    def find_colliding_ids(self, entrez_gene_boundaries, extension=0):
        """
		entrez_gene_boundaries: [(start, end, entrez_id)]
		build and return a dict: {id:[ids_in_collision]}, where the value is the list of ids that collide with the key id
		"""

        #union the boundaries to find clusters
        clusters_of_ids = Utility_extended.union_with_trace(
            entrez_gene_boundaries, extension
        )  #Output {(start, end):[ entrez_gene_boundaries elements that contribute to that region]}

        entrez_id_collision_dic = {}
        for item in entrez_gene_boundaries:
            myid = item[2]
            entrez_id_collision_dic[myid] = []

        for region in clusters_of_ids.keys():
            ids = [item[2] for item in clusters_of_ids[region]]
            # although in the same union, a pair of ids need not directly overlap
            # Make sure a pair is traversed only once
            for i in xrange(len(ids)):
                myid = ids[i]
                for j in range(i + 1, len(ids)):
                    the_other_id = ids[j]
                    if self.is_overlapping(myid, the_other_id, extension) == 1:
                        entrez_id_collision_dic[myid].append(the_other_id)
                        entrez_id_collision_dic[the_other_id].append(myid)
        return entrez_id_collision_dic
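The nested i/j loop above visits each unordered pair of ids exactly once; itertools.combinations expresses the same traversal more directly (a stylistic alternative, not the library's code):

from itertools import combinations

ids = [101, 202, 303]
for myid, the_other_id in combinations(ids, 2):
    print((myid, the_other_id))
# (101, 202)
# (101, 303)
# (202, 303)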
Example #12
def get_read_count_on_genic_regions(geneList, bedFile, fragment_size):
    """
	only deals with one chrom
	geneList is a UCSC_lite object: name, chrom, strand, txStart, txEnd
	
	Returns three lists: gene name, length,  read count
	"""

    (gene_name_list, region_start_list,
     region_end_list) = get_feature_lists(geneList)
    tag_position_list = []
    f = open(bedFile, 'r')
    for line in f:
        if not re.match("#", line):
            line = line.strip()
            sline = line.split()
            tag_position_list.append(
                associate_tags_with_regions.tag_position(sline, fragment_size))
    f.close()
    if not Utility_extended.is_list_sorted(tag_position_list):
        tag_position_list.sort()
    #A list, with total tag number on this region, order as the region lists
    read_count_list = associate_tags_with_regions.find_readcount_on_regions(
        tag_position_list, region_start_list, region_end_list)

    assert len(gene_name_list) == len(read_count_list)
    region_length_list = [0] * len(gene_name_list)
    for i in xrange(len(gene_name_list)):
        region_length_list[i] = region_end_list[i] - region_start_list[i]

    return gene_name_list, region_length_list, read_count_list
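find_readcount_on_regions is not shown here, but given sorted tag positions, counting the reads on one region takes two binary searches; a self-contained sketch of that core step (helper name is illustrative):

import bisect

def readcount_on_region(sorted_tag_positions, start, end):
    # number of tags with start <= position <= end
    lo = bisect.bisect_left(sorted_tag_positions, start)
    hi = bisect.bisect_right(sorted_tag_positions, end)
    return hi - lo

print(readcount_on_region([10, 20, 30, 40], 15, 35))  # 2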
Example #13
def join_dics(mydic_1, mydic_2):
    outdic = {}
    intersection = Utility_extended.intersection(mydic_1.keys(),
                                                 mydic_2.keys())
    for ID in intersection:
        out = mydic_1[ID] + mydic_2[ID]
        outdic[ID] = out
    return outdic
Example #14
def get_feature_level(re_tree, summary_name):
    """
	Find the mean rpkm, median rpkm, or presence of each mark for each species 
	return:
	feature_level_mean: {reClass:{reFamily:{reName:{feature_name:level}}}}
	feature_level_median: {reClass:{reFamily:{reName:{feature_name:level}}}}
	feature_enrichment: {reClass:{reFamily:{reName:{feature_name:enrichment_ratio}}}}, 
		where enrichment_ratio = # sites with the mark/ # of sites
	"""
    feature_level_mean = {}
    feature_level_median = {}
    feature_enrichment = {}

    flag = 0

    for reClass in re_tree.keys():
        feature_level_mean[reClass] = {}
        feature_level_median[reClass] = {}
        feature_enrichment[reClass] = {}

        for reFamily in re_tree[reClass].keys():
            feature_level_mean[reClass][reFamily] = {}
            feature_level_median[reClass][reFamily] = {}
            feature_enrichment[reClass][reFamily] = {}

            for reName in re_tree[reClass][reFamily]:
                feature_level_mean[reClass][reFamily][reName] = {}
                feature_level_median[reClass][reFamily][reName] = {}
                feature_enrichment[reClass][reFamily][reName] = {}

                summary_file_name = summary_name + "_on_" + "_".join(
                    [reClass, reFamily, reName]) + ".pkl"
                assert (Utility_extended.fileExists(summary_file_name) == 1)
                inf = open(summary_file_name, 'rb')
                # {id:{feature_name:value}}
                reClass_reFamily_reName_summary = pickle.load(inf)
                inf.close()

                if flag == 0:  # Do this only one time

                    feature_names = AnalyzeRNASeq.get_feature_names(
                        reClass_reFamily_reName_summary)
                    print "\nFeature names are: ", feature_names
                    flag = 1

                for feature_name in feature_names:
                    if feature_name != "annotation":
                        mean, median = calculate_level(
                            reClass_reFamily_reName_summary, feature_name)
                        feature_level_mean[reClass][reFamily][reName][
                            feature_name] = mean
                        feature_level_median[reClass][reFamily][reName][
                            feature_name] = median
                        feature_enrichment[reClass][reFamily][reName][
                            feature_name] = calculate_enrichment(
                                reClass_reFamily_reName_summary, feature_name)

    return feature_level_mean, feature_level_median, feature_enrichment
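calculate_level and calculate_enrichment are not shown. Given the pickled layout {id:{feature_name:value}} and the docstring's definition of enrichment_ratio, plausible sketches (hypothetical, with a presence threshold as in find_pattern below) are:

import numpy

def calculate_level_sketch(summary, feature_name):
    # mean and median of one feature across all sites
    values = [features[feature_name] for features in summary.values()]
    return numpy.mean(values), numpy.median(values)

def calculate_enrichment_sketch(summary, feature_name, threshold=0.0001):
    # fraction of sites where the mark is present (value above threshold)
    values = [features[feature_name] for features in summary.values()]
    positives = sum(1 for v in values if v > threshold)
    return float(positives) / len(values)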
Example #15
def find_pattern(re_tree,
                 summary_name,
                 present_list,
                 absent_list,
                 threshold=0.0001):
    """
	present_list:[feature_name] features that are required to be present
	absent_list:[feature_name] features that are required to be absent
	
	return: {reClass:{reFamily:{reName:{feature_name:enrichment_ratio}}}}
	{reClass:{reFamily:{reName:{feature_name:[ids]}}}}
	"""

    pattern_enrichment = {}
    pattern_positive_ids = {}
    flag = 0

    for reClass in re_tree.keys():
        pattern_enrichment[reClass] = {}
        pattern_positive_ids[reClass] = {}

        for reFamily in re_tree[reClass].keys():
            pattern_enrichment[reClass][reFamily] = {}
            pattern_positive_ids[reClass][reFamily] = {}

            for reName in re_tree[reClass][reFamily]:

                summary_file_name = summary_name + "_on_" + "_".join(
                    [reClass, reFamily, reName]) + ".pkl"
                assert (Utility_extended.fileExists(summary_file_name) == 1)
                inf = open(summary_file_name, 'rb')
                # {id:{feature_name:value}}
                reClass_reFamily_reName_summary = pickle.load(inf)
                inf.close()

                if flag == 0:  # Do this only one time

                    feature_names = AnalyzeRNASeq.get_feature_names(
                        reClass_reFamily_reName_summary)
                    print "\nFeature names are: ", feature_names

                    assert (set(present_list).issubset(set(feature_names)))
                    assert (set(absent_list).issubset(set(feature_names)))

                    flag = 1

                enrichment, positive_ids = calculate_pattern_enrichment_in_single_species(
                    reClass_reFamily_reName_summary, present_list, absent_list,
                    threshold)

                pattern_enrichment[reClass][reFamily][reName] = enrichment
                pattern_positive_ids[reClass][reFamily][reName] = positive_ids

    return pattern_enrichment, pattern_positive_ids
Example #16
def getSharedExonicRegions(transcripts, min_width=5):
    """
	shared_exons: a list of (start, end) regions shared among all input transcripts; existing exons may be broken up.
	sorted
	"""

    shared_exons = transcripts[0].getExons()
    for index in range(1, len(transcripts)):
        current_exons = transcripts[index].getExons()
        shared_exons = Utility_extended.intersect(shared_exons, current_exons,
                                                  min_width)
    return shared_exons  #sorted
Example #17
def get_read_count(re_file_dir, re_file_name, feature_name, chrom,
                   chrom_length, tag_position_list, total_count, upstream,
                   downstream, min_re_length):
    """
	returns {id:{feature_name: value}}
	feature_name include: feature_name + "_rc", feature_name + "_rpkm"
	
	"""
    currentdir = os.getcwd()
    os.chdir(re_file_dir)
    known_repelements = RepElements.KnownRepElements.initiate_from_file(
        [chrom], re_file_name)

    regions = []
    for myid in known_repelements.rep_elements.keys():
        myelement = known_repelements.rep_elements[myid]
        #No matter whether positive or negative, genoStart < genoEnd
        if plus.match(myelement.strand):
            start = max(myelement.genoStart - upstream, 0)
            end = min(myelement.genoEnd + downstream, chrom_length)
        elif minus.match(myelement.strand):
            start = max(myelement.genoStart - downstream, 0)
            end = min(myelement.genoEnd + upstream, chrom_length)
        else:
            print myelement
            print "strand not recognized"
            exit(1)
        regions.append((start, end, myelement.id))

    tag_list = [(element, 0) for element in tag_position_list]
    read_counts = Utility_extended.get_read_counts_on_regions(
        tag_list, regions)  #returns a list, [region, read_count]
    rc_dic = {}
    for item in read_counts:
        region = item[0]
        start = region[0]
        end = region[1]
        myid = region[2]
        if start == end:
            print chrom, myid, start, end
        if (end - start) >= min_re_length:  # only include those with length >= min_re_length
            rc = item[1]
            rpkm = rc / ((total_count) / 1000000.0)
            rpkm = rpkm / ((end - start) / 1000.0)
            rc_dic[myid] = {}
            rc_dic[myid][feature_name + "_rc"] = rc
            rc_dic[myid][feature_name + "_rpkm"] = rpkm
    os.chdir(currentdir)
    return rc_dic
Example #18
def associate_tags_with_3UTR (tag_positions, UTRregion_start, UTRregion_end):
	#Cannot use similar code from Utility_Extended as it requires strings while we are dealing with integers
	my_tag_list = []
	if (Utility_extended.is_list_sorted(tag_positions)==0):
		my_tag_list = sorted(tag_positions)
	else:
		my_tag_list = tag_positions
		
	assert (UTRregion_start<=UTRregion_end)
	start_ind = bisect.bisect_left(my_tag_list, UTRregion_start)
	end_ind = bisect.bisect_right(my_tag_list, UTRregion_end)
	tags = my_tag_list[start_ind : end_ind]
	
	return tags
Example #20
def remove_redundancy(mylist):
    """
	list item: (key, annotation)
	If multiple elements have the same key, only the first element is retained.
	"""
    unique_list = []
    if mylist != []:
        if Utility_extended.is_listT_sorted(mylist) != 1:
            mylist = sorted(mylist, key=itemgetter(0))
        unique_list.append(mylist[0])
        for i in range(1, len(mylist)):
            if mylist[i][0] != mylist[i - 1][0]:
                unique_list.append(mylist[i])
    return unique_list
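Worked example of the semantics, assuming the module's imports are in place (Python's sort is stable, so for duplicate keys the pair that comes first after sorting wins):

print(remove_redundancy([(3, ['b']), (1, ['a']), (1, ['c'])]))
# [(1, ['a']), (3, ['b'])]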
Example #21
def log_transform_rpkm_distribution(rd_distribution,
                                    pc=0.0000000001,
                                    id_set=None):
    """
	returns {entrezID:[((start, end), log2(density + pc))]}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rd_distribution, id_set)

    log_read_density_distribution = {}
    for entrez_id in myids:
        rpkms = rd_distribution[entrez_id]
        log_read_density_distribution[entrez_id] = [
            (item[0], log(item[1] + pc, 2)) for item in rpkms
        ]
    return log_read_density_distribution
Example #22
def getSharedIntronicRegions(transcripts, min_width=5):
    """
	shared_introns: a list of (start, end) regions shared among all input transcripts; existing introns may be broken up.
	sorted
	"""

    shared_introns = transcripts[0].getIntrons()
    for index in range(1, len(transcripts)):
        current_introns = transcripts[index].getIntrons()
        #if len(current_introns) == 0 and len(shared_introns) == 0:
        #for transcript in transcripts:
        #print transcript.getAll()
        shared_introns = Utility_extended.intersect(shared_introns,
                                                    current_introns, min_width)
    return shared_introns  #sorted
Example #23
def get_rd_relative_fluctuation_histogram(rd_distribution,
                                          num_introns_cutoff,
                                          mylabel,
                                          mytitle,
                                          pc=0.000000001,
                                          id_set=None):
    """
	rd_distribution:{entrezID:[((start, end), rd)]}
	For each simple gene, calculate the relative fluctuation of the per-intron iri:
	standard deviation/mean. The relative fluctuation (coefficient of variation)
	makes genes with different overall read densities comparable.
	Returns {entrez_id: rf}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rd_distribution, id_set)

    relative_fluctuation = {}
    for entrez_id in myids:
        this_gene = rd_distribution[entrez_id]
        if len(this_gene) >= num_introns_cutoff:
            iris = [item[1] for item in this_gene]
            mean = numpy.average(iris)
            if mean > 0:
                std = numpy.std(iris)
                rf = std / mean
                relative_fluctuation[entrez_id] = rf
    plt.clf()
    plt.figure(1)
    plt.subplot(211)
    plt.hist(relative_fluctuation.values(),
             bins=50,
             color='g',
             normed=True,
             log=True)
    plt.title(mytitle)
    plt.xlabel(mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')

    plt.subplot(212)
    plt.hist([log(item + pc, 2) for item in relative_fluctuation.values()],
             bins=50,
             color='r',
             normed=True,
             log=True)
    plt.xlabel("log " + mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")

    return relative_fluctuation
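The std/mean ratio above is the coefficient of variation; a standalone check with numpy:

import numpy

def coefficient_of_variation(values):
    # dimensionless, so genes with different overall read densities are comparable
    mean = numpy.average(values)
    return numpy.std(values) / mean if mean > 0 else float('nan')

print(coefficient_of_variation([1.0, 2.0, 3.0]))  # ~0.408 (population std / mean)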
Example #24
def get_read_count_on_exons(gene_coords, bedFile, fragment_size):
    """
	only deals with one chrom
	gene_coords is a list of UCSC object

	Return: three lists: geneName, exonsTotalLength, exonsTotalReadCount
	"""
    tag_position_list = []
    f = open(bedFile, 'r')
    for line in f:
        if not re.match("#", line):
            line = line.strip()
            sline = line.split()
            tag_position_list.append(
                associate_tags_with_regions.tag_position(sline, fragment_size))
    f.close()
    if not Utility_extended.is_list_sorted(tag_position_list):
        tag_position_list.sort()

    geneName = []
    exonsTotalLength = []  #used for calculating the RPKM value
    exonsTotalReadCount = []
    for g in gene_coords:
        geneName.append(g.name)
        if g.exonCount > 0:
            exon_Starts_str = (g.exonStarts.split(
                ','))[:-1]  #remove the last '' because the format is '1,2,3,'
            exon_Ends_str = (g.exonEnds.split(
                ','))[:-1]  #remove the last '' because the format is '1,2,3,'
            exon_Starts = [int(x) for x in exon_Starts_str]
            exon_Ends = [int(x) for x in exon_Ends_str]
            assert len(exon_Starts) == len(exon_Ends)

            totalLength = 0
            for i in xrange(len(exon_Starts)):
                totalLength += exon_Ends[i] - exon_Starts[i]
            exonsTotalLength.append(totalLength)

            exon_read_count_list = associate_tags_with_regions.find_readcount_on_regions(
                tag_position_list, exon_Starts, exon_Ends)
            exonsTotalReadCount.append(sum(exon_read_count_list))
        else:
            exonsTotalLength.append(0)
            exonsTotalReadCount.append(0)
    return geneName, exonsTotalLength, exonsTotalReadCount
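The exon-string parsing above follows the UCSC convention of a trailing comma; in isolation:

exonStarts = "1000,2000,3000,"  # UCSC exonStarts/exonEnds fields end with ','
starts = [int(x) for x in exonStarts.split(',')[:-1]]
print(starts)  # [1000, 2000, 3000]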
Example #25
def get_coverage(re_file_name,
                 chrom,
                 chrom_length,
                 islands,
                 upstream,
                 downstream,
                 min_re_length=10):
    """
	Find the coverage for each re instances  
	returns {id:value}
	"""
    known_repelements = RepElements.KnownRepElements.initiate_from_file(
        [chrom], re_file_name)
    #Get the regions defined by the REs
    regions = []  #(start, end, myelement.id)
    for myid in known_repelements.rep_elements.keys():
        myelement = known_repelements.rep_elements[myid]
        #No matter whether positive or negative, genoStart < genoEnd
        if plus.match(myelement.strand):
            start = max(myelement.genoStart - upstream, 0)
            end = min(myelement.genoEnd + downstream, chrom_length)
        elif minus.match(myelement.strand):
            start = max(myelement.genoStart - downstream, 0)
            end = min(myelement.genoEnd + upstream, chrom_length)
        else:
            print myelement
            print "strand not recognized"
            exit(1)
        regions.append((start, end, myelement.id))

    #{region:coverage}, region:(start, end, myelement.id)
    regions_w_coverage = Utility_extended.find_coverage_by_islands_on_regions(
        regions, islands)
    #change to {id:coverage}
    coverage_dic = {}
    for myregion in regions_w_coverage.keys():
        start = myregion[0]
        end = myregion[1]
        length = end - start + 1
        if length >= min_re_length:
            myid = myregion[2]
            coverage_dic[myid] = regions_w_coverage[myregion]

    return coverage_dic
Example #26
def find_reads_on_regions(read_file, regions, shift, outfile_name, boundary_extension = 0):
	"""
	regions: [BED3]. The regions can overlap
	read_file contains reads from only one chrom
	Return:
	read count on each island: {(start, end): rc}
	"""
	regions_rc = [] #[(region, rc)]
	tag_list = [] #[(position, line)]
	tag_flag_list = [] # Flags whether a read passes filtering.
	
	if Utility.fileExists(read_file):
		f = open(read_file,'r')
		for line in f:
			if not re.match("#", line):
				line = line.strip()
				sline = line.split()
				position = tag_position(sline, shift)
				tag_list.append((position, line))
		f.close()
	# sort according to position
	if (Utility_extended.is_listT_sorted(tag_list)==0):
		tag_list.sort(key=itemgetter(0))
	positions = [tag[0] for tag in tag_list]
	tag_flag_list = [0 for tag in tag_list]
	
	for region in regions:
		start = max(region.start - boundary_extension, 0)
		end = region.end + boundary_extension 
		assert (start<=end)
		start_ind = bisect.bisect_left(positions, start)
		end_ind = bisect.bisect_right(positions, end)
		for index in range(start_ind, end_ind): # These reads are on regions
			tag_flag_list[index] = 1
		rc = end_ind - start_ind
		regions_rc.append((region, rc))

	o = open(outfile_name, 'w')
	for i in xrange(len(tag_list)):
		if tag_flag_list[i] == 1:
			o.write( tag_list[i][1] + '\n') # tag_list[i][1] = line
	o.close()

	return regions_rc
Example #27
def get_fraction_retained_intron_per_gene_histogram(rc_distribution,
                                                    rc_threshold,
                                                    mytitle,
                                                    id_set=None):
    """
	rc_distribution: {entrezID:[((start, end), rc)]}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rc_distribution, id_set)
    myids.sort()  # enable predictable behavior of entrez_id order

    all_list = {}  # one entry per gene
    total = 0.0
    total_above_threshold = 0.0
    total_no_tron_genes = 0
    for entrez_id in myids:
        rcs = [item[1]
               for item in (rc_distribution)[entrez_id]]  #read counts[]
        if len(rcs) > 0:
            above_threshold = 0
            for item in rcs:
                if item >= rc_threshold:
                    above_threshold += 1
            above_threshold_fraction = float(above_threshold) / (len(rcs))
            all_list[entrez_id] = above_threshold_fraction
            total += len(rcs)
            total_above_threshold += above_threshold
        else:
            #print "%s has no trons" %entrez_id
            total_no_tron_genes += 1
    print "%d out of %d genes have no trons" % (total_no_tron_genes,
                                                len(myids))
    print "The number of introns with read count >= %d is %d, fraction  %f" % (
        rc_threshold, total_above_threshold,
        float(total_above_threshold) / total)
    plt.clf()
    plt.hist(all_list.values(), bins=100, color='r', normed=True)
    # mytitle = "Intron read density (normalized by average exon read density of respective gene) histogram"
    plt.title(mytitle)
    plt.xlabel("Fraction of retained introns per gene ")
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")
    return all_list
Example #28
def get_rd_fluctuation_histogram(rd_distribution,
                                 num_introns_cutoff,
                                 mylabel,
                                 mytitle,
                                 pc=0.000000001,
                                 id_set=None):
    """
	rd_distribution:{entrezID:[((start, end), rd)]}
	For each simple gene, calculate the fluctuation (standard deviation) of per-intron read density normalized by expression.
	
	can also be used for iri
	Return {entrez_id:value}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rd_distribution, id_set)

    fluctuation = {}
    for entrez_id in myids:
        this_gene = rd_distribution[entrez_id]
        if len(this_gene) >= num_introns_cutoff:
            iris = [item[1] for item in this_gene]
            std = numpy.std(iris)
            fluctuation[entrez_id] = std
    plt.clf()
    plt.figure(1)
    plt.subplot(211)
    plt.hist(fluctuation.values(), bins=50, color='g', normed=True, log=True)
    plt.title(mytitle)
    plt.xlabel(mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')

    plt.subplot(212)
    plt.hist([log(item + pc, 2) for item in fluctuation.values()],
             bins=50,
             color='r',
             normed=True,
             log=True)
    plt.xlabel("log " + mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")

    return fluctuation
Example #29
def get_distribution_ito_tron_number(rc_distribution, id_set=None):
    """
	Most appropriate for simple genes
	ito: in terms of 
	Reorganize rc_distribution so that it is organized by the number of trons
	Return: {number_of_trons:{entrez_id:[((start, end), iri)]}}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rc_distribution, id_set)

    distribution_ito_tron_number = {}  # {number_of_trons:{entrez_id:[((start, end), iri)]}}
    for entrez_id in myids:
        rcs = (rc_distribution)[entrez_id]
        number_of_trons = len(rcs)
        if number_of_trons not in distribution_ito_tron_number:
            distribution_ito_tron_number[number_of_trons] = {}
        distribution_ito_tron_number[number_of_trons][entrez_id] = rcs
    return distribution_ito_tron_number
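With collections.defaultdict the same grouping needs no explicit key check; a standalone sketch with toy data:

from collections import defaultdict

rc_distribution = {11: [((0, 9), 1.0)], 22: [((0, 9), 2.0), ((20, 29), 3.0)]}
by_tron_number = defaultdict(dict)
for entrez_id, rcs in rc_distribution.items():
    by_tron_number[len(rcs)][entrez_id] = rcs
print(dict(by_tron_number))
# {1: {11: [((0, 9), 1.0)]}, 2: {22: [((0, 9), 2.0), ((20, 29), 3.0)]}}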
Example #30
def get_expression_rpkms_on_shared_exons(rc_on_shared_exons,
                                         totalcount,
                                         id_set=None):
    """
	rc_on_shared_exons:{entrezID:[((start, end), read_count)]}
	Returns {id:value}
	"""
    myids = Utility_extended.get_subset_ids_from_dic(rc_on_shared_exons,
                                                     id_set)

    expression_rpkm = {}
    for myid in myids:
        exons = rc_on_shared_exons[myid]

        shared_exons_total_length = sum(
            [item[0][1] - item[0][0] + 1 for item in exons])
        shared_exons_rc = sum([item[1] for item in exons])
        expression_rpkm[myid] = shared_exons_rc * (
            1000.0 / shared_exons_total_length) * (1000000 / float(totalcount))
    return expression_rpkm
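The arithmetic above is the standard RPKM definition; as a standalone helper with a worked number:

def rpkm(read_count, region_length, total_count):
    # reads per kilobase of region per million mapped reads
    return read_count * (1000.0 / region_length) * (1000000.0 / total_count)

print(rpkm(200, 2500, 20000000))  # 4.0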
Example #31
def print_out_summary(lib_file, outputfile):
    """
	lib file in pickle format: {repID:{feature:value}}
	output the rc data of the given pickle file as a tab-delimited table
	"""
    assert (Utility_extended.fileExists(lib_file) == 1)
    lib = pickle.load(open(lib_file, 'rb'))
    myid = lib.keys()[0]
    mykeys = (lib[myid]).keys()
    mykeys = sorted(mykeys)
    of = open(outputfile, 'w')
    oline = "ID" + "\t" + ("\t").join(mykeys) + "\n"
    of.write(oline)
    for myid in lib.keys():
        features = lib[myid]  # avoid shadowing the re module
        oline = str(myid)
        for feature in mykeys:
            oline += "\t" + str(features[feature])
        oline += "\n"
        of.write(oline)
    of.close()
Example #32
def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold, PAfile, extension, index):
	"""
	entrez genes are made sure to be on one strand, 
	the bed file are reads for that strand

	entrez_genes is a KnownEntrezGenes class object
	The raw read file needs to conform to bed format

	index: column in the bed file used for sorting

	"""
	# Separate reads by chrom 
	rawreadslibName1 = (bedfile).split('/')[-1]
	rawreadssuffix1 = rawreadslibName1.split('.')[-1] 
	rawreadslibName1 = rawreadslibName1.split('.')[0]
	rawreadsextension1 = "-" + rawreadslibName1 +'.' + rawreadssuffix1 + "1"
	if Utility_extended.fileExists(bedfile):
		if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
			# Separate by chrom and sort by start
			print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
			Utility_extended.separate_by_chrom_sort(chroms, bedfile, rawreadsextension1, str(index))
	else:
		print bedfile, " is not found"
		sys.exit(1)

	# Read in the polyadenylation sites
	PAsiteslist = []
	PA1 = open(PAfile, 'r')
	for line in PA1:
		line = line.strip('\n')
		if line != '':
			sline = line.split('\t')
			PAsiteslist.append((sline[0], sline[1]))
	PA1.close()

	# Here the output is 'a', i.e. the output is appended to an existing file instead of creating one
	outf = open(outfile, 'a')	
	for chrom in chroms: 
		if chrom in entrez_genes.chroms:
			# a KnownEntrezGenes object
			entrez_genes_by_chrom =  Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
			# Get the read locations
			if Utility_extended.fileExists(chrom + rawreadsextension1):
				f = open(chrom + rawreadsextension1, 'r')
				tag_positions = []
				for line in f:
					line = line.strip()
					sline = line.split()
					#make sure the extension is always 0, otherwise the rest of the program might not work as intended
					tag_positions.append(associate_tags_with_regions.tag_position(sline, 0))
				
				f.close()
				if not Utility_extended.is_list_sorted(tag_positions):
					tag_positions.sort()					
				#By this point tag_positions is a sorted list of all the reads located on the strand and chromosome the code is currently dealing with

				for entrez_id in entrez_genes_by_chrom.entrez_ids:
					gene = entrez_genes_by_chrom.entrez_genes[entrez_id] # an EntrezGene class object
					# get_3UTRs gets the ENTREZ 3'UTR, which appears to generally give the beginning of the 3'UTR and a site very close to the most distal polyadenylation site
					three_UTRs = gene.get_3UTRs()
					# Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites given to create the true data for the 3'UTR needed for CUTR_vs_AUTR to work
					true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end, UTRbeginning = Mastertuplemaker(three_UTRs,PAsiteslist,chrom,gene.strand, extension)
					# only 3'UTRs with more than one polyA site need to be considered
					if len(true3UTRends) > 1:
						#find all reads inside the 3'UTR
						inside_reads = associate_tags_with_3UTR(tag_positions, UTRregion_start, UTRregion_end)
						#finds reads in each region of the 3'UTR and calculates aUTR/cUTR for each of them
						#PolyAsites potentially useful for output
						RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(true3UTRstarts, true3UTRends, inside_reads, gene.strand, threshold)
						
						#important if one wants to output gene_symbol information
						gene_symbol = []
						for mytranscript in gene.transcripts:
							if mytranscript.additional_annotations[0] not in gene_symbol:
								gene_symbol.append(mytranscript.additional_annotations[0])


						#outline to use to output RUDs
						outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(basic_RUD) + "\t" + ",".join(map(str, RUDs)) + "\n"
						
						#outline to use to output polyA information for a species
						#outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n"
					
						outf.write(outline)
	outf.close()