def join_lists(mylist_1, mylist_2):
    """
    input: two lists of (key, annotation)
    output: list of (key, annotation) for keys present in both lists,
    with the two annotations concatenated by +
    """
    if Utility_extended.is_listT_sorted(mylist_1) != 1:
        mylist_1 = sorted(mylist_1, key=itemgetter(0))
    mylist_1 = remove_redundancy(mylist_1)
    #print len(mylist_1)
    if Utility_extended.is_listT_sorted(mylist_2) != 1:
        mylist_2 = sorted(mylist_2, key=itemgetter(0))
    mylist_2 = remove_redundancy(mylist_2)
    #print len(mylist_2)

    mylist_1_IDs = [i[0] for i in mylist_1]
    mylist_2_IDs = [i[0] for i in mylist_2]
    outlist = []
    # Iterate over the shorter list and look each key up in the longer one
    if len(mylist_1) <= len(mylist_2):
        for item in mylist_1:
            ID = item[0]
            if ID in mylist_2_IDs:
                index = mylist_2_IDs.index(ID)
                out = item[1] + mylist_2[index][1]
                outlist.append((ID, out))
    else:
        for item in mylist_2:
            ID = item[0]
            if ID in mylist_1_IDs:
                index = mylist_1_IDs.index(ID)
                out = mylist_1[index][1] + item[1]
                outlist.append((ID, out))
    return outlist
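# A minimal usage sketch of join_lists with hypothetical toy data (assuming
# Utility_extended and remove_redundancy are available as in this module);
# annotations here are lists, so + concatenates them.
def _demo_join_lists():
    list_a = [(101, [1.0]), (102, [2.0]), (103, [3.0])]
    list_b = [(102, [0.5]), (103, [0.7]), (104, [0.9])]
    # Only keys present in both lists survive:
    print join_lists(list_a, list_b)
    # expected: [(102, [2.0, 0.5]), (103, [3.0, 0.7])]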
def get_iri_by_gene(rc_on_shared_introns, rc_on_shared_exons, rc_threshold, id_set=None):
    """
    Normalize the intron read density by the average exon read density of that gene.
    rc_on_shared_introns: {entrezID:[((start, end), rc)]}
    rc_on_shared_exons: {entrezID:[((start, end), rc)]}
    The expression cutoff is implemented by id_set and rc_threshold.
    Returns {entrezID:[((start, end), iri)]}
    """
    myids = Utility_extended.get_subset_ids_from_dic(rc_on_shared_introns, id_set)
    myids = Utility_extended.get_subset_ids_from_dic(rc_on_shared_exons, myids)
    iri_by_gene = {}  # {entrez_id:[((start, end), iri)]}
    for entrez_id in myids:
        exons_rc = get_trons_rc(rc_on_shared_exons[entrez_id])
        if exons_rc > rc_threshold:
            iri_by_gene[entrez_id] = []
            exons_length = get_trons_length(rc_on_shared_exons[entrez_id])
            rc_density_on_exons = float(exons_rc) / exons_length
            for intron in rc_on_shared_introns[entrez_id]:
                intron_rc = intron[1]
                intron_coordinate = intron[0]
                intron_length = intron_coordinate[1] - intron_coordinate[0] + 1
                rc_density_on_intron = float(intron_rc) / intron_length
                iri = rc_density_on_intron / rc_density_on_exons
                iri_by_gene[entrez_id].append((intron_coordinate, iri))
    return iri_by_gene
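# The intron retention index (IRI) is the intron read density divided by the
# gene's average exon read density. A self-contained worked example with
# made-up numbers, independent of the project data structures:
def _demo_iri_arithmetic():
    exon_density = 200 / 1000.0          # 200 reads on 1000 bp of shared exons
    intron_density = 20 / 500.0          # 20 reads on a 500 bp intron
    iri = intron_density / exon_density
    print iri  # 0.2: the intron is covered at 20% of the exonic level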
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms, fragment_size, totalcount, out_file):
    lib_name = (bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            # Separate by chrom and sort by start
            print chroms, extension, " files do not exist, separate by chroms. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension, [column_index])
    else:
        print bedfile, " is not found"
        sys.exit(1)

    all_reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_merged_transcripts = {}  # {entrezID:[((start, end), read_count)]}
    all_summary = {}
    for chrom in chroms:
        chrombed = chrom + extension
        entrez_genes_by_chrom = Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
        (reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) = calculateExonIntrons_by_chrom(entrez_genes_by_chrom, chrombed, fragment_size, totalcount, out_file)
        #if chrom == chroms[0]:
            #myid = reads_on_shared_exons.keys()[0]
            #test(entrez_genes_by_chrom, reads_on_shared_introns, myid)
        all_reads_on_shared_exons.update(reads_on_shared_exons)
        all_reads_on_shared_introns.update(reads_on_shared_introns)
        all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
        all_summary.update(summary)

    SeparateByChrom.cleanup(chroms, extension)
    # Return the accumulated summary, not just the last chromosome's summary
    return (all_reads_on_shared_exons, all_reads_on_shared_introns, all_reads_on_merged_transcripts, all_summary)
def main(argv):
    parser = OptionParser()
    (opt, args) = parser.parse_args(argv)
    A = [(1, 2.5), (3.5, 15), (45, 71), (74, 93)]
    B = [(1.2, 2.5), (2, 7), (2, 2), (57, 84)]
    print A
    print B
    print Utility_extended.intersect(A, B, 0.0001)
def main(argv):
    parser = OptionParser()
    parser.add_option("-i", "--inputfile", action="store", type="string", dest="infile", metavar="<file>", help="name for input file, which is a multicolumn text file")
    parser.add_option("-c", "--column", action="store", type="int", dest="c", metavar="<int>", help="the index of the column to be rescaled, 1-based")
    parser.add_option("-r", "--rescale_factor", action="store", type="float", dest="rescale_factor", metavar="<float>", help="the rescale factor that will be multiplied to the numbers in column c")
    parser.add_option("-o", "--outputfile", action="store", type="string", dest="outfile", metavar="<file>", help="name for output file")
    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)
    Utility_extended.rescale_a_column(opt.infile, opt.c - 1, opt.rescale_factor, opt.outfile)
def rank_iri_by_intron_length(iri, id_set=None):
    """
    Test whether long introns tend to have higher intron retention.
    Uses the gene-based normalization.
    iri: {entrezID:[((start, end), iri)]}
    returns [(id, length, iri_value)] sorted by length
    """
    #print "3"
    myids = Utility_extended.get_subset_ids_from_dic(iri, id_set)
    #print len(myids)
    ranked_list = []
    for myid in myids:
        #print "The id is ", myid
        introns = iri[myid]
        for item in introns:
            start = item[0][0]
            end = item[0][1]
            length = end - start + 1
            iri_value = item[1]
            ranked_list.append((myid, length, iri_value))
    ranked_list.sort(key=itemgetter(1))
    return ranked_list
def getShared5UTR(transcripts, min_width=5):
    my_5UTRs = []
    for transcript in transcripts:
        my_5UTR = transcript.get5UTR(0, 0)
        if len(my_5UTR) > 0:
            my_5UTRs.append(my_5UTR)
    return Utility_extended.shared(my_5UTRs)
def get_tron_rpkm_histogram(rpkm_distribution, mylabel, mytitle, id_set=None):
    """
    A significant fraction (35%) of intronic regions are free of reads.
    Is it because of mappability? The simplest approach is to output these
    regions and check them out on a genome browser. Another approach is to
    calculate an unrelated ChIP-Seq (K36me3)/RNA-Seq library on the same
    regions and use a scatter plot to assess mappability.
    rpkm_distribution: {entrezID:[((start, end), rpkm)]}
    """
    myids = Utility_extended.get_subset_ids_from_dic(rpkm_distribution, id_set)
    myids.sort()  # enable predictable behavior of entrez_id order
    all_list = []  # one entry per tronic region
    for entrez_id in myids:
        rpkms = [item[1] for item in (rpkm_distribution)[entrez_id]]
        all_list.extend(rpkms)
    plt.clf()
    plt.hist(all_list, bins=40, color='r', normed=True, log=True)
    # mytitle = "Intron read density (normalized by average exon read density of respective gene) histogram"
    plt.title(mytitle)
    plt.xlabel(mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")
    return all_list
def getMergedExonicRegions(transcripts):
    # Return the merged exons as a list of (start, end)
    all_exons = []
    for transcript in transcripts:
        all_exons += transcript.getExons()
    all_exons = sorted(all_exons, key=itemgetter(0))
    return Utility_extended.union(all_exons)
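# Utility_extended.union is project code; under the assumption that it merges
# overlapping (start, end) tuples, here is a minimal stand-in sketch of the
# standard sweep (for illustration only, not used by this module):
def _union_sketch(intervals):
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # overlaps the last merged interval: extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged
    # _union_sketch([(10, 50), (40, 80), (100, 120)]) -> [(10, 80), (100, 120)]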
def assign_islands_to_REs(re_file_dir, re_file_name, chrom, chrom_length, island_list, upstream, downstream, min_re_length=0):
    """
    islands are non-overlapping [start, end].
    Returns island_flags, a list parallel to island_list; a flag of 1 means
    the island overlaps a repetitive element (extended by upstream/downstream).
    """
    currentdir = os.getcwd()
    os.chdir(re_file_dir)

    #{id:RepElement}
    known_repelements = RepElements.KnownRepElements.initiate_from_file([chrom], re_file_name)
    #print len(known_repelements.rep_elements.keys())
    island_flags = [0 for island in island_list]
    for myid in known_repelements.rep_elements.keys():
        myelement = known_repelements.rep_elements[myid]
        # No matter whether positive or negative strand, genoStart < genoEnd
        if plus.match(myelement.strand):
            start = max(myelement.genoStart - upstream, 0)
            end = min(myelement.genoEnd + downstream, chrom_length)  # extend past the 3' end by downstream
        elif minus.match(myelement.strand):
            start = max(myelement.genoStart - downstream, 0)
            end = min(myelement.genoEnd + upstream, chrom_length)
        else:
            print myelement
            print "strand not recognized"
            exit(1)
        region = (start, end)
        # returns the index range of overlapping islands
        (start_index, end_index) = Utility_extended.find_islands_overlapping_with_region(region, island_list)
        for index in range(start_index, end_index):
            island_flags[index] = 1  # These islands overlap with REs
        #print re_file_name, region, start_index, end_index
    os.chdir(currentdir)
    return island_flags
def find_colliding_ids(self, entrez_gene_boundaries, extension=0):
    """
    entrez_gene_boundaries: [(start, end, entrez_id)]
    Build and return a dic {id:[ids_in_collision]}, where the value is the
    list of ids that collide with the key id.
    """
    # Union the boundaries to find clusters.
    # Output: {(start, end):[entrez_gene_boundaries elements that contribute to that region]}
    clusters_of_ids = Utility_extended.union_with_trace(entrez_gene_boundaries, extension)
    entrez_id_collision_dic = {}
    for item in entrez_gene_boundaries:
        myid = item[2]
        entrez_id_collision_dic[myid] = []
    for region in clusters_of_ids.keys():
        ids = [item[2] for item in clusters_of_ids[region]]
        # Although in the same union, a pair of ids need not be directly overlapping.
        # Make sure a pair is traversed only once.
        for i in xrange(len(ids)):
            myid = ids[i]
            for j in range(i + 1, len(ids)):
                the_other_id = ids[j]
                if self.is_overlapping(myid, the_other_id, extension) == 1:
                    entrez_id_collision_dic[myid].append(the_other_id)
                    entrez_id_collision_dic[the_other_id].append(myid)
    return entrez_id_collision_dic
def get_read_count_on_genic_regions(geneList, bedFile, fragment_size):
    """
    Only deals with one chrom.
    geneList is a list of UCSC_lite objects: name, chrom, strand, txStart, txEnd
    Returns three lists: gene name, region length, read count
    """
    (gene_name_list, region_start_list, region_end_list) = get_feature_lists(geneList)

    tag_position_list = []
    f = open(bedFile, 'r')
    for line in f:
        if not re.match("#", line):
            line = line.strip()
            sline = line.split()
            tag_position_list.append(associate_tags_with_regions.tag_position(sline, fragment_size))
    f.close()
    if not Utility_extended.is_list_sorted(tag_position_list):
        tag_position_list.sort()

    # A list with the total tag number on each region, in the same order as the region lists
    read_count_list = associate_tags_with_regions.find_readcount_on_regions(tag_position_list, region_start_list, region_end_list)
    assert len(gene_name_list) == len(read_count_list)

    region_length_list = [0] * len(gene_name_list)
    for i in xrange(len(gene_name_list)):
        region_length_list[i] = region_end_list[i] - region_start_list[i]
    return gene_name_list, region_length_list, read_count_list
def join_dics(mydic_1, mydic_2):
    # Join two dictionaries on their shared keys; values are concatenated by +.
    outdic = {}
    intersection = Utility_extended.intersection(mydic_1.keys(), mydic_2.keys())
    for ID in intersection:
        out = mydic_1[ID] + mydic_2[ID]
        outdic[ID] = out
    return outdic
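# A quick sketch of join_dics behavior, assuming Utility_extended.intersection
# returns the keys common to both inputs; values must support +:
def _demo_join_dics():
    d1 = {1: [10], 2: [20]}
    d2 = {2: [0.5], 3: [0.7]}
    print join_dics(d1, d2)  # expected: {2: [20, 0.5]}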
def get_feature_level(re_tree, summary_name):
    """
    Find the mean rpkm, median rpkm, and presence of each mark for each species.
    return:
    feature_level_mean: {reClass:{reFamily:{reName:{feature_name:level}}}}
    feature_level_median: {reClass:{reFamily:{reName:{feature_name:level}}}}
    feature_enrichment: {reClass:{reFamily:{reName:{feature_name:enrichment_ratio}}}},
    where enrichment_ratio = (# sites with the mark) / (# of sites)
    """
    feature_level_mean = {}
    feature_level_median = {}
    feature_enrichment = {}
    flag = 0
    for reClass in re_tree.keys():
        feature_level_mean[reClass] = {}
        feature_level_median[reClass] = {}
        feature_enrichment[reClass] = {}
        for reFamily in re_tree[reClass].keys():
            feature_level_mean[reClass][reFamily] = {}
            feature_level_median[reClass][reFamily] = {}
            feature_enrichment[reClass][reFamily] = {}
            for reName in re_tree[reClass][reFamily]:
                feature_level_mean[reClass][reFamily][reName] = {}
                feature_level_median[reClass][reFamily][reName] = {}
                feature_enrichment[reClass][reFamily][reName] = {}
                summary_file_name = summary_name + "_on_" + "_".join([reClass, reFamily, reName]) + ".pkl"
                assert (Utility_extended.fileExists(summary_file_name) == 1)
                inf = open(summary_file_name, 'rb')
                # {id:{feature_name:value}}
                reClass_reFamily_reName_summary = pickle.load(inf)
                inf.close()

                if flag == 0:  # Do this only one time
                    feature_names = AnalyzeRNASeq.get_feature_names(reClass_reFamily_reName_summary)
                    print "\nFeature names are: ", feature_names
                    flag = 1

                for feature_name in feature_names:
                    if feature_name != "annotation":
                        mean, median = calculate_level(reClass_reFamily_reName_summary, feature_name)
                        feature_level_mean[reClass][reFamily][reName][feature_name] = mean
                        feature_level_median[reClass][reFamily][reName][feature_name] = median
                        feature_enrichment[reClass][reFamily][reName][feature_name] = calculate_enrichment(reClass_reFamily_reName_summary, feature_name)
    return feature_level_mean, feature_level_median, feature_enrichment
def find_pattern(re_tree, summary_name, present_list, absent_list, threshold=0.0001):
    """
    present_list: [feature_name] features that are required to be present
    absent_list: [feature_name] features that are required to be absent
    return:
    {reClass:{reFamily:{reName:{feature_name:enrichment_ratio}}}}
    {reClass:{reFamily:{reName:{feature_name:[ids]}}}}
    """
    pattern_enrichment = {}
    pattern_positive_ids = {}
    flag = 0
    for reClass in re_tree.keys():
        pattern_enrichment[reClass] = {}
        pattern_positive_ids[reClass] = {}
        for reFamily in re_tree[reClass].keys():
            pattern_enrichment[reClass][reFamily] = {}
            pattern_positive_ids[reClass][reFamily] = {}
            for reName in re_tree[reClass][reFamily]:
                summary_file_name = summary_name + "_on_" + "_".join([reClass, reFamily, reName]) + ".pkl"
                assert (Utility_extended.fileExists(summary_file_name) == 1)
                inf = open(summary_file_name, 'rb')
                # {id:{feature_name:value}}
                reClass_reFamily_reName_summary = pickle.load(inf)
                inf.close()

                if flag == 0:  # Do this only one time
                    feature_names = AnalyzeRNASeq.get_feature_names(reClass_reFamily_reName_summary)
                    print "\nFeature names are: ", feature_names
                    assert (set(present_list).issubset(set(feature_names)))
                    assert (set(absent_list).issubset(set(feature_names)))
                    flag = 1

                enrichment, positive_ids = calculate_pattern_enrichment_in_single_species(reClass_reFamily_reName_summary, present_list, absent_list, threshold)
                pattern_enrichment[reClass][reFamily][reName] = enrichment
                pattern_positive_ids[reClass][reFamily][reName] = positive_ids
    return pattern_enrichment, pattern_positive_ids
def getSharedExonicRegions(transcripts, min_width=5):
    """
    shared_exons: a list of (start, end) regions shared among all transcripts
    in the input, possibly breaking up existing exons. Sorted.
    """
    shared_exons = transcripts[0].getExons()
    for index in range(1, len(transcripts)):
        current_exons = transcripts[index].getExons()
        shared_exons = Utility_extended.intersect(shared_exons, current_exons, min_width)
    return shared_exons  # sorted
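# Utility_extended.intersect is also project code; a hedged, self-contained
# sketch of pairwise interval intersection with a minimum-width filter, the
# operation the loop above relies on (illustration only):
def _intersect_sketch(list_a, list_b, min_width):
    out = []
    for a_start, a_end in list_a:
        for b_start, b_end in list_b:
            start = max(a_start, b_start)
            end = min(a_end, b_end)
            if end - start >= min_width:
                out.append((start, end))
    return sorted(out)
    # _intersect_sketch([(0, 100), (200, 300)], [(50, 250)], 5)
    # -> [(50, 100), (200, 250)]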
def get_read_count(re_file_dir, re_file_name, feature_name, chrom, chrom_length, tag_position_list, total_count, upstream, downstream, min_re_length):
    """
    returns {id:{feature_name: value}}
    feature_name keys include: feature_name + "_rc", feature_name + "_rpkm"
    """
    currentdir = os.getcwd()
    os.chdir(re_file_dir)
    known_repelements = RepElements.KnownRepElements.initiate_from_file([chrom], re_file_name)
    regions = []
    for myid in known_repelements.rep_elements.keys():
        myelement = known_repelements.rep_elements[myid]
        # No matter whether positive or negative strand, genoStart < genoEnd
        if plus.match(myelement.strand):
            start = max(myelement.genoStart - upstream, 0)
            end = min(myelement.genoEnd + downstream, chrom_length)  # extend past the 3' end by downstream
        elif minus.match(myelement.strand):
            start = max(myelement.genoStart - downstream, 0)
            end = min(myelement.genoEnd + upstream, chrom_length)
        else:
            print myelement
            print "strand not recognized"
            exit(1)
        regions.append((start, end, myelement.id))

    tag_list = [(element, 0) for element in tag_position_list]
    read_counts = Utility_extended.get_read_counts_on_regions(tag_list, regions)  # returns a list of (region, read_count)
    rc_dic = {}
    for item in read_counts:
        region = item[0]
        start = region[0]
        end = region[1]
        myid = region[2]
        if start == end:
            print chrom, myid, start, end
        if (end - start) >= min_re_length:  # only include those with length >= min_re_length
            rc = item[1]
            rpkm = rc / ((total_count) / 1000000.0)
            rpkm = rpkm / ((end - start) / 1000.0)
            rc_dic[myid] = {}
            rc_dic[myid][feature_name + "_rc"] = rc
            rc_dic[myid][feature_name + "_rpkm"] = rpkm
    os.chdir(currentdir)
    return rc_dic
def associate_tags_with_3UTR(tag_positions, UTRregion_start, UTRregion_end):
    # Cannot reuse the similar code in Utility_extended because it expects
    # strings, while here we are dealing with integers.
    my_tag_list = []
    if (Utility_extended.is_list_sorted(tag_positions) == 0):
        my_tag_list = sorted(tag_positions)
    else:
        my_tag_list = tag_positions
    assert (UTRregion_start <= UTRregion_end)
    start_ind = bisect.bisect_left(my_tag_list, UTRregion_start)
    end_ind = bisect.bisect_right(my_tag_list, UTRregion_end)
    tags = my_tag_list[start_ind:end_ind]
    return tags
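# The bisect pair above is the standard way to slice a sorted list down to a
# closed interval; a self-contained illustration:
def _demo_bisect_slice():
    import bisect
    tags = [5, 12, 18, 25, 40, 41, 90]  # sorted read positions
    start, end = 18, 41
    start_ind = bisect.bisect_left(tags, start)  # first index with tags[i] >= start
    end_ind = bisect.bisect_right(tags, end)     # first index with tags[i] > end
    print tags[start_ind:end_ind]  # [18, 25, 40, 41]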
def remove_redundancy(mylist):
    """
    list item: (key, annotation)
    If multiple elements have the same key, only the first element (after
    sorting by key) is retained.
    """
    unique_list = []
    if mylist != []:
        if Utility_extended.is_listT_sorted(mylist) != 1:
            mylist = sorted(mylist, key=itemgetter(0))
        unique_list.append(mylist[0])
        for i in range(1, len(mylist)):
            if mylist[i][0] != mylist[i - 1][0]:
                unique_list.append(mylist[i])
    return unique_list
def log_transform_rpkm_distribution(rd_distribution, pc=0.0000000001, id_set=None):
    """
    returns {entrezID:[((start, end), density)]}
    """
    myids = Utility_extended.get_subset_ids_from_dic(rd_distribution, id_set)
    log_read_density_distribution = {}
    for entrez_id in myids:
        rpkms = rd_distribution[entrez_id]
        log_read_density_distribution[entrez_id] = [(item[0], log(item[1] + pc, 2)) for item in rpkms]
    return log_read_density_distribution
def getSharedIntronicRegions(transcripts, min_width=5):
    """
    shared_introns: a list of (start, end) regions shared among all transcripts
    in the input, possibly breaking up existing introns. Sorted.
    """
    shared_introns = transcripts[0].getIntrons()
    for index in range(1, len(transcripts)):
        current_introns = transcripts[index].getIntrons()
        #if len(current_introns) == 0 and len(shared_introns) == 0:
            #for transcript in transcripts:
                #print transcript.getAll()
        shared_introns = Utility_extended.intersect(shared_introns, current_introns, min_width)
    return shared_introns  # sorted
def get_rd_relative_fluctuation_histogram(rd_distribution, num_introns_cutoff, mylabel, mytitle, pc=0.000000001, id_set=None):
    """
    rd_distribution: {entrezID:[((start, end), rd)]}
    For each simple gene, calculate the relative fluctuation of the per-intron
    values: standard deviation / mean. Returns {entrez_id: rf}.
    The relative fluctuation (coefficient of variation) is used so that genes
    with different overall levels can be compared on the same footing.
    """
    myids = Utility_extended.get_subset_ids_from_dic(rd_distribution, id_set)
    relative_fluctuation = {}
    for entrez_id in myids:
        this_gene = rd_distribution[entrez_id]
        if len(this_gene) >= num_introns_cutoff:
            iris = [item[1] for item in this_gene]
            mean = numpy.average(iris)
            if mean > 0:
                std = numpy.std(iris)
                rf = std / mean
                relative_fluctuation[entrez_id] = rf

    plt.clf()
    plt.figure(1)
    plt.subplot(211)
    plt.hist(relative_fluctuation.values(), bins=50, color='g', normed=True, log=True)
    plt.title(mytitle)
    plt.xlabel(mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.subplot(212)
    plt.hist([log(item + pc, 2) for item in relative_fluctuation.values()], bins=50, color='r', normed=True, log=True)
    plt.xlabel("log " + mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")
    return relative_fluctuation
def get_read_count_on_exons(gene_coords, bedFile, fragment_size):
    """
    Only deals with one chrom.
    gene_coords is a list of UCSC objects.
    Returns three lists: geneName, exonsTotalLength, exonsTotalReadCount
    """
    tag_position_list = []
    f = open(bedFile, 'r')
    for line in f:
        if not re.match("#", line):
            line = line.strip()
            sline = line.split()
            tag_position_list.append(associate_tags_with_regions.tag_position(sline, fragment_size))
    f.close()
    if not Utility_extended.is_list_sorted(tag_position_list):
        tag_position_list.sort()

    geneName = []
    exonsTotalLength = []  # used for calculating the RPKM value
    exonsTotalReadCount = []
    for g in gene_coords:
        geneName.append(g.name)
        if g.exonCount > 0:
            exon_Starts_str = (g.exonStarts.split(','))[:-1]  # remove the last '' because the format is '1,2,3,'
            exon_Ends_str = (g.exonEnds.split(','))[:-1]  # remove the last '' because the format is '1,2,3,'
            exon_Starts = [int(x) for x in exon_Starts_str]
            exon_Ends = [int(x) for x in exon_Ends_str]
            assert len(exon_Starts) == len(exon_Ends)

            totalLength = 0
            for i in xrange(len(exon_Starts)):
                totalLength += exon_Ends[i] - exon_Starts[i]
            exonsTotalLength.append(totalLength)

            exon_read_count_list = associate_tags_with_regions.find_readcount_on_regions(tag_position_list, exon_Starts, exon_Ends)
            exonsTotalReadCount.append(sum(exon_read_count_list))
        else:
            exonsTotalLength.append(0)
            exonsTotalReadCount.append(0)
    return geneName, exonsTotalLength, exonsTotalReadCount
def get_coverage(re_file_name, chrom, chrom_length, islands, upstream, downstream, min_re_length=10):
    """
    Find the coverage for each RE instance.
    returns {id:value}
    """
    known_repelements = RepElements.KnownRepElements.initiate_from_file([chrom], re_file_name)
    # Get the regions defined by the REs
    regions = []  # (start, end, myelement.id)
    for myid in known_repelements.rep_elements.keys():
        myelement = known_repelements.rep_elements[myid]
        # No matter whether positive or negative strand, genoStart < genoEnd
        if plus.match(myelement.strand):
            start = max(myelement.genoStart - upstream, 0)
            end = min(myelement.genoEnd + downstream, chrom_length)  # extend past the 3' end by downstream
        elif minus.match(myelement.strand):
            start = max(myelement.genoStart - downstream, 0)
            end = min(myelement.genoEnd + upstream, chrom_length)
        else:
            print myelement
            print "strand not recognized"
            exit(1)
        regions.append((start, end, myelement.id))

    # {region:coverage}, region: (start, end, myelement.id)
    regions_w_coverage = Utility_extended.find_coverage_by_islands_on_regions(regions, islands)
    # change to {id:coverage}
    coverage_dic = {}
    for myregion in regions_w_coverage.keys():
        start = myregion[0]
        end = myregion[1]
        length = end - start + 1
        if length >= min_re_length:
            myid = myregion[2]
            coverage_dic[myid] = regions_w_coverage[myregion]
    return coverage_dic
def find_reads_on_regions(read_file, regions, shift, outfile_name, boundary_extension=0):
    """
    regions: [BED3]. The regions can overlap.
    read_file contains reads from only one chrom.
    Writes the reads that fall on the regions to outfile_name.
    Return: [(region, rc)], the read count on each region.
    """
    regions_rc = []  # [(region, rc)]
    tag_list = []  # [(position, line)]
    tag_flag_list = []  # flags whether a read passes filtering
    if Utility.fileExists(read_file):
        f = open(read_file, 'r')
        for line in f:
            if not re.match("#", line):
                line = line.strip()
                sline = line.split()
                position = tag_position(sline, shift)
                tag_list.append((position, line))
        f.close()

    # sort according to position
    if (Utility_extended.is_listT_sorted(tag_list) == 0):
        tag_list.sort(key=itemgetter(0))
    positions = [tag[0] for tag in tag_list]
    tag_flag_list = [0 for tag in tag_list]

    for region in regions:
        start = max(region.start - boundary_extension, 0)
        end = region.end + boundary_extension
        assert (start <= end)
        start_ind = bisect.bisect_left(positions, start)
        end_ind = bisect.bisect_right(positions, end)
        for index in range(start_ind, end_ind):
            # These reads are on regions
            tag_flag_list[index] = 1
        rc = end_ind - start_ind
        regions_rc.append((region, rc))

    o = open(outfile_name, 'w')
    for i in xrange(len(tag_list)):
        if tag_flag_list[i] == 1:
            o.write(tag_list[i][1] + '\n')  # tag_list[i][1] = line
    o.close()
    return regions_rc
def get_fraction_retained_intron_per_gene_histogram(rc_distribution, rc_threshold, mytitle, id_set=None):
    """
    rc_distribution: {entrezID:[((start, end), rc)]}
    """
    myids = Utility_extended.get_subset_ids_from_dic(rc_distribution, id_set)
    myids.sort()  # enable predictable behavior of entrez_id order
    all_list = {}  # one entry per gene
    total = 0.0
    total_above_threshold = 0.0
    total_no_tron_genes = 0
    for entrez_id in myids:
        rcs = [item[1] for item in (rc_distribution)[entrez_id]]  # read counts
        if len(rcs) > 0:
            above_threshold = 0
            for item in rcs:
                if item >= rc_threshold:
                    above_threshold += 1
            above_threshold_fraction = float(above_threshold) / (len(rcs))
            all_list[entrez_id] = above_threshold_fraction
            total += len(rcs)
            total_above_threshold += above_threshold
        else:
            #print "%s has no trons" % entrez_id
            total_no_tron_genes += 1
    print "%d out of %d genes have no trons" % (total_no_tron_genes, len(myids))
    print "The number of introns with read count >= %d is %d, fraction %f" % (rc_threshold, total_above_threshold, float(total_above_threshold) / total)

    plt.clf()
    plt.hist(all_list.values(), bins=100, color='r', normed=True)
    # mytitle = "Intron read density (normalized by average exon read density of respective gene) histogram"
    plt.title(mytitle)
    plt.xlabel("Fraction of retained introns per gene")
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")
    return all_list
def get_rd_fluctuation_histogram(rd_distribution, num_introns_cutoff, mylabel, mytitle, pc=0.000000001, id_set=None):
    """
    rd_distribution: {entrezID:[((start, end), rd)]}
    For each simple gene, calculate the fluctuation (standard deviation) of the
    per-intron read density normalized by expression. Can also be used for iri.
    Return {entrez_id: value}
    """
    myids = Utility_extended.get_subset_ids_from_dic(rd_distribution, id_set)
    fluctuation = {}
    for entrez_id in myids:
        this_gene = rd_distribution[entrez_id]
        if len(this_gene) >= num_introns_cutoff:
            iris = [item[1] for item in this_gene]
            std = numpy.std(iris)
            fluctuation[entrez_id] = std

    plt.clf()
    plt.figure(1)
    plt.subplot(211)
    plt.hist(fluctuation.values(), bins=50, color='g', normed=True, log=True)
    plt.title(mytitle)
    plt.xlabel(mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.subplot(212)
    plt.hist([log(item + pc, 2) for item in fluctuation.values()], bins=50, color='r', normed=True, log=True)
    plt.xlabel("log " + mylabel)
    plt.ylabel("Frequency")
    #plt.legend(loc = 'upper left')
    plt.savefig(mytitle + ".png", format="png")
    return fluctuation
def get_distribution_ito_tron_number(rc_distribution, id_set=None):
    """
    Most appropriate for simple genes.
    ito: in terms of. Re-section rc_distribution so that it is organized by
    the number of trons.
    Return: {number_of_trons:{entrez_id:[((start, end), iri)]}}
    """
    myids = Utility_extended.get_subset_ids_from_dic(rc_distribution, id_set)
    distribution_ito_tron_number = {}  # {number_of_trons:{entrez_id:[((start, end), iri)]}}
    for entrez_id in myids:
        rcs = (rc_distribution)[entrez_id]
        number_of_trons = len(rcs)
        if number_of_trons not in distribution_ito_tron_number.keys():
            distribution_ito_tron_number[number_of_trons] = {}
        # Add the gene unconditionally; the original else-branch silently
        # dropped the first gene encountered for each tron count.
        (distribution_ito_tron_number[number_of_trons])[entrez_id] = rcs
    return distribution_ito_tron_number
def get_expression_rpkms_on_shared_exons(rc_on_shared_exons, totalcount, id_set=None):
    """
    rc_on_shared_exons: {entrezID:[((start, end), read_count)]}
    Returns {id:value}
    """
    myids = Utility_extended.get_subset_ids_from_dic(rc_on_shared_exons, id_set)
    expression_rpkm = {}
    for myid in myids:
        exons = rc_on_shared_exons[myid]
        shared_exons_total_length = sum([item[0][1] - item[0][0] + 1 for item in exons])
        shared_exons_rc = sum([item[1] for item in exons])
        expression_rpkm[myid] = shared_exons_rc * (1000.0 / shared_exons_total_length) * (1000000 / float(totalcount))
    return expression_rpkm
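# RPKM = reads per kilobase of exon per million mapped reads; a quick worked
# check of the formula above with hypothetical numbers:
def _demo_rpkm_arithmetic():
    rc = 500                 # reads on the gene's shared exons
    length_bp = 2000         # total shared-exon length
    totalcount = 20000000    # total mapped reads in the library
    rpkm = rc * (1000.0 / length_bp) * (1000000 / float(totalcount))
    print rpkm  # 12.5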
def print_out_summary(lib_file, outputfile):
    """
    lib file in pickle format: {repID:{feature:value}}
    Output the rc data of the particular pickle file.
    """
    assert (Utility_extended.fileExists(lib_file) == 1)
    lib = pickle.load(open(lib_file, 'rb'))
    myid = lib.keys()[0]
    mykeys = (lib[myid]).keys()
    mykeys = sorted(mykeys)
    of = open(outputfile, 'w')
    oline = "ID" + "\t" + ("\t").join(mykeys) + "\n"
    of.write(oline)
    for myid in lib.keys():
        record = lib[myid]  # renamed from "re" to avoid shadowing the re module
        oline = str(myid)
        for feature in mykeys:
            oline += "\t" + str(record[feature])
        oline += "\n"
        of.write(oline)
    of.close()
def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold, PAfile, extension, index):
    """
    entrez_genes is a KnownEntrezGenes class object; the genes are guaranteed
    to be on one strand, and the bed file contains the reads for that strand.
    The raw read file needs to conform to the bed format.
    index: the column in the bed file used for sorting.
    """
    # Separate reads by chrom
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile, rawreadsextension1, str(index))
    else:
        print bedfile, " is not found"
        sys.exit(1)

    # Read in the polyadenylation sites
    PAsiteslist = []
    PA1 = open(PAfile, 'r')
    for line in PA1:
        line = line.strip('\n')
        if line != '':
            sline = line.split('\t')
            PAsiteslist.append((sline[0], sline[1]))
    PA1.close()

    # The output is appended to an existing file instead of creating a new one
    outf = open(outfile, 'a')
    for chrom in chroms:
        if chrom in entrez_genes.chroms:
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
            # Get the read locations
            if Utility_extended.fileExists(chrom + rawreadsextension1):
                f = open(chrom + rawreadsextension1, 'r')
                tag_positions = []
                for line in f:
                    line = line.strip()
                    sline = line.split()
                    # Make sure the shift passed to tag_position is always 0, otherwise the rest of the program might not work as intended
                    tag_positions.append(associate_tags_with_regions.tag_position(sline, 0))
                f.close()
                if not Utility_extended.is_list_sorted(tag_positions):
                    tag_positions.sort()
                # By this point tag_positions is a sorted list of all the reads on the strand and chromosome currently being dealt with

                for entrez_id in entrez_genes_by_chrom.entrez_ids:
                    gene = entrez_genes_by_chrom.entrez_genes[entrez_id]  # an EntrezGene class object
                    # get_3UTRs gets the Entrez 3'UTR, which generally gives the beginning of the 3'UTR and a site very close to the most distal polyadenylation site
                    three_UTRs = gene.get_3UTRs()
                    # Mastertuplemaker uses the Entrez 3'UTR and the given polyA sites to create the true 3'UTR coordinates needed for CUTR_vs_AUTR to work
                    true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end, UTRbeginning = Mastertuplemaker(three_UTRs, PAsiteslist, chrom, gene.strand, extension)
                    # Only 3'UTRs with more than one polyA site need be considered
                    if len(true3UTRends) > 1:
                        # find all reads inside the 3'UTR
                        inside_reads = associate_tags_with_3UTR(tag_positions, UTRregion_start, UTRregion_end)
                        # find the reads in each region of the 3'UTR and calculate aUTR/cUTR for each of them;
                        # PolyAsites is potentially useful for output
                        RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(true3UTRstarts, true3UTRends, inside_reads, gene.strand, threshold)
                        # gene_symbol information, in case one wants to output it
                        gene_symbol = []
                        for mytranscript in gene.transcripts:
                            if mytranscript.additional_annotations[0] not in gene_symbol:
                                gene_symbol.append(mytranscript.additional_annotations[0])
                        # outline for outputting RUDs
                        outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(basic_RUD) + "\t" + ",".join(map(str, RUDs)) + "\n"
                        # outline for outputting polyA information for a species
                        #outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n"
                        outf.write(outline)
    outf.close()