def get_read_count_on_genic_regions(geneList, bedFile, fragment_size):
    """
    Count the reads falling on each genic region of a single chromosome.

    Only deals with one chrom.

    geneList: UCSC_lite objects (name, chrom, strand, txStart, txEnd); it is
        handed to get_feature_lists to obtain the parallel name / start / end
        lists.
    bedFile: path of the read file; lines starting with '#' are skipped.
    fragment_size: forwarded to associate_tags_with_regions.tag_position
        when converting a read into a position.

    Returns three lists: gene name, length (end - start), read count.
    """
    (gene_name_list, region_start_list,
     region_end_list) = get_feature_lists(geneList)

    tag_position_list = []
    # 'with' closes the read file even when a line fails to parse.
    with open(bedFile, 'r') as f:
        for line in f:
            if line.startswith("#"):
                continue
            sline = line.strip().split()
            tag_position_list.append(
                associate_tags_with_regions.tag_position(sline, fragment_size))
    if not Utility_extended.is_list_sorted(tag_position_list):
        tag_position_list.sort()

    # A list, with total tag number on this region, order as the region lists
    read_count_list = associate_tags_with_regions.find_readcount_on_regions(
        tag_position_list, region_start_list, region_end_list)
    assert len(gene_name_list) == len(read_count_list)

    # The start and end lists are parallel, one entry per gene name.
    region_length_list = [
        end - start
        for start, end in zip(region_start_list, region_end_list)
    ]
    return gene_name_list, region_length_list, read_count_list
def get_read_count_on_genes(rawreadfile, fragment_size, knowngenefile, regiontype, promoter_upstream_extension, promoter_downstream_extension): """ Promoter and GeneBody are mutually exclusive. Promoter: TSS-upstreamextention, TSS+downstreamextension PromoterGenebody: Promoter + gene body. Return: a dictionary with key of gene name and value of read count """ knowngenes = UCSC.KnownGenes(knowngenefile) chroms = knowngenes.keys() allowed_region_type = ['Promoter', 'GeneBody', 'PromoterGenebody'] if regiontype == 'Promoter': region_dic = knowngenes.getPromoters(promoter_upstream_extension, promoter_downstream_extension) elif regiontype == 'GeneBody': region_dic = knowngenes.getGenebodys(promoter_downstream_extension) elif regiontype == 'PromoterGenebody': region_dic = knowngenes.getPromotergenebodys( promoter_upstream_extension) else: print " The allowed region types are Promoter, GeneBody and PromoterGenebody. The region type is not recognized, exiting" sys.exit(1) if Utility.fileExists(rawreadfile): SeparateByChrom.separateByChrom(chroms, rawreadfile, '.bed1') else: print rawreadfile, " not found" sys.exit(1) genes = {} for chrom in chroms: (gene_name_list, region_start_list, region_end_list) = get_feature_lists(region_dic[chrom]) tag_position_list = [] read_file = chrom + ".bed1" f = open(read_file, 'r') for line in f: if not re.match("#", line): line = line.strip() sline = line.split() tag_position_list.append( associate_tags_with_regions.tag_position( sline, fragment_size)) f.close() tag_position_list.sort() #A list, with total tag number on this region, order as the region lists tag_count_list = associate_tags_with_regions.find_readcount_on_regions( tag_position_list, region_start_list, region_end_list) assert len(gene_name_list) == len(tag_count_list) for i in range(0, len(gene_name_list)): genes[gene_name_list[i]] = tag_count_list[i] SeparateByChrom.cleanup(chroms, '.bed1') return genes
def get_read_count_on_exons(gene_coords, bedFile, fragment_size):
    """
    Sum exon lengths and exon read counts for every gene of one chromosome.

    Only deals with one chrom.

    gene_coords: a list of UCSC objects; each must provide name, exonCount,
        exonStarts and exonEnds (comma-terminated strings such as '1,2,3,').
    bedFile: path of the read file; lines starting with '#' are skipped.
    fragment_size: forwarded to associate_tags_with_regions.tag_position.

    Return: three lists: geneName, exonsTotalLength, exonsTotalReadCount.
    Genes whose exonCount is not positive get a length and count of 0.
    """
    tag_position_list = []
    # 'with' closes the read file even when a line fails to parse.
    with open(bedFile, 'r') as f:
        for line in f:
            if line.startswith("#"):
                continue
            sline = line.strip().split()
            tag_position_list.append(
                associate_tags_with_regions.tag_position(sline, fragment_size))
    if not Utility_extended.is_list_sorted(tag_position_list):
        tag_position_list.sort()

    geneName = []
    exonsTotalLength = []  # used for calculating the RPKM value
    exonsTotalReadCount = []
    for g in gene_coords:
        geneName.append(g.name)
        if g.exonCount > 0:
            # [:-1] removes the last '' because the format is '1,2,3,'
            exon_Starts = [int(x) for x in g.exonStarts.split(',')[:-1]]
            exon_Ends = [int(x) for x in g.exonEnds.split(',')[:-1]]
            assert len(exon_Starts) == len(exon_Ends)

            exonsTotalLength.append(
                sum(end - start for start, end in zip(exon_Starts, exon_Ends)))
            exon_read_count_list = associate_tags_with_regions.find_readcount_on_regions(
                tag_position_list, exon_Starts, exon_Ends)
            exonsTotalReadCount.append(sum(exon_read_count_list))
        else:
            exonsTotalLength.append(0)
            exonsTotalReadCount.append(0)
    return geneName, exonsTotalLength, exonsTotalReadCount
def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold, PAfile, extension, index): """ entrez genes are made sure to be on one strand, the bed file are reads for that strand entrez_genes is a KnownEntrezGenes class object The raw read file needs to conform to bed format column_index: column in bed file for sorting """ # Separate reads by chrom rawreadslibName1 = (bedfile).split('/')[-1] rawreadssuffix1 = rawreadslibName1.split('.')[-1] rawreadslibName1 = rawreadslibName1.split('.')[0] rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1" if Utility_extended.fileExists(bedfile): if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1: # Separate by chrom and sort by start print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. " Utility_extended.separate_by_chrom_sort(chroms, bedfile, rawreadsextension1, str(index)) else: print bedfile, " is not found" sys.exit(1) #This part is to access the polyadenylation sites PA1 = open(PAfile, 'r') PAsiteslist = [] PA2 = 'i' while PA2 != '': PA2 = PA1.readline() if PA2 != '': PA3 = PA2.strip('\n') PA4 = PA3.split('\t') PAsiteslist.append((PA4[0], PA4[1])) PA1.close() # Here the output is 'a', i.e. 
the output is appended to an existing file instead of creating one outf = open(outfile, 'a') for chrom in chroms: if chrom in entrez_genes.chroms: # a KnownEntrezGenes object entrez_genes_by_chrom = Entrez.KnownEntrezGenes( [chrom], entrez_genes.subset_by_chrom(chrom)) # Get the read locations if Utility_extended.fileExists(chrom + rawreadsextension1): f = open(chrom + rawreadsextension1, 'r') tag_positions = [] for line in f: line = line.strip() sline = line.split() #make sure the extension is always 0, otherwise the rest of the program might not work as intended tag_positions.append( associate_tags_with_regions.tag_position(sline, 0)) f.close() if not Utility_extended.is_list_sorted(tag_positions): tag_positions.sort() #By this point tag_positions is a sorted list of all the reads located on the strand and chromosome the code is currently dealing with for entrez_id in entrez_genes_by_chrom.entrez_ids: gene = entrez_genes_by_chrom.entrez_genes[ entrez_id] # an EntrezGene class object # get_3UTRs gets the ENTREZ 3'UTR, which appears to generally give the beginning of the 3'UTR and a site very close to the most distal polyadenylation site three_UTRs = gene.get_3UTRs() # Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites given to create the true data for the 3'UTR needed for CUTR_vs_AUTR to work true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end, UTRbeginning = Mastertuplemaker( three_UTRs, PAsiteslist, chrom, gene.strand, extension) #value should always be 1 as only 3'UTR with more than 1 polyA site need be considered if len(true3UTRends) > 1: #find all reads inside the 3'UTR inside_reads = associate_tags_with_3UTR( tag_positions, UTRregion_start, UTRregion_end) #finds reads in each region of the 3'UTR and calculates aUTR/cUTR for each of them #PolyAsites potentially useful for output RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR( true3UTRstarts, true3UTRends, inside_reads, gene.strand, threshold) #important if one wants to output gene_symbol 
information gene_symbol = [] for mytranscript in gene.transcripts: if mytranscript.additional_annotations[ 0] not in gene_symbol: gene_symbol.append( mytranscript.additional_annotations[0]) #outline to use to output RUDs outline = str( entrez_id ) + "\t" + chrom + "\t" + gene.strand + "\t" + str( basic_RUD) + "\t" + ",".join(map(str, RUDs)) + "\n" #outline to use to output polyA information for a species #outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n" outf.write(outline) outf.close()
def getReadCount(KnownGenes, bedfile, chroms, fragment_size, region_type, upstream_extension, downstream_extension, totalcount, out_file): """ Known genes are made sure to be on one strand, and the bed file are reads for that strand The raw read file needs to conform to bed format """ ReadCount = {} # keyed by name, valued by (rc, length, rpkm) # Separate by chrom reads rawreadslibName1 = (bedfile).split('/')[-1] rawreadssuffix1 = rawreadslibName1.split('.')[-1] rawreadslibName1 = rawreadslibName1.split('.')[0] rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1" if Utility_extended.fileExists(bedfile): if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1: # Separate by chrom and sort by start print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. " Utility_extended.separate_by_chrom_sort(chroms, bedfile, rawreadsextension1, [2]) else: print bedfile, " is not found" sys.exit(1) # dictionary has chrom as key and ucsc_lite object (name, chrom, strand, txStart, txEnd) as values if region_type == 'Promoter': region_dic = KnownGenes.getPromoters(upstream_extension, downstream_extension) elif region_type == 'GeneBody': region_dic = KnownGenes.getGenebodys(downstream_extension) elif region_type == 'ExtendedGeneBody': region_dic = KnownGenes.getExtendedGenebodys(upstream_extension, downstream_extension) elif region_type == 'PromoterGenebody': region_dic = KnownGenes.getPromotergenebodys(upstream_extension) elif region_type == 'GeneEnd': region_dic = KnownGenes.getGeneEnds(upstream_extension, downstream_extension) elif region_type == 'ExonicRegion': region_dic = KnownGenes.getExons() elif region_type == 'IntronicRegion': region_dic = KnownGenes.getIntrons() elif region_type == '5UTR': region_dic = KnownGenes.get5UTRs(upstream_extension, downstream_extension) elif region_type == '3UTR': region_dic = KnownGenes.get3UTRs(upstream_extension, downstream_extension) else: 
print region_type, "is not recognized" exit(1) outf = open(out_file, 'a') for chrom in chroms: chrombed = chrom + rawreadsextension1 if Utility_extended.fileExists(chrombed) and (chrom in KnownGenes.keys()): tag_position_list = [] inf = open(chrombed, 'r') for line in inf: if not re.match("#", line): line = line.strip() sline = line.split() tag_position_list.append( associate_tags_with_regions.tag_position( sline, fragment_size)) inf.close() if Utility_extended.is_list_sorted(tag_position_list) != 1: tag_position_list.sort() if len(region_dic[chrom]) > 0: for region in region_dic[chrom]: thisregion = [(region.txStart, region.txEnd)] (total_length, rc) = get_read_count_on_regions(thisregion, tag_position_list) if total_length > 0: RPKM = rc * (1000.0 / total_length) * ( 1000000 / float(totalcount)) else: assert rc < 0.01 RPKM = 0 outline = str(region.name) + '\t' + str(rc) + '\t' + str( total_length) + '\t' + str(RPKM) + '\n' outf.write(outline) ReadCount[region.name] = (rc, total_length, RPKM) outf.close() #SeparateByChrom.cleanup(chroms, rawreadsextension1) return ReadCount
def get_read_count_on_onic_transcript(KnownGenes, bedfile, chroms, fragment_size, region_type, totalcount, out_file): """ Return: a dictionary keyed by geneName valued by TotalReadCount,TotalLength, RPKM """ ReadCount = {} # keyed by name, valued by (rc, length, rpkm) # Separate by chrom reads rawreadslibName1 = (bedfile).split('/')[-1] rawreadssuffix1 = rawreadslibName1.split('.')[-1] rawreadslibName1 = rawreadslibName1.split('.')[0] rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1" if Utility_extended.fileExists(bedfile): if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1: # Separate by chrom and sort by start print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. " Utility_extended.separate_by_chrom_sort(chroms, bedfile, rawreadsextension1, [2]) # sort by start else: print bedfile, " is not found" sys.exit(1) outf = open(out_file, 'a') for chrom in chroms: chrombed = chrom + rawreadsextension1 if Utility_extended.fileExists(chrombed) and (chrom in KnownGenes.keys()): tag_position_list = [] inf = open(chrombed, 'r') for line in inf: if not re.match("#", line): line = line.strip() sline = line.split() tag_position_list.append( associate_tags_with_regions.tag_position( sline, fragment_size)) inf.close() if Utility_extended.is_list_sorted(tag_position_list) != 1: tag_position_list.sort() for gene in KnownGenes[chrom]: if region_type == "ExonicTranscript": ons = gene.getExons() elif region_type == "IntronicTranscript": ons = gene.getIntrons() else: print region_type, "is not recognized." 
exit(1) if len(ons > 0): (total_length, rc) = get_read_count_on_regions(ons, tag_position_list) RPKM = rc * (1000.0 / total_length) * (1000000 / float(totalcount)) else: total_length = 0 rc = 0 RPKM = 0 outline = str(gene.name) + '\t' + str(rc) + '\t' + str( total_length) + '\t' + str(RPKM) + '\n' outf.write(outline) ReadCount[region.name] = (rc, total_length, RPKM) outf.close() #SeparateByChrom.cleanup(chroms, rawreadsextension1) return ReadCount
def main(argv): parser = OptionParser() parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-a", "--rawreadfileA", action="store", type="string", dest="readfileA", metavar="<file>", help="raw read file A in bed format") parser.add_option("-b", "--rawreadfileB", action="store", type="string", dest="readfileB", metavar="<file>", help="raw read file B in bed format") parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after A experiment") parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format") parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file") (opt, args) = parser.parse_args(argv) if len(argv) < 12: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species]; else: print "This species is not recognized, exiting"; sys.exit(1); if not Utility.fileExists(opt.readfileA): print opt.readfileA, " not found"; sys.exit(1) if not Utility.fileExists(opt.readfileB): print opt.readfileB, " not found"; sys.exit(1) A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA); B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB); print "Library size of ", opt.readfileA, ": ", A_library_size print "Library size of ", opt.readfileB, ": ", B_library_size totalA = 0; totalB = 0; islands = BED.BED(opt.species, opt.islandfile, "BED3", 0); # separate by chrom the A library SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1'); # separate by chrom the B library SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2'); island_A_readcount = {}; island_B_readcount = {}; #Find read counts on the islands for chrom 
in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: island_list = islands[chrom]; if Utility.is_bed_sorted(island_list) == 0: island_list.sort(key=operator.attrgetter('start')); island_start_list = [] island_end_list = [] for item in island_list: island_start_list.append(item.start) island_end_list.append(item.end) island_A_readcount_list=[0]*len(island_list); read_file = chrom + ".bed1"; f = open(read_file,'r') for line in f: if not re.match("#", line): line = line.strip() sline = line.split() position = associate_tags_with_regions.tag_position(sline, opt.fragment_size) index =associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position); if index >= 0: island_A_readcount_list[index] += 1; totalA += 1; f.close(); island_A_readcount[chrom] = island_A_readcount_list; island_B_readcount_list=[0]*len(island_list); read_file = chrom + ".bed2"; f = open(read_file,'r') for line in f: if not re.match("#", line): line = line.strip() sline = line.split() position = associate_tags_with_regions.tag_position(sline, opt.fragment_size) index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position); if index >= 0: island_B_readcount_list[index] += 1; totalB += 1; f.close(); island_B_readcount[chrom] = island_B_readcount_list; #A_background_read = A_library_size - totalA; #B_background_read = B_library_size - totalB; print "Total number of A reads on islands is: ", totalA; print "Total number of B reads on islands is: ", totalB; # Calculate the p value. 
library_scaling_factor = A_library_size*1.0/B_library_size; #A vs B pseudo_count = 1; pvalue_A_vs_B_list = []; pvalue_B_vs_A_list = []; for chrom in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: island_list = islands[chrom]; for index in xrange(len(island_list)): item = island_list[index]; Acount = (island_A_readcount[chrom])[index]; Bcount = (island_B_readcount[chrom])[index]; pvalue_A_vs_B = pvaule (Acount, Bcount, library_scaling_factor, pseudo_count); pvalue_A_vs_B_list.append(pvalue_A_vs_B); pvalue_B_vs_A = pvaule (Bcount, Acount, 1/library_scaling_factor, pseudo_count); pvalue_B_vs_A_list.append(pvalue_B_vs_A); #Calculate the FDR fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list); fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list); #Output the islands read counts, normalized read counts, fc, pvalue both ways scaling_factor = 1000000; out = open(opt.out_file, 'w'); outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A" + "\n"; out.write(outline); ii=0; for chrom in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: island_list = islands[chrom]; for index in xrange(len(island_list)): item = island_list[index]; Acount = (island_A_readcount[chrom])[index]; Bcount = (island_B_readcount[chrom])[index]; normalized_A = Acount/ float(A_library_size) * scaling_factor; normalized_B = Bcount/ float(B_library_size) * scaling_factor; fc_A_vs_B = ((Acount + pseudo_count)*1.0/(Bcount + pseudo_count))/library_scaling_factor; fc_B_vs_A = ((Bcount + pseudo_count)*1.0/(Acount + pseudo_count)) * library_scaling_factor; outline = item.chrom + "\t" + str(item.start) + "\t" + str(item.end) + "\t" + str(Acount) + "\t" + str(normalized_A) + "\t" + str(Bcount) + "\t" + str(normalized_B) + "\t" + str(fc_A_vs_B) + "\t" + 
str(pvalue_A_vs_B_list[ii]) + "\t" + str(fdr_A_vs_B_list[ii]) + "\t" + str(fc_B_vs_A) + "\t" + str(pvalue_B_vs_A_list[ii]) + "\t" + str(fdr_B_vs_A_list[ii]) + "\n"; out.write(outline); ii += 1; out.close(); SeparateByChrom.cleanup(chroms, '.bed1'); SeparateByChrom.cleanup(chroms, '.bed2'); # Calculate the correlations using normalized read counts A_array=(); B_array=(); for chrom in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: temp_array= scipy.array(island_A_readcount[chrom]); A_array=scipy.concatenate((temp_array, A_array)); temp_array= scipy.array(island_B_readcount[chrom]); B_array=scipy.concatenate((temp_array, B_array)); #Normalization to reads per million A_array = A_array/float(A_library_size) * scaling_factor; B_array = B_array/float(B_library_size) * scaling_factor; pearson=scipy.stats.pearsonr(A_array, B_array); print "Pearson's correlation is: ", pearson[0], " with p-value ", pearson[1]; spearman = scipy.stats.spearmanr(A_array, B_array); print "Spearman's correlation is: ", spearman[0], " with p-value ", spearman[1];
def main(argv): parser = OptionParser() parser.add_option("-b", "--bedfile", action="store", type="string", dest="bedfile", metavar="<file>", help="ChIP seq read file") parser.add_option( "-f", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option("-t", "--RE_tree_pickle_file", action="store", type="string", dest="RE_Tree", metavar="<file>", help="file with RE tree in pickle format") parser.add_option( "-l", "--RE_annotation_file_location", action="store", type="string", dest="RE_file_location", metavar="<file>", help="location of RE files named in repClass_repFamily_repName.txt") parser.add_option("-u", "--upstream_extension", action="store", type="int", dest="upstream_extension", help="upstream extension from start", metavar="<int>") parser.add_option("-d", "--downstream_extension", action="store", type="int", dest="downstream_extension", help="downstream extension from end", metavar="<int>") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-n", "--feature_name", action="store", type="string", dest="feature_name", help="name of the library", metavar="<str>") (opt, args) = parser.parse_args(argv) if len(argv) < 16: parser.print_help() sys.exit(1) startTime = time.time() if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) total_count = get_total_tag_counts.get_total_tag_counts(opt.bedfile) #Separate_by_chrom on bedfile lib_name = (opt.bedfile).split('/')[-1] # remove directory suffix = lib_name.split('.')[-1] # txt lib_name = lib_name.split('.')[0] extension = "-" + lib_name + '.' 
+ suffix + "1" if Utility_extended.fileExists(opt.bedfile): if Utility_extended.chrom_files_exist(chroms, extension) != 1: SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension) else: print bedfile, " is not found" sys.exit(1) #load the RE tree to get the RE file names re_tree = pickle.load(open(opt.RE_Tree, 'rb')) (numb_classes, numb_families, numb_names) = numbers(re_tree) print "There are %d classes, %d family, and %d names." % ( numb_classes, numb_families, numb_names) #Prepare the summary read_counts = {} for reClass in re_tree.keys(): read_counts[reClass] = {} for reFamily in re_tree[reClass].keys(): read_counts[reClass][reFamily] = {} for reName in re_tree[reClass][reFamily]: read_counts[reClass][reFamily][reName] = {} #cycle through chrom for chrom in chroms: print chrom chrom_length = chrom_lengths[chrom] chrombed = chrom + extension if Utility_extended.fileExists(chrombed): # load in each read and shift tag_position_list = [] inf = open(chrombed, 'r') for line in inf: if not re.match("#", line): line = line.strip() sline = line.split() tag_position_list.append( associate_tags_with_regions.tag_position( sline, opt.fragment_size)) inf.close() if not Utility_extended.is_list_sorted(tag_position_list): tag_position_list.sort() #[tag_positions] min_re_length = 10 for reClass in re_tree.keys(): for reFamily in re_tree[reClass].keys(): for reName in re_tree[reClass][reFamily]: re_file_name = "_".join([reClass, reFamily, reName ]) + ".txt" #{id:{feature_name:value}} rc_dic = get_read_count( opt.RE_file_location, re_file_name, opt.feature_name, chrom, chrom_length, tag_position_list, total_count, opt.upstream_extension, opt.downstream_extension, min_re_length) # id is unique and updated only once, so this should be ok read_counts[reClass][reFamily][reName].update(rc_dic) #{reClass:{reFamily:{reName:{id:feature_name, value}}}} #feature_name include: feature_name + "_rc", feature_name + "_rpkm" #output_file_name = feature_name + "_on_" + "mm9_rmsk.pkl" 
#output = open(output_file_name, 'wb') #pickle.dump(read_counts, output) #output.close() #instead of outputing a huge one, let's output many small pieces breakdown_and_output(read_counts, opt.feature_name) repClass = 'LTR' repFamily = 'ERV1' repName = 'RLTR4_Mm' outfile_name = lib_name + "_on_" + "_".join([repClass, repFamily, repName ]) + ".dat" test(read_counts, repClass, repFamily, repName, outfile_name) SeparateByChrom.cleanup(chroms, extension) print "it took", time.time() - startTime, "seconds."
def main(argv): parser = OptionParser() parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>") parser.add_option("-a", "--rawchipreadfile", action="store", type="string", dest="chipreadfile", metavar="<file>", help="raw read file from chip in bed format") parser.add_option("-b", "--rawcontrolreadfile", action="store", type="string", dest="controlreadfile", metavar="<file>", help="raw read file from control in bed format") parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment") parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format") parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file") parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] genomesize = sum( GenomeData.species_chrom_lengths[opt.species].values()) genomesize = opt.fraction * genomesize else: print "This species is not recognized, exiting" sys.exit(1) chip_library_size = get_total_tag_counts.get_total_tag_counts( opt.chipreadfile) control_library_size = get_total_tag_counts.get_total_tag_counts( opt.controlreadfile) print "chip library size ", chip_library_size print "control library size ", control_library_size totalchip = 0 totalcontrol = 0 islands = BED.BED(opt.species, opt.islandfile, "BED3", 0) # separate by chrom the chip library if Utility.fileExists(opt.chipreadfile): SeparateByChrom.separateByChrom(chroms, opt.chipreadfile, '.bed1') else: print 
opt.chipreadfile, " not found" sys.exit(1) # separate by chrom the control library if Utility.fileExists(opt.controlreadfile): SeparateByChrom.separateByChrom(chroms, opt.controlreadfile, '.bed2') else: print opt.controlreadfile, " not found" sys.exit(1) island_chip_readcount = {} island_control_readcount = {} for chrom in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: island_list = islands[chrom] if Utility.is_bed_sorted(island_list) == 0: island_list.sort(key=operator.attrgetter('start')) island_start_list = [] island_end_list = [] for item in island_list: island_start_list.append(item.start) island_end_list.append(item.end) island_chip_readcount_list = [0] * len(island_list) read_file = chrom + ".bed1" f = open(read_file, 'r') for line in f: if not re.match("#", line): line = line.strip() sline = line.split() position = associate_tags_with_regions.tag_position( sline, opt.fragment_size) index = associate_tags_with_regions.find_readcount_on_islands( island_start_list, island_end_list, position) if index >= 0: island_chip_readcount_list[index] += 1 totalchip += 1 f.close() island_chip_readcount[chrom] = island_chip_readcount_list island_control_readcount_list = [0] * len(island_list) read_file = chrom + ".bed2" f = open(read_file, 'r') for line in f: if not re.match("#", line): line = line.strip() sline = line.split() position = associate_tags_with_regions.tag_position( sline, opt.fragment_size) index = associate_tags_with_regions.find_readcount_on_islands( island_start_list, island_end_list, position) if index >= 0: island_control_readcount_list[index] += 1 totalcontrol += 1 f.close() island_control_readcount[chrom] = island_control_readcount_list chip_background_read = chip_library_size - totalchip control_background_read = control_library_size - totalcontrol #scaling_factor = chip_background_read*1.0/control_background_read; scaling_factor = chip_library_size * 1.0 / control_library_size print "Total number of chip reads on islands is: ", 
totalchip print "Total number of control reads on islands is: ", totalcontrol #print "chip_background_read ", chip_background_read #print "control_background_read ", control_background_read out = open(opt.out_file, 'w') pvalue_list = [] result_list = [] for chrom in chroms: if chrom in islands.keys(): if len(islands[chrom]) != 0: island_list = islands[chrom] for index in xrange(len(island_list)): item = island_list[index] observation = (island_chip_readcount[chrom])[index] control_tag = (island_control_readcount[chrom])[index] if (island_control_readcount[chrom])[index] > 0: #average = (island_control_readcount[chrom])[index] * scaling_factor; average = control_tag * scaling_factor fc = float(observation) / float(average) else: length = item.end - item.start + 1 average = length * control_library_size * 1.0 / genomesize average = min(0.25, average) * scaling_factor fc = float(observation) / float(average) if observation > average: pvalue = scipy.stats.poisson.sf( (island_chip_readcount[chrom])[index], average)[()] else: pvalue = 1 pvalue_list.append(pvalue) item_dic = {} item_dic['chrom'] = item.chrom item_dic['start'] = item.start item_dic['end'] = item.end item_dic['chip'] = observation item_dic['control'] = control_tag item_dic['pvalue'] = pvalue item_dic['fc'] = fc result_list.append(item_dic) pvaluearray = scipy.array(pvalue_list) pvaluerankarray = scipy.stats.rankdata(pvaluearray) totalnumber = len(result_list) for i in range(totalnumber): item = result_list[i] alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i] if alpha > 1: alpha = 1 outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str( item['end']) + "\t" + str(item['chip']) + "\t" + str( item['control']) + "\t" + str(item['pvalue']) + "\t" + str( item['fc']) + "\t" + str(alpha) + "\n" out.write(outline) #pvalue_list.sort() #for item in result_list: #pvalue = float(item['pvalue']) #alpha = pvalue * len(result_list) / (pvalue_list.index(pvalue) + 1) #if alpha > 1: #alpha = 1; #outline = 
item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n"; #out.write(outline); out.close() SeparateByChrom.cleanup(chroms, '.bed1') SeparateByChrom.cleanup(chroms, '.bed2')
def main(argv):
    """Compare the read counts of two libraries, A and B, on a set of islands.

    The command line in ``argv`` supplies the species, the two raw read files,
    the fragment size, the island file and the output file.  For every island
    the reads of each library are counted and one tab-separated row is written
    to the output file: raw and per-million normalized read counts, fold
    changes (using a pseudo count of 1), p-values and FDRs in both directions
    (A vs B and B vs A).  The p-values and FDRs come from the ``pvaule`` and
    ``fdr`` helpers defined outside this function.  Pearson and Spearman
    correlations of the normalized counts are printed at the end.

    Reads are taken from per-chromosome files named ``<chrom>.bed1`` (library
    A) and ``<chrom>.bed2`` (library B), requested from
    ``SeparateByChrom.separateByChrom`` and removed again through
    ``SeparateByChrom.cleanup``.

    The process exits with status 1 when fewer than 12 command-line tokens
    are given, the species is unknown, a read file is missing or one of the
    libraries is empty.

    Messages are printed with single-argument ``print(...)`` calls so the
    emitted text is the same whether ``print`` is a statement or a function.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a", "--rawreadfileA", action="store", type="string",
                      dest="readfileA", metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b", "--rawreadfileB", action="store", type="string",
                      dest="readfileB", metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")

    (opt, args) = parser.parse_args(argv)
    # Six options, each a flag plus its value.
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    for raw_read_file in (opt.readfileA, opt.readfileB):
        if not Utility.fileExists(raw_read_file):
            print("%s  not found" % (raw_read_file,))
            sys.exit(1)

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    print("Library size of  %s :  %s" % (opt.readfileA, A_library_size))
    print("Library size of  %s :  %s" % (opt.readfileB, B_library_size))
    # Both sizes are used as divisors below; stop before any temporary file
    # is created instead of failing later with a ZeroDivisionError.
    if A_library_size == 0 or B_library_size == 0:
        print("At least one of the libraries is empty, exiting")
        sys.exit(1)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the A library
    SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1')
    # separate by chrom the B library
    SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2')

    def count_reads_on_islands(read_file, start_list, end_list):
        """Return the per-island read counts for one per-chromosome file."""
        counts = [0] * len(start_list)
        with open(read_file, 'r') as f:
            for line in f:
                if re.match("#", line):
                    continue
                sline = line.strip().split()
                position = associate_tags_with_regions.tag_position(
                    sline, opt.fragment_size)
                index = associate_tags_with_regions.find_readcount_on_islands(
                    start_list, end_list, position)
                # Only a non-negative index is used as an island hit.
                if index >= 0:
                    counts[index] += 1
        return counts

    # Find read counts on the islands
    island_chroms = []        # chromosomes that actually carry islands
    sorted_islands = {}       # chrom -> island list in counting order
    island_A_readcount = {}
    island_B_readcount = {}
    totalA = 0
    totalB = 0
    for chrom in chroms:
        if chrom not in islands.keys() or len(islands[chrom]) == 0:
            continue
        island_list = islands[chrom]
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))
        # Hold on to the list that was just sorted so the counts below stay
        # aligned with the islands written out later.
        island_chroms.append(chrom)
        sorted_islands[chrom] = island_list

        island_start_list = [item.start for item in island_list]
        island_end_list = [item.end for item in island_list]

        island_A_readcount[chrom] = count_reads_on_islands(
            chrom + ".bed1", island_start_list, island_end_list)
        totalA += sum(island_A_readcount[chrom])
        island_B_readcount[chrom] = count_reads_on_islands(
            chrom + ".bed2", island_start_list, island_end_list)
        totalB += sum(island_B_readcount[chrom])

    print("Total number of A reads on islands is:  %s" % (totalA,))
    print("Total number of B reads on islands is:  %s" % (totalB,))

    # Calculate the p value.
    library_scaling_factor = A_library_size * 1.0 / B_library_size  # A vs B
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []
    for chrom in island_chroms:
        count_pairs = zip(island_A_readcount[chrom],
                          island_B_readcount[chrom])
        for Acount, Bcount in count_pairs:
            pvalue_A_vs_B_list.append(
                pvaule(Acount, Bcount, library_scaling_factor, pseudo_count))
            pvalue_B_vs_A_list.append(
                pvaule(Bcount, Acount, 1 / library_scaling_factor,
                       pseudo_count))

    # Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    # Output the islands read counts, normalized read counts, fc, pvalue
    # both ways
    scaling_factor = 1000000
    header = ['#chrom', 'start', 'end', "Readcount_A",
              'Normalized_Readcount_A', 'ReadcountB',
              'Normalized_Readcount_B', "Fc_A_vs_B", "pvalue_A_vs_B",
              "FDR_A_vs_B", "Fc_B_vs_A", "pvalue_B_vs_A", "FDR_B_vs_A"]
    with open(opt.out_file, 'w') as out:
        out.write("\t".join(header) + "\n")
        ii = 0  # position in the flat p-value / FDR lists
        for chrom in island_chroms:
            rows = zip(sorted_islands[chrom], island_A_readcount[chrom],
                       island_B_readcount[chrom])
            for item, Acount, Bcount in rows:
                normalized_A = Acount / float(A_library_size) * scaling_factor
                normalized_B = Bcount / float(B_library_size) * scaling_factor
                fc_A_vs_B = ((Acount + pseudo_count) * 1.0 /
                             (Bcount + pseudo_count)) / library_scaling_factor
                fc_B_vs_A = ((Bcount + pseudo_count) * 1.0 /
                             (Acount + pseudo_count)) * library_scaling_factor
                fields = [item.chrom, str(item.start), str(item.end),
                          str(Acount), str(normalized_A),
                          str(Bcount), str(normalized_B),
                          str(fc_A_vs_B), str(pvalue_A_vs_B_list[ii]),
                          str(fdr_A_vs_B_list[ii]),
                          str(fc_B_vs_A), str(pvalue_B_vs_A_list[ii]),
                          str(fdr_B_vs_A_list[ii])]
                out.write("\t".join(fields) + "\n")
                ii += 1

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')

    if not island_chroms:
        # Nothing to correlate; the output file holds only the header.
        print("No islands found, correlations are not computed")
        return

    # Calculate the correlations using normalized read counts.  Each
    # chromosome is prepended, for A and B alike, so the pairing is kept.
    A_array = ()
    B_array = ()
    for chrom in island_chroms:
        A_array = scipy.concatenate(
            (scipy.array(island_A_readcount[chrom]), A_array))
        B_array = scipy.concatenate(
            (scipy.array(island_B_readcount[chrom]), B_array))
    # Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print("Pearson's correlation is:  %s  with p-value  %s"
          % (pearson[0], pearson[1]))
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print("Spearman's correlation is:  %s  with p-value  %s"
          % (spearman[0], spearman[1]))
def main(argv):
    """Score islands by comparing ChIP reads against control reads.

    The command line in ``argv`` supplies the species, the ChIP and control
    read files, the fragment size, the island file, the output file and the
    mappable fraction of the genome.  Both read files are processed with the
    BAM routines (``get_total_tag_counts_bam`` and
    ``SeparateByChrom.separateByChromBamToBed``).

    For every island the ChIP and control reads are counted.  The expected
    ChIP count is the control count scaled by the library-size ratio; when the
    island has no control read, a genome-wide background (island length *
    control library size / effective genome size, capped at 0.25) is scaled
    instead.  A Poisson survival-function p-value is computed when the
    observation exceeds the expectation (otherwise the p-value is 1), and a
    rank-based ``p * N / rank`` value capped at 1 is reported in the last
    column.  One tab-separated row per island is written: chrom, start, end,
    ChIP count, control count, p-value, fold change, adjusted value.

    The process exits with status 1 when fewer than 14 command-line tokens
    are given, the species is unknown, a read file is missing or a library is
    empty.

    Messages are printed with single-argument ``print(...)`` calls so the
    emitted text is the same whether ``print`` is a statement or a function.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    # Both read files go through the BAM readers below, so the help text for
    # the ChIP file says BAM as well.
    parser.add_option("-a", "--rawchipreadfile", action="store",
                      type="string", dest="chipreadfile", metavar="<file>",
                      help="raw read file from chip in BAM format")
    parser.add_option("-b", "--rawcontrolreadfile", action="store",
                      type="string", dest="controlreadfile",
                      metavar="<file>",
                      help="raw read file from control in BAM format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")

    (opt, args) = parser.parse_args(argv)
    # Seven options, each a flag plus its value.
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        genomesize = sum(GenomeData.species_chrom_lengths[opt.species].values())
        genomesize = opt.fraction * genomesize
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # Check the inputs before anything tries to read them, so a missing file
    # is reported with this message rather than by a failure in a reader.
    for raw_read_file in (opt.chipreadfile, opt.controlreadfile):
        if not Utility.fileExists(raw_read_file):
            print("%s  not found" % (raw_read_file,))
            sys.exit(1)

    chip_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts_bam(
        opt.controlreadfile)
    print("chip library size  %s" % (chip_library_size,))
    print("control library size  %s" % (control_library_size,))
    # An empty library makes the scaling factor or the expected count zero
    # and ends in a division by zero further down; stop with a clear message.
    if chip_library_size == 0 or control_library_size == 0:
        print("At least one of the libraries is empty, exiting")
        sys.exit(1)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the chip library
    SeparateByChrom.separateByChromBamToBed(chroms, opt.chipreadfile, '.bed1')
    # separate by chrom the control library
    SeparateByChrom.separateByChromBamToBed(chroms, opt.controlreadfile,
                                            '.bed2')

    def count_reads_on_islands(read_file, start_list, end_list):
        """Return the per-island read counts for one per-chromosome file."""
        counts = [0] * len(start_list)
        with open(read_file, 'r') as f:
            for line in f:
                if re.match("#", line):
                    continue
                sline = line.strip().split()
                position = associate_tags_with_regions.tag_position(
                    sline, opt.fragment_size)
                index = associate_tags_with_regions.find_readcount_on_islands(
                    start_list, end_list, position)
                # Only a non-negative index is used as an island hit.
                if index >= 0:
                    counts[index] += 1
        return counts

    island_chroms = []        # chromosomes that actually carry islands
    sorted_islands = {}       # chrom -> island list in counting order
    island_chip_readcount = {}
    island_control_readcount = {}
    totalchip = 0
    totalcontrol = 0
    for chrom in chroms:
        if chrom not in islands.keys() or len(islands[chrom]) == 0:
            continue
        island_list = islands[chrom]
        if Utility.is_bed_sorted(island_list) == 0:
            island_list.sort(key=operator.attrgetter('start'))
        # Hold on to the list that was just sorted so the counts below stay
        # aligned with the islands written out later.
        island_chroms.append(chrom)
        sorted_islands[chrom] = island_list

        island_start_list = [item.start for item in island_list]
        island_end_list = [item.end for item in island_list]

        island_chip_readcount[chrom] = count_reads_on_islands(
            chrom + ".bed1", island_start_list, island_end_list)
        totalchip += sum(island_chip_readcount[chrom])
        island_control_readcount[chrom] = count_reads_on_islands(
            chrom + ".bed2", island_start_list, island_end_list)
        totalcontrol += sum(island_control_readcount[chrom])

    # The ratio of whole library sizes is used for scaling.
    scaling_factor = chip_library_size * 1.0 / control_library_size

    print("Total number of chip reads on islands is:  %s" % (totalchip,))
    print("Total number of control reads on islands is:  %s"
          % (totalcontrol,))

    pvalue_list = []
    result_list = []  # (chrom, start, end, chip, control, pvalue, fc)
    for chrom in island_chroms:
        rows = zip(sorted_islands[chrom], island_chip_readcount[chrom],
                   island_control_readcount[chrom])
        for item, observation, control_tag in rows:
            if control_tag > 0:
                average = control_tag * scaling_factor
            else:
                # No control read on this island: fall back to the
                # genome-wide control density, capped at 0.25 reads.
                length = item.end - item.start + 1
                average = length * control_library_size * 1.0 / genomesize
                average = min(0.25, average) * scaling_factor
            fc = float(observation) / float(average)
            if observation > average:
                # [()] unwraps the zero-dimensional result into a scalar.
                pvalue = scipy.stats.poisson.sf(observation, average)[()]
            else:
                pvalue = 1
            pvalue_list.append(pvalue)
            result_list.append((item.chrom, item.start, item.end,
                                observation, control_tag, pvalue, fc))

    pvaluerankarray = scipy.stats.rankdata(scipy.array(pvalue_list))
    totalnumber = len(result_list)
    with open(opt.out_file, 'w') as out:
        for i, result in enumerate(result_list):
            (chrom, start, end, chip, control, pvalue, fc) = result
            alpha = pvalue * totalnumber / pvaluerankarray[i]
            if alpha > 1:
                alpha = 1
            fields = [chrom, str(start), str(end), str(chip), str(control),
                      str(pvalue), str(fc), str(alpha)]
            out.write("\t".join(fields) + "\n")

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
def Calculate3UTRUsage(entrez_genes, bedfile, column_index, chroms,
                       fragment_size, downstream_extension, outfile):
    """Append one 3'UTR usage row per gene to ``outfile``.

    entrez genes are made sure to be on one strand, the bed file are reads
    for that strand.

    entrez_genes: a KnownEntrezGenes class object.
    bedfile: raw read file; it needs to conform to bed format.
    column_index: column in the bed file used for sorting when the reads are
        split into per-chromosome files.
    chroms: chromosomes to process.
    fragment_size: passed to ``associate_tags_with_regions.tag_position``
        for every read.
    downstream_extension: passed to ``gene.get_3UTRs``.
    outfile: opened in append mode, so existing content is kept.

    Each row holds, tab separated: entrez id, length of the 3'UTR union, the
    value returned by ``CUTR_vs_AUTR``, the number of reads inside the union,
    the comma-joined transcript names and the comma-joined gene symbols.

    Genes whose 3'UTRs do not merge into a single region are reported on
    stdout and skipped; genes with no 3'UTR region at all are skipped.
    Chromosomes without a per-chromosome read file are skipped as well.
    Exits with status 1 when ``bedfile`` does not exist.
    """
    # Separate reads by chrom
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print("%s %s  files do not exist, separate by chroms and sort each file according to the second column. "
                  % (chroms, rawreadsextension1))
            Utility_extended.separate_by_chrom_sort(
                chroms, bedfile, rawreadsextension1, [column_index])
    else:
        print("%s  is not found" % (bedfile,))
        sys.exit(1)

    # Here the output is 'a'
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))

            # Get the read locations.  Without a read file for this
            # chromosome there is nothing to count, and reusing the reads of
            # a previous chromosome would be wrong, so move on.
            chrom_read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(chrom_read_file):
                continue
            with open(chrom_read_file, 'r') as f:
                tag_positions = [
                    associate_tags_with_regions.tag_position(
                        line.strip().split(), fragment_size)
                    for line in f
                ]
            if not Utility_extended.is_list_sorted(tag_positions):
                tag_positions.sort()

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                three_UTRs = gene.get_3UTRs(downstream_extension)
                # Find the union of 3UTRs [(start, end)], returns a
                # [(start, end)]
                union = Utility_extended.union(three_UTRs)
                if len(union) > 1:
                    print("There are disjoint 3UTRs in %s" % (str(entrez_id)))
                    continue
                if len(union) == 0:
                    # No 3'UTR region: union[0] below would not exist.
                    continue

                # returns [((start, end), [tag_positions])],
                # [tag_positions] = return[0][1]
                inside_reads = (
                    Utility_extended.associate_simple_tags_with_regions(
                        tag_positions, union))[0][1]
                total_read_count = len(inside_reads)
                RUD = CUTR_vs_AUTR(three_UTRs, inside_reads, gene.strand)

                # Distinct gene symbols, in order of first appearance.
                gene_symbol = []
                for mytranscript in gene.transcripts:
                    symbol = mytranscript.additional_annotations[0]
                    if symbol not in gene_symbol:
                        gene_symbol.append(symbol)

                union_length = union[0][1] - union[0][0] + 1
                fields = [
                    str(entrez_id),
                    str(union_length),
                    str(RUD),
                    str(total_read_count),
                    ','.join([transcript.name
                              for transcript in gene.transcripts]),
                    ','.join(gene_symbol),
                ]
                outf.write("\t".join(fields) + "\n")
def calculateExonIntrons_by_chrom(entrez_genes_by_chrom, chrombed,
                                  fragment_size, totalcount, out_file=None):
    """Count reads on exons, introns and merged transcripts of each gene.

    entrez_genes_by_chrom is a Entrez class object in a particular chrom
    chrombed is the bed file of reads for that particular chrom; when the
        file does not exist the chromosome is treated as having no reads
    fragment_size is passed to associate_tags_with_regions.tag_position
    totalcount: library size used for RPKM
    out_file default is None and not writing to file; otherwise one
        tab-separated row per gene is appended to it

    return:
    reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    reads_on_merged_transcripts = {}  # {entrezID:[((start, end), read_count)]}
    summary = {}  # {entrezID:{attribute:value}} with the attributes
        merged_exons_rc, merged_exon_RPKM, merged_exons_total_length,
        shared_exons_rc, shared_exon_RPKM, shared_exons_total_length,
        shared_introns_rc, shared_intron_RPKM, shared_introns_total_length,
        merged_transcript_rc, merged_transcript_RPKM,
        merged_transcript_length
    """
    reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    reads_on_merged_transcripts = {}  # {entrezID:[((start, end), read_count)]}
    summary = {}  # {entrezID:{attribute:value}}

    def count_reads(regions):
        """Return (per-region counts, total length, read count, RPKM)."""
        # tag_positions is required to be [(position, annotation)]
        # element in result_list has structure: [((start, end), read_count)]
        result_list = Utility_extended.get_read_counts_on_regions(
            tag_positions, regions)
        total_length = sum([region[1] - region[0] + 1 for region in regions])
        read_count = sum([item[1] for item in result_list])
        rpkm = read_count * (1000.0 / total_length) * (
            1000000.0 / float(totalcount))
        return result_list, total_length, read_count, rpkm

    outf = None
    if out_file is not None:
        outf = open(out_file, 'a')
    try:
        # Bound up front so a missing read file means "no reads" instead of
        # an undefined name once the genes are processed.
        tag_positions = []
        if Utility_extended.fileExists(chrombed):
            # load in each read
            tag_position_list = []
            with open(chrombed, 'r') as inf:
                for line in inf:
                    if not re.match("#", line):
                        sline = line.strip().split()
                        tag_position_list.append(
                            associate_tags_with_regions.tag_position(
                                sline, fragment_size))
            if not Utility_extended.is_list_sorted(tag_position_list):
                tag_position_list.sort()
            # convert its form to acceptable by get_read_counts_on_regions;
            # here the annotation is not present, so 0 stands in for it
            tag_positions = [(position, 0) for position in tag_position_list]

        if entrez_genes_by_chrom.num_genes > 0:
            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                gene = (entrez_genes_by_chrom.entrez_genes)[entrez_id]

                shared_exons = gene.shared_exonic_regions  # sorted in absolute coordinate
                if shared_exons == []:  # No shared exons
                    reads_on_shared_exons[entrez_id] = []
                    shared_exons_rc = 0
                    shared_exons_total_length = 0
                    shared_exon_RPKM = 0
                else:
                    (reads_on_shared_exons[entrez_id],
                     shared_exons_total_length,
                     shared_exons_rc,
                     shared_exon_RPKM) = count_reads(shared_exons)

                shared_introns = gene.shared_intronic_regions  # sorted
                if shared_introns == []:  # No shared introns
                    reads_on_shared_introns[entrez_id] = []
                    shared_introns_rc = 0
                    shared_introns_total_length = 0
                    shared_intron_RPKM = 0
                else:
                    (reads_on_shared_introns[entrez_id],
                     shared_introns_total_length,
                     shared_introns_rc,
                     shared_intron_RPKM) = count_reads(shared_introns)

                merged_exons = gene.merged_exonic_regions  # sorted
                if merged_exons == []:
                    merged_exons_total_length = 0
                    merged_exons_rc = 0
                    merged_exon_RPKM = 0.0
                else:
                    # The per-region counts of merged exons are not returned.
                    (_unused,
                     merged_exons_total_length,
                     merged_exons_rc,
                     merged_exon_RPKM) = count_reads(merged_exons)

                merged_transcript = gene.boundaries  # [(start, end)]
                (reads_on_merged_transcripts[entrez_id],
                 merged_transcript_length,
                 merged_transcript_rc,
                 merged_transcript_RPKM) = count_reads(merged_transcript)

                if outf is not None:
                    # Distinct gene symbols, in order of first appearance.
                    gene_symbol = []
                    for transcript in gene.transcripts:
                        symbol = transcript.additional_annotations[0]
                        if symbol not in gene_symbol:
                            gene_symbol.append(symbol)
                    fields = [str(value) for value in (
                        entrez_id,
                        merged_exons_rc,
                        merged_exons_total_length,
                        merged_exon_RPKM,
                        shared_exons_rc,
                        shared_exons_total_length,
                        shared_exon_RPKM,
                        shared_introns_rc,
                        shared_introns_total_length,
                        shared_intron_RPKM,
                        merged_transcript_rc,
                        merged_transcript_length,
                        merged_transcript_RPKM,
                    )]
                    fields.append(','.join(
                        [transcript.name for transcript in gene.transcripts]))
                    fields.append(','.join(gene_symbol))
                    outf.write('\t'.join(fields) + '\n')

                summary[entrez_id] = {
                    "merged_exons_rc": merged_exons_rc,
                    "merged_exon_RPKM": merged_exon_RPKM,
                    "merged_exons_total_length": merged_exons_total_length,
                    "shared_exons_rc": shared_exons_rc,
                    "shared_exon_RPKM": shared_exon_RPKM,
                    "shared_exons_total_length": shared_exons_total_length,
                    "shared_introns_rc": shared_introns_rc,
                    "shared_intron_RPKM": shared_intron_RPKM,
                    "shared_introns_total_length": shared_introns_total_length,
                    "merged_transcript_rc": merged_transcript_rc,
                    "merged_transcript_RPKM": merged_transcript_RPKM,
                    "merged_transcript_length": merged_transcript_length,
                }
    finally:
        # Close the output even when a gene fails half-way through.
        if outf is not None:
            outf.close()

    return (reads_on_shared_exons, reads_on_shared_introns,
            reads_on_merged_transcripts, summary)
def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold,
                       PAfile, extension, index):
    """Append one 3'UTR usage row per gene with several polyA sites.

    entrez genes are made sure to be on one strand, the bed file are reads
    for that strand.

    entrez_genes: a KnownEntrezGenes class object.
    bedfile: raw read file; it needs to conform to bed format.
    chroms: chromosomes to process.
    outfile: opened in append mode, i.e. the output is appended to an
        existing file instead of replacing it.
    threshold: passed to ``CUTR_vs_AUTR``.
    PAfile: tab-separated file of polyadenylation sites; the first two
        columns of every non-blank line are kept as strings.
    extension: passed to ``Mastertuplemaker``.
    index: column in the bed file used for sorting when the reads are split
        into per-chromosome files (handed over as a string).

    Each row holds, tab separated: entrez id, chromosome, strand, the basic
    RUD and the comma-joined RUD values returned by ``CUTR_vs_AUTR``.
    Chromosomes without a per-chromosome read file are skipped.  Exits with
    status 1 when ``bedfile`` does not exist.
    """
    # Separate reads by chrom
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print("%s %s  files do not exist, separate by chroms and sort each file according to the second column. "
                  % (chroms, rawreadsextension1))
            Utility_extended.separate_by_chrom_sort(
                chroms, bedfile, rawreadsextension1, str(index))
    else:
        print("%s  is not found" % (bedfile,))
        sys.exit(1)

    # This part is to access the polyadenylation sites
    PAsiteslist = []
    with open(PAfile, 'r') as pa_file:
        for pa_line in pa_file:
            pa_line = pa_line.strip('\n')
            if not pa_line.strip():
                # A blank line has no second column to read.
                continue
            pa_fields = pa_line.split('\t')
            PAsiteslist.append((pa_fields[0], pa_fields[1]))

    # Here the output is 'a', i.e. the output is appended to an existing file
    # instead of creating one
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))

            # Get the read locations.  Without a read file for this
            # chromosome there is nothing to count, and reusing the reads of
            # a previous chromosome would be wrong, so move on.
            chrom_read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(chrom_read_file):
                continue
            # make sure the extension is always 0, otherwise the rest of the
            # program might not work as intended
            with open(chrom_read_file, 'r') as f:
                tag_positions = [
                    associate_tags_with_regions.tag_position(
                        line.strip().split(), 0)
                    for line in f
                ]
            if not Utility_extended.is_list_sorted(tag_positions):
                tag_positions.sort()
            # By this point tag_positions is a sorted list of all the reads
            # located on the strand and chromosome currently dealt with

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                # get_3UTRs gets the ENTREZ 3'UTR, which appears to generally
                # give the beginning of the 3'UTR and a site very close to
                # the most distal polyadenylation site
                three_UTRs = gene.get_3UTRs()
                # Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites
                # given to create the true data for the 3'UTR needed for
                # CUTR_vs_AUTR to work
                (true3UTRstarts, true3UTRends, UTRregion_start,
                 UTRregion_end, UTRbeginning) = Mastertuplemaker(
                    three_UTRs, PAsiteslist, chrom, gene.strand, extension)

                # Only 3'UTRs with more than one polyA site need be
                # considered, so the bound below should stay at 1.
                if len(true3UTRends) > 1:
                    # find all reads inside the 3'UTR
                    inside_reads = associate_tags_with_3UTR(
                        tag_positions, UTRregion_start, UTRregion_end)
                    # finds reads in each region of the 3'UTR and calculates
                    # aUTR/cUTR for each of them.  PolyAsites (together with
                    # UTRbeginning above) is not written out here; it is what
                    # a per-species polyA listing would report instead of the
                    # RUD columns.
                    RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(
                        true3UTRstarts, true3UTRends, inside_reads,
                        gene.strand, threshold)

                    # outline to use to output RUDs
                    fields = [str(entrez_id), chrom, gene.strand,
                              str(basic_RUD), ",".join(map(str, RUDs))]
                    outf.write("\t".join(fields) + "\n")