def main(argv): parser = OptionParser() parser.add_option("-p", "--peakfile", action="store", type="string", dest="peakfile", help="input ucsc file for PA peaks ", metavar="<file>") parser.add_option("-u", "--annotationfile", action="store", type="string", dest="annotationfile", help="pickle file for annotations ", metavar="<file>") parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option("-s", "--species", action="store", type="string", dest="species",help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-t", "--peak_threshold", action="store", type="int", dest="peak_threshold",help="Peak threshold", metavar="<int>") parser.add_option("-d", "--3UTRdownstreamextension", action="store", type="int", dest="downstream_extension",help="3UTR down stream extension", metavar="<int>") (opt, args) = parser.parse_args(argv) if len(argv) < 12: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting"; sys.exit(1); # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes, a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: print "Testing gene structure" test_id = 54 Entrez.test_gene_structure(entrez_gene_collection, test_id) # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_unique_cdsEnd() print "There are ", len(entrez_ids_with_unique_cdsEnd), " Entrez IDs each of which has a unique cdsEnd." # Additional filter to remove clusters with intron-containing 3UTRs allowance=0 ids=entrez_ids_with_unique_cdsEnd entrez_ids_with_intronless_3UTRs = entrez_gene_collection.get_ids_with_intronless_3UTR(allowance, ids) print "There are %d Entrez_ids with additional requirement of intronless 3UTR: ", %(len(entrez_ids_with_intronless_3UTRs)) entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_with_intronless_3UTRs)) peaks_on_entrez_3UTRs = AssignPeaksToEntrez3UTRs(entrez_gene_subset, opt.peakfile, chroms, chrom_lengths, opt.peak_threshold, opt.downstream_extension) output = open(libName + "_PA_Peaks_associated_with_Annotations.pkl", 'wb') pickle.dump(peaks_on_entrez_3UTRs, output) output.close() Calculate3UTRUsage(peaks_on_entrez_3UTRs, final_entrez_id_collection, opt.outfile)
def main(argv): parser = OptionParser() parser.add_option( "-f", "--forwardreadfile", action="store", type="string", dest="ReadsOnForwardStrand", help="input bed file for RNASeq raw reads on forward strand", metavar="<file>") parser.add_option( "-r", "--reversereadfile", action="store", type="string", dest="ReadsOnReverseStrand", help="input bed file for RNASeq raw reads on reverse strand", metavar="<file>") parser.add_option( "-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help= "file with curated known genes clustered by entrez ID in pickle format" ) parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-p", "--PAfile", action="store", type="string", dest="PAfile", help="input bed3 file", metavar="<file>") parser.add_option( "-e", "--extension", action="store", type="int", dest="extension", help= "integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end", metavar="<float>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) startTime = time.time() allowance = 10 if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: print "Testing gene structure" test_id = 79947 Entrez.test_gene_structure(entrez_gene_collection, test_id) # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd( ) print "There are ", len(entrez_ids_with_unique_cdsEnd ), " Entrez IDs each of which has a unique cdsEnd." #get total read count totalcount_F = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnForwardStrand) totalcount_R = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnReverseStrand) totalcount = totalcount_F + totalcount_R print totalcount_F, totalcount_R #Clear the file and write the first line outf = open(opt.outfile, 'w') #outline to use to output polyA information for a species #outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "UTRstart" + "\t" + "PolyAsites" + "\n" #outline to use to output RUDs outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "Basic_RUD" + "\t" + "List_of_subRUDs" + "\n" outf.write(outline) outf.close() #index: column in bed file for sorting index = 2 print "Process genes on forward strand" entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids( "+", entrez_ids_with_unique_cdsEnd) print "There are ", len( entrez_ids_on_forward_strand), " Entrez IDs on forward strand." entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index) print "Process genes on reverse strand" entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids( "-", entrez_ids_with_unique_cdsEnd) print "There are ", len( entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand." entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index) print "it took", time.time() - startTime, "seconds."
def main(argv): parser = OptionParser() parser.add_option("-r", "--readfile", action="store", type="string", dest="Reads", help="input bed file for non-strand specific raw reads", metavar="<file>") parser.add_option( "-g", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option( "-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help= "file with curated known genes clustered by entrez ID in pickle format" ) parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", metavar="<file>", help="output file name for genes and tag numbers") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") (opt, args) = parser.parse_args(argv) if len(argv) < 10: parser.print_help() sys.exit(1) startTime = time.time() if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_gene_collection is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: test_id = 54 Entrez.test_gene_structure(entrez_gene_collection, test_id) rawreadslibName1 = (opt.Reads).split('/')[-1] rawreadssuffix1 = rawreadslibName1.split('.')[-1] rawreadslibName1 = rawreadslibName1.split('.')[0] rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1" totalcount = 0 if Utility_extended.fileExists(opt.Reads) == 1: totalcount = get_total_tag_counts.get_total_tag_counts(opt.Reads) else: # if the all file exist, then use the all file, otherwise use the chrom separated file for chrom in chroms: chrombed = chrom + rawreadsextension1 totalcount1 = get_total_tag_counts.get_total_tag_counts(chrombed) print chrom, totalcount1 totalcount += totalcount1 (reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) = calculate_non_strandspecific_rc_on_ExonIntrons( entrez_gene_collection, opt.Reads, chroms, opt.fragment_size) #Clear the file. outf = open(opt.outfile, 'w') outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n" outf.write(outline) for entrez_id in entrez_gene_collection.entrez_ids: gene = (entrez_gene_collection.entrez_genes)[entrez_id] gene_symbol = [] for transcript in gene.transcripts: if transcript.additional_annotations[0] not in gene_symbol: gene_symbol.append(transcript.additional_annotations[0]) outline = str(entrez_id) + '\t' + str( summary[entrez_id]["merged_exons_rc"] ) + '\t' + str( summary[entrez_id]["merged_exons_total_length"] ) + '\t' + str(summary[entrez_id]["merged_exon_RPKM"]) + '\t' + str( summary[entrez_id]["shared_exons_rc"] ) + '\t' + str( summary[entrez_id]["shared_exons_total_length"] ) + '\t' + str(summary[entrez_id]["shared_exon_RPKM"]) + '\t' + str( summary[entrez_id]["shared_introns_rc"] ) + '\t' + str( summary[entrez_id]["shared_introns_total_length"] ) + '\t' + str(summary[entrez_id]["shared_intron_RPKM"]) + '\t' + str( summary[entrez_id]["merged_transcript_rc"]) + '\t' + str( summary[entrez_id]["merged_transcript_length"]) + '\t' + str( summary[entrez_id] ["merged_transcript_RPKM"]) + '\t' + ','.join([ transcript.name for transcript in gene.transcripts ]) + '\t' + ','.join(gene_symbol) + '\n' outf.write(outline) outf.close() # {entrezID:[((start, end), read_count)]} name = opt.outfile + "_shared_exons.pkl" output = open(name, 'wb') pickle.dump(reads_on_shared_exons, output) output.close() # {entrezID:[((start, end), read_count)]} name = opt.outfile + "_shared_introns.pkl" output = open(name, 'wb') pickle.dump(reads_on_shared_introns, output) output.close() #store the info in a pickle file name = opt.outfile + "_merged_transcripts.pkl" output = open(name, 'wb') pickle.dump(reads_on_merged_transcripts, output) output.close() name = opt.outfile + "_summary.pkl" output = open(name, 'wb') pickle.dump(summary, output) output.close() print "it took", time.time() - startTime, "seconds."
def main(argv): parser = OptionParser() parser.add_option( "-f", "--forwardreadfile", action="store", type="string", dest="ReadsOnForwardStrand", help="input bed file for RNASeq raw reads on forward strand", metavar="<file>") parser.add_option( "-r", "--reversereadfile", action="store", type="string", dest="ReadsOnReverseStrand", help="input bed file for RNASeq raw reads on reverse strand", metavar="<file>") parser.add_option( "-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help= "file with curated known genes clustered by entrez ID in pickle format" ) parser.add_option( "-g", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-d", "--3UTRdownstreamextension", action="store", type="int", dest="downstream_extension", help="3UTR down stream extension", metavar="<int>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) startTime = time.time() allowance = 10 if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: print "Testing gene structure" test_id = 54 Entrez.test_gene_structure(entrez_gene_collection, test_id) # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd( ) print "There are ", len(entrez_ids_with_unique_cdsEnd ), " Entrez IDs each of which has a unique cdsEnd." #get total read count totalcount_F = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnForwardStrand) totalcount_R = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnReverseStrand) totalcount = totalcount_F + totalcount_R print totalcount_F, totalcount_R #Clear the file and write the first line, needs to be modified outf = open(opt.outfile, 'w') #outline = "# Entrez ID \t Main Refseq ID \t 3UTR union length \t Length Index \t PA Multiplicity Index \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n" outline = "# Entrez ID \t 3UTR Union length \t RUD \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n" outf.write(outline) outf.close() #index: column in bed file for sorting index = 2 print "Process genes on forward strand" entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids( "+", entrez_ids_with_unique_cdsEnd) print "There are ", len( entrez_ids_on_forward_strand), " Entrez IDs on forward strand." entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, index, chroms, opt.fragment_size, opt.downstream_extension, opt.outfile) print "Process genes on reverse strand" entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids( "-", entrez_ids_with_unique_cdsEnd) print "There are ", len( entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand." entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, index, chroms, opt.fragment_size, opt.downstream_extension, opt.outfile) print "it took", time.time() - startTime, "seconds."
def main(argv): parser = OptionParser() parser.add_option( "-f", "--forwardreadfile", action="store", type="string", dest="ReadsOnForwardStrand", help="input bed file for RNASeq raw reads on forward strand", metavar="<file>") parser.add_option( "-r", "--reversereadfile", action="store", type="string", dest="ReadsOnReverseStrand", help="input bed file for RNASeq raw reads on reverse strand", metavar="<file>") parser.add_option( "-g", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option( "-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help= "file with curated known genes clustered by entrez ID in pickle format" ) parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", metavar="<file>", help="output file name for genes and tag numbers") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") test = 0 (opt, args) = parser.parse_args(argv) if len(argv) < 12: parser.print_help() sys.exit(1) startTime = time.time() ##################################################################3 #The column numbers are 1 based instead of 0 based! #For positive strand start_index_P = 2 #For negative strand start_index_N = 3 ##################################################################3 if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_gene_collection is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: print "Testing gene structure" test_id = 54 Entrez.test_gene_structure(entrez_gene_collection, test_id) totalcount_F = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnForwardStrand) totalcount_R = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnReverseStrand) totalcount = totalcount_F + totalcount_R print totalcount_F, totalcount_R #Clear the file. outf = open(opt.outfile, 'w') outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n" outf.write(outline) outf.close() # The RNA seq data are strand specific. Only use + reads on genes on forward strand, and - reads on genes on reverse strand. print "Process genes on forward strand" entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids( "+") print "There are ", len( entrez_ids_on_forward_strand), " Entrez IDs on forward strand." entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand)) (forward_reads_on_shared_exons, forward_reads_on_shared_introns, forward_reads_on_merged_transcripts, forward_summary) = calculateExonIntrons(entrez_gene_subset, opt.ReadsOnForwardStrand, start_index_P, chroms, opt.fragment_size, totalcount, opt.outfile) print "Process genes on reverse strand" entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids( "-") print "There are ", len( entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand." entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand)) (reverse_reads_on_shared_exons, reverse_reads_on_shared_introns, reverse_reads_on_merged_transcripts, reverse_summary) = calculateExonIntrons(entrez_gene_subset, opt.ReadsOnReverseStrand, start_index_N, chroms, opt.fragment_size, totalcount, opt.outfile) #combine the densities # {entrezID:[((start, end), read_count)]} reads_on_shared_exons = {} reads_on_shared_exons.update(forward_reads_on_shared_exons) reads_on_shared_exons.update(reverse_reads_on_shared_exons) name = opt.outfile + "_shared_exons.pkl" output = open(name, 'wb') pickle.dump(reads_on_shared_exons, output) output.close() if test == 1: test_distribution_dic(reads_on_shared_exons, test_id) # {entrezID:[((start, end), read_count)]} reads_on_shared_introns = {} reads_on_shared_introns.update(forward_reads_on_shared_introns) reads_on_shared_introns.update(reverse_reads_on_shared_introns) #store the info in a pickle file name = opt.outfile + "_shared_introns.pkl" output = open(name, 'wb') pickle.dump(reads_on_shared_introns, output) output.close() if test == 1: test_distribution_dic(reads_on_shared_introns, test_id) reads_on_merged_transcripts = {} reads_on_merged_transcripts.update(forward_reads_on_merged_transcripts) reads_on_merged_transcripts.update(reverse_reads_on_merged_transcripts) #store the info in a pickle file name = opt.outfile + "_merged_transcripts.pkl" output = open(name, 'wb') pickle.dump(reads_on_merged_transcripts, output) output.close() summary = {} summary.update(forward_summary) summary.update(reverse_summary) name = opt.outfile + "_summary.pkl" output = open(name, 'wb') pickle.dump(summary, output) output.close() print "it took", time.time() - startTime, "seconds."
def main(argv): parser = OptionParser() parser.add_option("-f", "--forwardreadfile", action="store", type="string", dest="ReadsOnForwardStrand", help="input bed file for RNASeq raw reads on forward strand", metavar="<file>") parser.add_option("-r", "--reversereadfile", action="store", type="string", dest="ReadsOnReverseStrand", help="input bed file for RNASeq raw reads on reverse strand", metavar="<file>") parser.add_option("-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help="file with curated known genes clustered by entrez ID in pickle format") parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option("-s", "--species", action="store", type="string", dest="species",help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-p", "--PAfile", action="store", type="string", dest="PAfile", help="input bed3 file", metavar="<file>") parser.add_option("-e", "--extension", action="store", type="int", dest="extension",help="integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end", metavar="<float>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) startTime = time.time() allowance = 10 if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: print "Testing gene structure" test_id = 79947 Entrez.test_gene_structure(entrez_gene_collection, test_id) # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd() print "There are ", len(entrez_ids_with_unique_cdsEnd), " Entrez IDs each of which has a unique cdsEnd." #get total read count totalcount_F = get_total_tag_counts.get_total_tag_counts(opt.ReadsOnForwardStrand) totalcount_R = get_total_tag_counts.get_total_tag_counts(opt.ReadsOnReverseStrand) totalcount = totalcount_F + totalcount_R print totalcount_F, totalcount_R #Clear the file and write the first line outf = open(opt.outfile, 'w') #outline to use to output polyA information for a species #outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "UTRstart" + "\t" + "PolyAsites" + "\n" #outline to use to output RUDs outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "Basic_RUD" + "\t" + "List_of_subRUDs" + "\n" outf.write(outline) outf.close() #index: column in bed file for sorting index = 2 print "Process genes on forward strand" entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids("+", entrez_ids_with_unique_cdsEnd) print "There are ", len(entrez_ids_on_forward_strand), " Entrez IDs on forward strand." entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index) print "Process genes on reverse strand" entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids("-", entrez_ids_with_unique_cdsEnd) print "There are ", len(entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand." entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index) print "it took", time.time() - startTime, "seconds."