Exemple #1
0
def main(argv):
    """Assign PA peaks to Entrez 3'UTRs and pass them to Calculate3UTRUsage.

    Steps, in the order performed below: parse the command line, load the
    pickled Entrez annotation, keep the Entrez IDs returned by the
    unique-cdsEnd filter and then by the intronless-3'UTR filter, call
    AssignPeaksToEntrez3UTRs on that subset, pickle the assignment, and
    finally call Calculate3UTRUsage.

    Args:
        argv: command-line tokens handed to OptionParser.parse_args. Every
            option declared below must be supplied.

    Exits with status 1 after printing the help text when an option is
    missing, or after printing a message when the species is not listed in
    GenomeData.species_chroms.
    """
    parser = OptionParser()
    parser.add_option("-p", "--peakfile", action="store", type="string",
                      dest="peakfile", metavar="<file>",
                      help="input ucsc file for PA peaks ")
    parser.add_option("-u", "--annotationfile", action="store", type="string",
                      dest="annotationfile", metavar="<file>",
                      help="pickle file for annotations ")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="outfile name")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", metavar="<str>",
                      help="species, mm8, hg18, etc")
    parser.add_option("-t", "--peak_threshold", action="store", type="int",
                      dest="peak_threshold", metavar="<int>",
                      help="Peak threshold")
    parser.add_option("-d", "--3UTRdownstreamextension", action="store",
                      type="int", dest="downstream_extension",
                      metavar="<int>", help="3UTR down stream extension")

    (opt, args) = parser.parse_args(argv)

    # Check the parsed values rather than the token count: a count check lets
    # a missing option slip through as None when extra positionals are given,
    # and rejects the valid "--option=value" spelling.
    required = ("peakfile", "annotationfile", "outfile", "species",
                "peak_threshold", "downstream_extension")
    if any(getattr(opt, dest) is None for dest in required):
        parser.print_help()
        sys.exit(1)

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]
    chrom_lengths = GenomeData.species_chrom_lengths[opt.species]

    # entrez_gene_collection is a KnownEntrezGenes class object. Its core,
    # entrez_gene_collection.entrez_genes, is a dict (keyed by entrez_id) of
    # lists of EntrezGene objects.
    # The option's dest is "annotationfile"; there is no opt.entrez_genes here.
    # NOTE: pickle.load can execute code embedded in the file; only use
    # annotation files from a trusted source.
    with open(opt.annotationfile, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # Debug switch: set test to 1 to inspect the structure of one gene.
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter clusters of refseq_ids (keyed by entrez_id) according to the
    # criterion of identical cdsEnd.
    # NOTE(review): the sibling scripts call get_ids_with_unique_cdsEnd();
    # the original spelling here was get_ids_unique_cdsEnd() -- confirm
    # against the Entrez module.
    entrez_ids_with_unique_cdsEnd = (
        entrez_gene_collection.get_ids_with_unique_cdsEnd())
    # Double spaces reproduce the output of the original print statement.
    print("There are  %d  Entrez IDs each of which has a unique cdsEnd."
          % len(entrez_ids_with_unique_cdsEnd))

    # Additional filter to remove clusters with intron-containing 3UTRs.
    allowance = 0
    entrez_ids_with_intronless_3UTRs = (
        entrez_gene_collection.get_ids_with_intronless_3UTR(
            allowance, entrez_ids_with_unique_cdsEnd))
    print("There are %d Entrez_ids with additional requirement of intronless 3UTR: "
          % len(entrez_ids_with_intronless_3UTRs))

    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_with_intronless_3UTRs))

    peaks_on_entrez_3UTRs = AssignPeaksToEntrez3UTRs(
        entrez_gene_subset, opt.peakfile, chroms, chrom_lengths,
        opt.peak_threshold, opt.downstream_extension)

    # NOTE(review): the original used an undefined name `libName`. It is
    # derived here from the peak file's base name (text before the first
    # '.') -- confirm that this is the intended prefix.
    libName = opt.peakfile.split('/')[-1].split('.')[0]
    with open(libName + "_PA_Peaks_associated_with_Annotations.pkl",
              'wb') as output:
        pickle.dump(peaks_on_entrez_3UTRs, output)

    # NOTE(review): the original passed an undefined name
    # `final_entrez_id_collection`; the last filtered ID collection is
    # assumed to be what was meant -- confirm against Calculate3UTRUsage.
    Calculate3UTRUsage(peaks_on_entrez_3UTRs,
                       entrez_ids_with_intronless_3UTRs, opt.outfile)
Exemple #2
0
def main(argv):
    """Run Calculate3UTRUsage per strand on genes with a unique cdsEnd.

    Steps, in the order performed below: parse the command line, load the
    pickled Entrez annotation, keep the Entrez IDs returned by
    get_ids_with_unique_cdsEnd, print the read totals of both read files,
    (re)create the output file with its header line, then for the forward
    and the reverse strand build the strand-specific gene subset and call
    Calculate3UTRUsage with the matching read file.

    Args:
        argv: command-line tokens handed to OptionParser.parse_args. Every
            option declared below must be supplied.

    Exits with status 1 after printing the help text when an option is
    missing, or after printing a message when the species is not listed in
    GenomeData.species_chroms.
    """
    parser = OptionParser()
    parser.add_option("-f", "--forwardreadfile", action="store",
                      type="string", dest="ReadsOnForwardStrand",
                      metavar="<file>",
                      help="input bed file for RNASeq raw reads on forward strand")
    parser.add_option("-r", "--reversereadfile", action="store",
                      type="string", dest="ReadsOnReverseStrand",
                      metavar="<file>",
                      help="input bed file for RNASeq raw reads on reverse strand")
    parser.add_option("-u", "--entrez_genes_file", action="store",
                      type="string", dest="entrez_genes", metavar="<file>",
                      help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>", help="outfile name")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", metavar="<str>",
                      help="species, mm8, hg18, etc")
    parser.add_option("-p", "--PAfile", action="store", type="string",
                      dest="PAfile", metavar="<file>",
                      help="input bed3 file")
    # The option is parsed as an int, so the placeholder says <int>
    # (the original advertised <float>, which optparse would reject).
    parser.add_option("-e", "--extension", action="store", type="int",
                      dest="extension", metavar="<int>",
                      help="integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end")

    (opt, args) = parser.parse_args(argv)

    # Check the parsed values rather than the token count, so a missing
    # option cannot slip through as None.
    required = ("ReadsOnForwardStrand", "ReadsOnReverseStrand",
                "entrez_genes", "outfile", "species", "PAfile", "extension")
    if any(getattr(opt, dest) is None for dest in required):
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    allowance = 10

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    # entrez_gene_collection is a KnownEntrezGenes class object. Its core,
    # entrez_gene_collection.entrez_genes, is a dict (keyed by entrez_id) of
    # lists of EntrezGene objects.
    # NOTE: pickle.load can execute code embedded in the file; only use
    # annotation files from a trusted source.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # Debug switch: set test to 1 to inspect the structure of one gene.
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 79947
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter clusters of refseq_ids (keyed by entrez_id) according to the
    # criterion of identical cdsEnd.
    entrez_ids_with_unique_cdsEnd = (
        entrez_gene_collection.get_ids_with_unique_cdsEnd())
    # Double spaces reproduce the output of the original print statement.
    print("There are  %d  Entrez IDs each of which has a unique cdsEnd."
          % len(entrez_ids_with_unique_cdsEnd))

    # Report the total read count of each strand's read file.
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    print("%s %s" % (totalcount_F, totalcount_R))

    # Clear the output file and write the header line for RUD output.
    # (For polyA-site output the columns would instead be
    # Entrez ID, Chrom, Strand, UTRstart, PolyAsites.)
    header_columns = ["# Entrez ID", "Chrom", "Strand", "Basic_RUD",
                      "List_of_subRUDs"]
    with open(opt.outfile, 'w') as outf:
        outf.write("\t".join(header_columns) + "\n")

    # index: column in bed file for sorting
    index = 2

    # Forward-strand genes only see forward reads, reverse-strand genes only
    # reverse reads; the forward pass runs first, as before.
    strand_passes = (
        ("+", "forward", opt.ReadsOnForwardStrand),
        ("-", "reverse", opt.ReadsOnReverseStrand),
    )
    for strand, label, read_file in strand_passes:
        print("Process genes on %s strand" % label)
        strand_ids = entrez_gene_collection.get_strand_specific_ids(
            strand, entrez_ids_with_unique_cdsEnd)
        print("There are  %d  Entrez IDs on %s strand."
              % (len(strand_ids), label))
        entrez_gene_subset = Entrez.KnownEntrezGenes(
            chroms, entrez_gene_collection.subset(strand_ids))

        Calculate3UTRUsage(entrez_gene_subset, read_file, chroms,
                           opt.outfile, allowance, opt.PAfile, opt.extension,
                           index)

    print("it took %s seconds." % (time.time() - startTime))
Exemple #3
0
def main(argv):
    """Tabulate non-strand-specific read counts on Entrez gene structures.

    Steps, in the order performed below: parse the command line, load the
    pickled Entrez annotation, count the reads (whole read file if it
    exists, otherwise one file per chromosome), call
    calculate_non_strandspecific_rc_on_ExonIntrons, write one tab-separated
    summary row per Entrez ID to the output file, and pickle the four
    returned objects next to it (suffixes _shared_exons.pkl,
    _shared_introns.pkl, _merged_transcripts.pkl, _summary.pkl).

    Args:
        argv: command-line tokens handed to OptionParser.parse_args. Every
            option declared below must be supplied.

    Exits with status 1 after printing the help text when an option is
    missing, or after printing a message when the species is not listed in
    GenomeData.species_chroms.
    """
    parser = OptionParser()
    parser.add_option("-r", "--readfile", action="store", type="string",
                      dest="Reads", metavar="<file>",
                      help="input bed file for non-strand specific raw reads")
    parser.add_option("-g", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps")
    parser.add_option("-u", "--entrez_genes_file", action="store",
                      type="string", dest="entrez_genes", metavar="<file>",
                      help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", metavar="<str>",
                      help="species, mm8, hg18, etc")

    (opt, args) = parser.parse_args(argv)

    # Check the parsed values rather than the token count, so a missing
    # option cannot slip through as None.
    required = ("Reads", "fragment_size", "entrez_genes", "outfile",
                "species")
    if any(getattr(opt, dest) is None for dest in required):
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    # entrez_gene_collection is a KnownEntrezGenes class object. Its core,
    # entrez_gene_collection.entrez_genes, is indexed by entrez_id.
    # NOTE: pickle.load can execute code embedded in the file; only use
    # annotation files from a trusted source.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # Debug switch: set test to 1 to inspect the structure of one gene.
    test = 0
    if test == 1:
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Name of the per-chromosome read files: "<chrom>-<lib>.<suffix>1",
    # where <lib> is the read file's base name up to its first '.' and
    # <suffix> is the text after its last '.'.
    reads_basename = opt.Reads.split('/')[-1]
    reads_suffix = reads_basename.split('.')[-1]
    reads_libname = reads_basename.split('.')[0]
    per_chrom_extension = "-" + reads_libname + '.' + reads_suffix + "1"

    # If the whole-genome read file exists use it; otherwise fall back to
    # the chromosome-separated files.
    # NOTE(review): totalcount is not passed to anything below; the counting
    # is kept so the file reads and the per-chromosome report still happen.
    totalcount = 0
    if Utility_extended.fileExists(opt.Reads) == 1:
        totalcount = get_total_tag_counts.get_total_tag_counts(opt.Reads)
    else:
        for chrom in chroms:
            chrom_count = get_total_tag_counts.get_total_tag_counts(
                chrom + per_chrom_extension)
            print("%s %s" % (chrom, chrom_count))
            totalcount += chrom_count

    (reads_on_shared_exons, reads_on_shared_introns,
     reads_on_merged_transcripts,
     summary) = calculate_non_strandspecific_rc_on_ExonIntrons(
         entrez_gene_collection, opt.Reads, chroms, opt.fragment_size)

    # Keys of summary[entrez_id], in output-column order.
    summary_keys = (
        "merged_exons_rc",
        "merged_exons_total_length",
        "merged_exon_RPKM",
        "shared_exons_rc",
        "shared_exons_total_length",
        "shared_exon_RPKM",
        "shared_introns_rc",
        "shared_introns_total_length",
        "shared_intron_RPKM",
        "merged_transcript_rc",
        "merged_transcript_length",
        "merged_transcript_RPKM",
    )

    # Clear the file, then write the header and one row per Entrez ID.
    with open(opt.outfile, 'w') as outf:
        outf.write("# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n")
        for entrez_id in entrez_gene_collection.entrez_ids:
            gene = entrez_gene_collection.entrez_genes[entrez_id]

            # Distinct first additional annotations, in transcript order.
            gene_symbol = []
            for transcript in gene.transcripts:
                symbol = transcript.additional_annotations[0]
                if symbol not in gene_symbol:
                    gene_symbol.append(symbol)

            gene_summary = summary[entrez_id]
            fields = [str(entrez_id)]
            fields.extend(str(gene_summary[key]) for key in summary_keys)
            fields.append(
                ','.join([transcript.name for transcript in gene.transcripts]))
            fields.append(','.join(gene_symbol))
            outf.write('\t'.join(fields) + '\n')

    # Store each result in its own pickle file next to the output file.
    # The shared exon and intron objects are {entrezID:[((start, end), read_count)]}.
    pickle_outputs = (
        ("_shared_exons.pkl", reads_on_shared_exons),
        ("_shared_introns.pkl", reads_on_shared_introns),
        ("_merged_transcripts.pkl", reads_on_merged_transcripts),
        ("_summary.pkl", summary),
    )
    for suffix, payload in pickle_outputs:
        with open(opt.outfile + suffix, 'wb') as output:
            pickle.dump(payload, output)

    print("it took %s seconds." % (time.time() - startTime))
def main(argv):
    """Run Calculate3UTRUsage per strand using fragment-size-shifted reads.

    Steps, in the order performed below: parse the command line, load the
    pickled Entrez annotation, keep the Entrez IDs returned by
    get_ids_with_unique_cdsEnd, print the read totals of both read files,
    (re)create the output file with its header line, then for the forward
    and the reverse strand build the strand-specific gene subset and call
    Calculate3UTRUsage with the matching read file.

    Args:
        argv: command-line tokens handed to OptionParser.parse_args. Every
            option declared below must be supplied.

    Exits with status 1 after printing the help text when an option is
    missing, or after printing a message when the species is not listed in
    GenomeData.species_chroms.
    """
    parser = OptionParser()
    parser.add_option("-f", "--forwardreadfile", action="store",
                      type="string", dest="ReadsOnForwardStrand",
                      metavar="<file>",
                      help="input bed file for RNASeq raw reads on forward strand")
    parser.add_option("-r", "--reversereadfile", action="store",
                      type="string", dest="ReadsOnReverseStrand",
                      metavar="<file>",
                      help="input bed file for RNASeq raw reads on reverse strand")
    parser.add_option("-u", "--entrez_genes_file", action="store",
                      type="string", dest="entrez_genes", metavar="<file>",
                      help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-g", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>", help="outfile name")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", metavar="<str>",
                      help="species, mm8, hg18, etc")
    parser.add_option("-d", "--3UTRdownstreamextension", action="store",
                      type="int", dest="downstream_extension",
                      metavar="<int>", help="3UTR down stream extension")

    (opt, args) = parser.parse_args(argv)

    # Check the parsed values rather than the token count, so a missing
    # option cannot slip through as None.
    required = ("ReadsOnForwardStrand", "ReadsOnReverseStrand",
                "entrez_genes", "fragment_size", "outfile", "species",
                "downstream_extension")
    if any(getattr(opt, dest) is None for dest in required):
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    # entrez_gene_collection is a KnownEntrezGenes class object. Its core,
    # entrez_gene_collection.entrez_genes, is a dict (keyed by entrez_id) of
    # lists of EntrezGene objects.
    # NOTE: pickle.load can execute code embedded in the file; only use
    # annotation files from a trusted source.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # Debug switch: set test to 1 to inspect the structure of one gene.
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter clusters of refseq_ids (keyed by entrez_id) according to the
    # criterion of identical cdsEnd.
    entrez_ids_with_unique_cdsEnd = (
        entrez_gene_collection.get_ids_with_unique_cdsEnd())
    # Double spaces reproduce the output of the original print statement.
    print("There are  %d  Entrez IDs each of which has a unique cdsEnd."
          % len(entrez_ids_with_unique_cdsEnd))

    # Report the total read count of each strand's read file.
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    print("%s %s" % (totalcount_F, totalcount_R))

    # Clear the output file and write the header line.
    # (An earlier header also listed Main Refseq ID, Length Index and
    # PA Multiplicity Index columns.)
    with open(opt.outfile, 'w') as outf:
        outf.write("# Entrez ID \t 3UTR Union length \t RUD \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n")

    # index: column in bed file for sorting
    index = 2

    # Forward-strand genes only see forward reads, reverse-strand genes only
    # reverse reads; the forward pass runs first, as before.
    strand_passes = (
        ("+", "forward", opt.ReadsOnForwardStrand),
        ("-", "reverse", opt.ReadsOnReverseStrand),
    )
    for strand, label, read_file in strand_passes:
        print("Process genes on %s strand" % label)
        strand_ids = entrez_gene_collection.get_strand_specific_ids(
            strand, entrez_ids_with_unique_cdsEnd)
        print("There are  %d  Entrez IDs on %s strand."
              % (len(strand_ids), label))
        entrez_gene_subset = Entrez.KnownEntrezGenes(
            chroms, entrez_gene_collection.subset(strand_ids))

        Calculate3UTRUsage(entrez_gene_subset, read_file, index, chroms,
                           opt.fragment_size, opt.downstream_extension,
                           opt.outfile)

    print("it took %s seconds." % (time.time() - startTime))
Exemple #5
0
def main(argv):
    """Compute strand-specific read counts on Entrez gene structures.

    Steps, in the order performed below: parse the command line, load the
    pickled Entrez annotation, total the reads of both read files,
    (re)create the output file with its header line, call
    calculateExonIntrons once for forward-strand genes with the forward
    reads and once for reverse-strand genes with the reverse reads, merge
    the two sets of result dictionaries, and pickle the merged results next
    to the output file (suffixes _shared_exons.pkl, _shared_introns.pkl,
    _merged_transcripts.pkl, _summary.pkl).

    Args:
        argv: command-line tokens handed to OptionParser.parse_args. Every
            option declared below must be supplied.

    Exits with status 1 after printing the help text when an option is
    missing, or after printing a message when the species is not listed in
    GenomeData.species_chroms.
    """
    parser = OptionParser()
    parser.add_option("-f", "--forwardreadfile", action="store",
                      type="string", dest="ReadsOnForwardStrand",
                      metavar="<file>",
                      help="input bed file for RNASeq raw reads on forward strand")
    parser.add_option("-r", "--reversereadfile", action="store",
                      type="string", dest="ReadsOnReverseStrand",
                      metavar="<file>",
                      help="input bed file for RNASeq raw reads on reverse strand")
    parser.add_option("-g", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps")
    parser.add_option("-u", "--entrez_genes_file", action="store",
                      type="string", dest="entrez_genes", metavar="<file>",
                      help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", metavar="<str>",
                      help="species, mm8, hg18, etc")

    (opt, args) = parser.parse_args(argv)

    # Check the parsed values rather than the token count, so a missing
    # option cannot slip through as None.
    required = ("ReadsOnForwardStrand", "ReadsOnReverseStrand",
                "fragment_size", "entrez_genes", "outfile", "species")
    if any(getattr(opt, dest) is None for dest in required):
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    # The column numbers are 1 based instead of 0 based!
    start_index_P = 2  # for positive strand
    start_index_N = 3  # for negative strand

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    # entrez_gene_collection is a KnownEntrezGenes class object. Its core,
    # entrez_gene_collection.entrez_genes, is a dict (keyed by entrez_id) of
    # lists of EntrezGene objects.
    # NOTE: pickle.load can execute code embedded in the file; only use
    # annotation files from a trusted source.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # Debug switch: set test to 1 to inspect one gene here and in the
    # distribution checks further down.
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print("%s %s" % (totalcount_F, totalcount_R))

    # Clear the file and write the header line.
    with open(opt.outfile, 'w') as outf:
        outf.write("# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n")

    # The RNA seq data are strand specific. Only use + reads on genes on
    # forward strand, and - reads on genes on reverse strand.
    strand_passes = (
        ("+", "forward", opt.ReadsOnForwardStrand, start_index_P),
        ("-", "reverse", opt.ReadsOnReverseStrand, start_index_N),
    )
    strand_results = []
    for strand, label, read_file, start_index in strand_passes:
        print("Process genes on %s strand" % label)
        strand_ids = entrez_gene_collection.get_strand_specific_ids(strand)
        # Double spaces reproduce the output of the original print statement.
        print("There are  %d  Entrez IDs on %s strand."
              % (len(strand_ids), label))
        entrez_gene_subset = Entrez.KnownEntrezGenes(
            chroms, entrez_gene_collection.subset(strand_ids))
        strand_results.append(
            calculateExonIntrons(entrez_gene_subset, read_file, start_index,
                                 chroms, opt.fragment_size, totalcount,
                                 opt.outfile))

    # Each call returns (shared exons, shared introns, merged transcripts,
    # summary); unpacking enforces that shape as the original did.
    (forward_reads_on_shared_exons, forward_reads_on_shared_introns,
     forward_reads_on_merged_transcripts,
     forward_summary) = strand_results[0]
    (reverse_reads_on_shared_exons, reverse_reads_on_shared_introns,
     reverse_reads_on_merged_transcripts,
     reverse_summary) = strand_results[1]

    # Combine the per-strand dictionaries; reverse entries are applied last.
    # The shared exon and intron dictionaries are
    # {entrezID:[((start, end), read_count)]}.
    reads_on_shared_exons = dict(forward_reads_on_shared_exons)
    reads_on_shared_exons.update(reverse_reads_on_shared_exons)
    reads_on_shared_introns = dict(forward_reads_on_shared_introns)
    reads_on_shared_introns.update(reverse_reads_on_shared_introns)
    reads_on_merged_transcripts = dict(forward_reads_on_merged_transcripts)
    reads_on_merged_transcripts.update(reverse_reads_on_merged_transcripts)
    summary = dict(forward_summary)
    summary.update(reverse_summary)

    # Store each merged result in its own pickle file. The third field marks
    # the results that the debug switch inspects right after writing.
    pickle_outputs = (
        ("_shared_exons.pkl", reads_on_shared_exons, True),
        ("_shared_introns.pkl", reads_on_shared_introns, True),
        ("_merged_transcripts.pkl", reads_on_merged_transcripts, False),
        ("_summary.pkl", summary, False),
    )
    for suffix, payload, inspect_in_test in pickle_outputs:
        with open(opt.outfile + suffix, 'wb') as output:
            pickle.dump(payload, output)
        if test == 1 and inspect_in_test:
            test_distribution_dic(payload, test_id)

    print("it took %s seconds." % (time.time() - startTime))
def main(argv):
    """Run Calculate3UTRUsage per strand on genes with a unique cdsEnd.

    Steps, in the order performed below: parse the command line, load the
    pickled Entrez annotation, keep the Entrez IDs returned by
    get_ids_with_unique_cdsEnd, print the read totals of both read files,
    (re)create the output file with its header line, then for the forward
    and the reverse strand build the strand-specific gene subset and call
    Calculate3UTRUsage with the matching read file.

    Args:
        argv: command-line tokens handed to OptionParser.parse_args. Every
            option declared below must be supplied.

    Exits with status 1 after printing the help text when an option is
    missing, or after printing a message when the species is not listed in
    GenomeData.species_chroms.
    """
    # One row per option: (short flag, long flag, type, dest, metavar, help).
    # The extension option is parsed as an int, so its placeholder says <int>
    # (the original advertised <float>, which optparse would reject).
    option_table = (
        ("-f", "--forwardreadfile", "string", "ReadsOnForwardStrand", "<file>",
         "input bed file for RNASeq raw reads on forward strand"),
        ("-r", "--reversereadfile", "string", "ReadsOnReverseStrand", "<file>",
         "input bed file for RNASeq raw reads on reverse strand"),
        ("-u", "--entrez_genes_file", "string", "entrez_genes", "<file>",
         "file with curated known genes clustered by entrez ID in pickle format"),
        ("-o", "--outfile", "string", "outfile", "<file>",
         "outfile name"),
        ("-s", "--species", "string", "species", "<str>",
         "species, mm8, hg18, etc"),
        ("-p", "--PAfile", "string", "PAfile", "<file>",
         "input bed3 file"),
        ("-e", "--extension", "int", "extension", "<int>",
         "integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end"),
    )

    parser = OptionParser()
    for short_flag, long_flag, value_type, dest, metavar, help_text in option_table:
        parser.add_option(short_flag, long_flag, action="store",
                          type=value_type, dest=dest, metavar=metavar,
                          help=help_text)

    (opt, args) = parser.parse_args(argv)

    # Check the parsed values rather than the token count, so a missing
    # option cannot slip through as None.
    if any(getattr(opt, row[3]) is None for row in option_table):
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    allowance = 10

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    # entrez_gene_collection is a KnownEntrezGenes class object. Its core,
    # entrez_gene_collection.entrez_genes, is a dict (keyed by entrez_id) of
    # lists of EntrezGene objects.
    # NOTE: pickle.load can execute code embedded in the file; only use
    # annotation files from a trusted source.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # Debug switch: set test to 1 to inspect the structure of one gene.
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 79947
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter clusters of refseq_ids (keyed by entrez_id) according to the
    # criterion of identical cdsEnd.
    entrez_ids_with_unique_cdsEnd = (
        entrez_gene_collection.get_ids_with_unique_cdsEnd())
    # Double spaces reproduce the output of the original print statement.
    print("There are  %d  Entrez IDs each of which has a unique cdsEnd."
          % len(entrez_ids_with_unique_cdsEnd))

    # Report the total read count of each strand's read file.
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    print("%s %s" % (totalcount_F, totalcount_R))

    # Clear the output file and write the header line for RUD output.
    # (For polyA-site output the columns would instead be
    # Entrez ID, Chrom, Strand, UTRstart, PolyAsites.)
    with open(opt.outfile, 'w') as outf:
        outf.write("# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t"
                   + "Basic_RUD" + "\t" + "List_of_subRUDs" + "\n")

    # index: column in bed file for sorting
    index = 2

    # Forward-strand genes only see forward reads, reverse-strand genes only
    # reverse reads; the forward pass runs first, as before.
    for strand, label, read_file in (
            ("+", "forward", opt.ReadsOnForwardStrand),
            ("-", "reverse", opt.ReadsOnReverseStrand)):
        print("Process genes on %s strand" % label)
        strand_ids = entrez_gene_collection.get_strand_specific_ids(
            strand, entrez_ids_with_unique_cdsEnd)
        print("There are  %d  Entrez IDs on %s strand."
              % (len(strand_ids), label))
        entrez_gene_subset = Entrez.KnownEntrezGenes(
            chroms, entrez_gene_collection.subset(strand_ids))

        Calculate3UTRUsage(entrez_gene_subset, read_file, chroms,
                           opt.outfile, allowance, opt.PAfile, opt.extension,
                           index)

    print("it took %s seconds." % (time.time() - startTime))