Example #1
0
def genbank_entries_from_accession(accessions,
                                   read_out=False,
                                   email='*****@*****.**'):
    """Upload accessions to NCBI and collect the GI number of each record.

    The accessions are posted to the ``nucleotide`` database with ``epost``
    and the matching records are fetched as XML through the returned
    history session (``WebEnv`` / ``QueryKey``).

    :param accessions: iterable of accession identifiers; each is converted
        with ``str`` before being joined into one comma-separated id string.
    :param read_out: when exactly ``True``, print a short summary (GI,
        primary accession, definition, first 15 bases) for every record.
    :param email: contact address assigned to ``Entrez.email``.
    :return: list of integer GI numbers, one per record that exposes a GI.
        Records without a usable GI are skipped (and shown as ``None`` in
        the read-out).
    """
    # NOTE(review): bare ``import Entrez`` is kept from the original; if this
    # is meant to be Biopython it would normally be ``from Bio import Entrez``.
    import Entrez

    entries = []
    Entrez.email = email

    request = Entrez.epost('nucleotide', id=','.join(map(str, accessions)))
    result = Entrez.read(request)

    handle = Entrez.efetch(db='nucleotide',
                           retmode='xml',
                           webenv=result['WebEnv'],
                           query_key=result['QueryKey'])

    for r in Entrez.parse(handle):
        gi = None
        try:
            entry = [x for x in r['GBSeq_other-seqids'] if 'gi' in x][0]
            gi = int(entry.split('|')[1])
        except (IndexError, KeyError, ValueError):
            # No "gi|<number>" seqid on this record: the lookup raises
            # IndexError (no match / no '|'), KeyError (no seqid list) or
            # ValueError (non-numeric). Treat all of them as "no GI".
            gi = None
        else:
            entries.append(gi)

        if read_out is True:
            print(">GI {0} {1} {2}\n{3}".format(gi,
                                                r['GBSeq_primary-accession'],
                                                r['GBSeq_definition'],
                                                r['GBSeq_sequence'][:15]))
    return entries
Example #2
0
def erratum_check(PMID, comments):
    """Warn the user when the PubMed record for ``PMID`` lists an erratum.

    Erratum formatting is not automated; instead a message is appended to
    ``comments`` for every correction whose ``RefType`` is ``'ErratumIn'``.
    The first message includes formatting instructions, later ones are
    shortened.

    :param PMID: PubMed ID passed to ``Entrez.efetch``.
    :param comments: list of user-facing messages; extended in place.
    :return: the same ``comments`` list.
    """

    Entrez.email = app.config['EMAIL']
    handle = Entrez.efetch(db="pubmed", id=PMID, rettype="gb", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    erratum_count = 0
    try:
        corrections = records['PubmedArticle'][0]['MedlineCitation'][
            'CommentsCorrectionsList']
        for correction in corrections:
            if correction.attributes['RefType'] != 'ErratumIn':
                continue
            if erratum_count == 0:
                comments.append(
                    "I smell an erratum: " + correction['RefSource'] +
                    ". \nAdd it to the end of your citation: [Erratum in Journal, Issue(Volume): page. DOI: #. Accessed date.]"
                )
            else:
                # Second (or later) erratum for this article: shorter report.
                # A separating space was missing before the source name.
                comments.append("There's another erratum! What a mess. " +
                                correction['RefSource'])
            erratum_count += 1
    except (KeyError, IndexError, AttributeError, TypeError):
        # Record has no corrections list (or an unexpected shape): there is
        # nothing to report, which is the normal case for most articles.
        pass

    return comments
    def ncbi_search(self, database, term):
        """
        Submit search to NCBI and return the records.

        The search is issued through ``Entrez.esearch`` with history enabled
        and a very large ``retmax``. The open handle and the parsed result
        are also stored on ``self.handle`` and ``self.record``.

        :param database: NCBI database name passed as ``db``.
        :param term: search term.
        :return: the parsed search record.
        """
        self.handle = Entrez.esearch(db=database, term=term, usehistory="y",
                                     retmax=100000000)
        try:
            self.record = Entrez.read(self.handle)
        finally:
            # Close the connection even if the response cannot be parsed.
            self.handle.close()

        return self.record
Example #4
0
    def ncbi_search(self, database, term):
        """
        Submit search to NCBI and return the records.

        The search is issued through ``Entrez.esearch`` with history enabled,
        ``retmax=10`` and ``idtype="acc"``. The open handle and the parsed
        result are also stored on ``self.handle`` and ``self.record``.

        :param database: NCBI database name passed as ``db``.
        :param term: search term.
        :return: the parsed search record.
        """
        self.handle = Entrez.esearch(db=database, term=term, usehistory="y",
                                     retmax=10, idtype="acc")
        try:
            self.record = Entrez.read(self.handle)
        finally:
            # Close the connection even if the response cannot be parsed.
            self.handle.close()

        return self.record
Example #5
0
def PMID_to_formatted_citation(PMID, comments):
    """
    Take a PMID, retrieve the PubMed data, and format it based on our style guide.

    :param PMID: PubMed ID
    :param comments: list of user-facing messages; extended in place by this
        function and by the formatting helpers it calls.
    :return: ``(citation, comments)``. ``citation`` is ``''`` when the
        summary could not be fetched.
    """
    Entrez.email = app.config['EMAIL']

    try:
        handle = Entrez.esummary(db="pubmed", id=PMID)
        try:
            record = Entrez.read(handle)[0]
        finally:
            # The record is fully parsed here, so the connection can be
            # released right away, including on a parse error.
            handle.close()
    except IOError:
        comments.append(
            "Is there a network problem? Unleash me please!")  # Network error
        return '', comments
    except Exception:
        comments.append("I can't fetch an article with that ID."
                        )  # DOI not on PubMed? Or Bad PMID?
        comments.append(
            "Try looking up the article on www.pubmed.gov. (Note: PubMed may not have the DOI.) If the article is there, copy its PMID and bring it to me. If the article isn't on PubMed, I can't fetch a citation for you. Sorry. :( "
        )
        return '', comments

    author_names = authors.format_authors(record)

    try:
        year = record['PubDate'][0:4]
    except (KeyError, TypeError):
        comments.append("What year is it?")
        year = '20??'

    title, comments = article_title.format_title(record, PMID, comments)
    journal, comments = journal_name.format_journal(record, comments)
    vol_iss, comments = volume_issue.format_volume_issue(record, comments)
    pages, comments = page_numbers.format_pages(record, comments)
    DOI_value, comments = DOI.format_DOI(record, comments)
    accessed = accessed_date.format_accessed_date()

    tail = journal + vol_iss + pages + '. ' + DOI_value + accessed

    if author_names == '':
        # When there's no author: Title (Year)...
        # endswith() is safe on an empty title, unlike title[-1].
        if title.endswith('.'):
            title = title[:-1]
        citation = title + ' (' + year + '). ' + tail
    else:
        # Otherwise: Authors (Year). Title...
        citation = author_names + ' (' + year + '). ' + title + ' ' + tail

    comments = errata.erratum_check(PMID, comments)
    return citation, comments
Example #6
0
def main(argv):
	"""Assign PA peaks to intronless 3'UTRs of Entrez genes and compute usage.

	Parses the command line, loads the pickled Entrez gene annotation,
	keeps the Entrez IDs with a unique cdsEnd and an intronless 3'UTR,
	assigns peaks to them, pickles the assignment and calls
	``Calculate3UTRUsage``.

	:param argv: argument list handed to ``OptionParser.parse_args``; the
		help text is printed and the program exits when it has fewer than
		12 items.
	"""
	parser = OptionParser()
	parser.add_option("-p", "--peakfile", action="store", type="string", dest="peakfile", help="input ucsc file for PA peaks ", metavar="<file>")
	parser.add_option("-u", "--annotationfile", action="store", type="string", dest="annotationfile", help="pickle file for annotations ", metavar="<file>")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>")
	parser.add_option("-s", "--species", action="store", type="string", dest="species",help="species, mm8, hg18, etc", metavar="<str>")
	parser.add_option("-t", "--peak_threshold", action="store", type="int", dest="peak_threshold",help="Peak threshold", metavar="<int>")
	parser.add_option("-d", "--3UTRdownstreamextension", action="store", type="int", dest="downstream_extension",help="3UTR down stream extension", metavar="<int>")

	(opt, args) = parser.parse_args(argv)
	if len(argv) < 12:
		parser.print_help()
		sys.exit(1)

	if opt.species in GenomeData.species_chroms:
		chroms = GenomeData.species_chroms[opt.species]
		chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
	else:
		print("This species is not recognized, exiting")
		sys.exit(1)

	# entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes, a dic (keyed by entrez_id) of lists of EntrezGene object
	# The -u option stores into ``annotationfile``; ``opt.entrez_genes`` never existed.
	# NOTE: pickle.load can execute arbitrary code; only load trusted annotation files.
	with open(opt.annotationfile, 'rb') as annotation:
		entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation))

	# test module (manual debugging switch, off by default)
	test = 0
	if test == 1:
		print("Testing gene structure")
		test_id = 54
		Entrez.test_gene_structure(entrez_gene_collection, test_id)

	# Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd
	entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_unique_cdsEnd()
	print("There are  %d  Entrez IDs each of which has a unique cdsEnd." % len(entrez_ids_with_unique_cdsEnd))

	# Additional filter to remove clusters with intron-containing 3UTRs
	allowance = 0
	ids = entrez_ids_with_unique_cdsEnd
	entrez_ids_with_intronless_3UTRs = entrez_gene_collection.get_ids_with_intronless_3UTR(allowance, ids)
	# The original line had a stray comma before the % operator (syntax error).
	print("There are %d Entrez_ids with additional requirement of intronless 3UTR: " % len(entrez_ids_with_intronless_3UTRs))

	entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_with_intronless_3UTRs))

	peaks_on_entrez_3UTRs = AssignPeaksToEntrez3UTRs(entrez_gene_subset, opt.peakfile, chroms, chrom_lengths, opt.peak_threshold, opt.downstream_extension)

	# NOTE(review): ``libName`` is not defined in this function; unless it is a
	# module-level name this raises NameError. Confirm the intended prefix.
	with open(libName + "_PA_Peaks_associated_with_Annotations.pkl", 'wb') as output:
		pickle.dump(peaks_on_entrez_3UTRs, output)

	# NOTE(review): ``final_entrez_id_collection`` is not defined in this
	# function either; confirm the intended argument.
	Calculate3UTRUsage(peaks_on_entrez_3UTRs, final_entrez_id_collection, opt.outfile)
Example #7
0
def validate_and_convert_DOI_or_PMID_to_PMID(lookupID, comments):
    """
    Look up a DOI -- or PMID -- and return a PMID.

    :param lookupID: either a DOI or a PMID.
    :param comments: list of user-facing messages; extended in place when
        the lookup fails or is ambiguous.
    :return: ``(PMID, comments)``; the PMID is ``''`` unless the PubMed
        search returns exactly one hit.
    """

    # Format as string and strip any leading white spaces. Do now so we can reach DOI/PMIDs.
    lookupID = str(lookupID).lstrip()

    # Remove any prefacing text that might've come through. Only if it's at
    # the start of the lookupID: slice the prefix off instead of substituting,
    # which would also delete the same text further inside the ID.
    for tag in ('DOI:', 'doi:', 'PMID:', 'pmid:'):
        if lookupID.startswith(tag):
            lookupID = lookupID[len(tag):]

    # Drop any white spaces that remain.
    lookupID = lookupID.replace(" ", "")

    try:
        Entrez.email = app.config['EMAIL']
        handle = Entrez.esearch(db="pubmed", retmax=10, term=lookupID)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even when the response cannot be parsed.
            handle.close()

        hit_count = int(record['Count'])
        if hit_count == 0:
            comments.append("I can't fetch an article with that ID."
                            )  # DOI not on PubMed? Or Bad PMID?
            comments.append(
                "Try looking up the article on www.pubmed.gov. (Note: PubMed may not have the DOI.) If the article is there, copy its PMID and bring it to me. If the article isn't on PubMed, I can't fetch a citation for you. Sorry. :( "
            )
            return '', comments
        if hit_count > 1:
            comments.append(
                'I found more than one article. Are there characters missing from the ID?'
            )
            return '', comments
        # Only 1 result, perfect!
        return (record['IdList'][0]), comments
    except IOError:  # Network error
        comments.append(
            "Is there a network problem? Unleash me please!")  # Network error
        return '', comments
    except Exception:
        # Anything else (bad ID, unexpected response) is reported to the user
        # rather than raised; KeyboardInterrupt/SystemExit still propagate.
        comments.append("I can't fetch an article with ID " + lookupID +
                        '. Can you double check it?')  # Bad PMID?
        comments.append(
            "Try looking up the article on www.pubmed.gov. (Note: PubMed may not have the DOI.) If the article is there, copy its PMID and bring it to me. If the article isn't on PubMed, I can't fetch a citation for you. Sorry. :( "
        )
        return '', comments
def main(argv):
    """Write the UCSC entries of the transcripts that belong to given Entrez IDs.

    Reads Entrez IDs (first tab-separated column of each non-comment line)
    from the ``-i`` file, looks up their transcript IDs in the pickled
    Entrez gene collection and asks ``gene_set_manipulation`` to write the
    matching subset of the ``-r`` UCSC file to the ``-o`` file.

    :param argv: argument list handed to ``OptionParser.parse_args``; the
        help text is printed and the program exits when it has fewer than
        8 items.
    """
    parser = OptionParser()
    parser.add_option(
        "-r",
        "--refseqfile",
        action="store",
        type="string",
        dest="refseq_ucsc_file",
        help=
        "input ucsc file for annotated genes, eg,  refFlat_hg19_EntrezID.ucsc",
        metavar="<file>")
    parser.add_option("-i",
                      "--entrezIDfile",
                      action="store",
                      type="string",
                      dest="entrez_ids_file",
                      help="file for entrez ids",
                      metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    # The long name used to be "--refseqfile" again, which collides with -r
    # and makes optparse raise OptionConflictError before anything is parsed.
    # The short option -o is unchanged.
    parser.add_option(
        "-o",
        "--refseq_subset_file",
        action="store",
        type="string",
        dest="refseq_subset_file",
        help="ucsc file for refseq transcripts belonging to those entrez_ids",
        metavar="<file>")
    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    # NOTE(review): ``chroms`` is not defined in this function (there is no
    # species option here); unless it is a module-level name this raises
    # NameError. Confirm where it should come from.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # ``comment`` is a pattern object defined outside this function.
    with open(opt.entrez_ids_file, 'r') as f:
        entrez_ids = [
            line.strip().split('\t')[0] for line in f
            if not comment.match(line)
        ]

    refseq_ids = entrez_gene_collection.get_transcript_ids(entrez_ids)
    gene_set_manipulation.output_UCSCsubset_in_file(opt.refseq_ucsc_file,
                                                    refseq_ids,
                                                    opt.refseq_subset_file)
Example #9
0
    def record_processor(self, record, database):
        """
        Unpack an Entrez search record and fetch every matching accession.

        Accessions are downloaded through the history session in batches of
        10000. When the search matched nothing, the program exits (when
        ``self.gui == 0``) or the ``no_match`` signal is emitted and ``None``
        is returned.

        :param record: mapping with "Count", "WebEnv" and "QueryKey".
        :param database: NCBI database name passed to ``Entrez.efetch``.
        :return: ``(count, IDs, webenv, query_key)`` or ``None``.
        """
        total = int(record["Count"])
        session = record["WebEnv"]
        key = record["QueryKey"]

        batch_size = 10000
        accessions = []
        offset = 0
        while offset < total:
            batch_handle = Entrez.efetch(db=database, webenv=session,
                                         query_key=key, retmax=batch_size,
                                         rettype="acc", retstart=offset)
            # One accession per line; drop the trailing newline.
            accessions.extend([line.rstrip() for line in batch_handle])
            batch_handle.close()
            offset += batch_size

        assert total == len(accessions)

        if total != 0:
            return total, accessions, session, key

        # Nothing matched: abort on the command line, notify the GUI otherwise.
        if self.gui == 0:
            sys.exit("Your serch query returned no results!")

        self.no_match.emit("Your serch query returned no results!")
        return None
Example #10
0
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms,  fragment_size, totalcount, out_file):
	"""Collect per-gene read counts on exons/introns across all chromosomes.

	The bed file is split into per-chromosome files (sorted on
	``column_index``) when those do not exist yet, each chromosome is
	processed with ``calculateExonIntrons_by_chrom`` and the per-chromosome
	dictionaries are merged. The per-chromosome files are cleaned up at the
	end.

	:return: ``(reads_on_shared_exons, reads_on_shared_introns,
		reads_on_merged_transcripts, summary)``, each merged over all
		chromosomes in ``chroms``.
	"""
	lib_name = (bedfile).split('/')[-1] # remove directory
	suffix = lib_name.split('.')[-1] # txt
	lib_name = lib_name.split('.')[0]
	extension = "-" + lib_name +'.' + suffix +"1"
	if Utility_extended.fileExists(bedfile):
		if Utility_extended.chrom_files_exist(chroms, extension) != 1:
			# Separate by chrom and sort by start
			print("%s %s  files do not exist, separate by chroms. " % (chroms, extension))
			Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension, [column_index])
	else:
		print("%s  is not found" % (bedfile,))
		sys.exit(1)

	all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]}
	all_summary = {}

	for chrom in chroms:
		chrombed = chrom + extension
		entrez_genes_by_chrom =  Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
		(reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) =  calculateExonIntrons_by_chrom (entrez_genes_by_chrom, chrombed, fragment_size, totalcount, out_file)
		all_reads_on_shared_exons.update(reads_on_shared_exons)
		all_reads_on_shared_introns.update(reads_on_shared_introns)
		all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
		all_summary.update(summary)

	SeparateByChrom.cleanup(chroms, extension)
	# Return the merged summary: the per-chromosome ``summary`` only holds the
	# last chromosome and is unbound when ``chroms`` is empty.
	return (all_reads_on_shared_exons, all_reads_on_shared_introns, all_reads_on_merged_transcripts, all_summary)
Example #11
0
def retrieve_abstract(PMID):
    """Return the abstract of a PubMed article, or '' when it has none.

    The article is fetched in MEDLINE text format and parsed with
    ``Medline.read``; the abstract is the record's ``'AB'`` field.

    :param PMID: PubMed ID passed to ``Entrez.efetch``.
    :return: the abstract text, or an empty string.
    """
    Entrez.email = app.config['EMAIL']
    handle = Entrez.efetch(db="pubmed",rettype="medline", retmode="text", id=PMID)
    try:
        record = Medline.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    try:
        return record['AB']
    except (KeyError, TypeError):
        # No abstract field on this record.
        return ''
Example #12
0
def get_description(mail,ID):
	"""Used by xml write for searching the definition of the hit using the accession number from NCBI
	which appears in the rapsearch output.

	:param mail: contact address assigned to ``Entrez.email``.
	:param ID: protein identifier passed to ``Entrez.efetch``.
	:return: text of the second line of the GenBank entry from column 12
		onwards, with leading/trailing periods stripped.
	"""
	Entrez.email = mail
	handle = Entrez.efetch(db="protein", id=ID, rettype="gb", retmode="text")
	try:
		entry = handle.read().strip()
	finally:
		# Close the connection even if reading fails.
		handle.close()
	complete = entry.split("\n")
	# NOTE(review): assumes the definition is entirely on the second line of
	# the entry; a wrapped definition would be truncated.
	definition = complete[1][12:]
	return definition.strip(".")
Example #13
0
def get_description(mail,ID):
	"""Used by make description for searching the definition of the hit using the accession number from NCBI
	which appears in the annotation output.
	Adapted from the RapsearchToXml.py file.

	:param mail: contact address assigned to ``Entrez.email``.
	:param ID: protein identifier passed to ``Entrez.efetch``.
	:return: text of the second line of the GenBank entry from column 12
		onwards, cut at the first "[" (which drops the species information).
	"""
	Entrez.email = mail
	handle = Entrez.efetch(db="protein", id=ID, rettype="gb", retmode="text")
	try:
		entry = handle.read().strip()
	finally:
		# Close the connection even if reading fails.
		handle.close()
	complete = entry.split("\n")
	# NOTE(review): assumes the definition is entirely on the second line of
	# the entry; a wrapped definition would be truncated.
	definition = complete[1][12:]  # get the definition field
	return definition.split("[")[0]  # removes the species information
    def fetch_by_id(self, IDs, b_size):
        """
        Fetches NCBI data based on the IDs, rather than a search query.

        :param IDs: identifier(s) passed to ``Entrez.efetch`` as ``id``.
        :param b_size: batch size passed as ``retmax``.
        :return: the FASTA text read from the response.
        """
        id_handle = Entrez.efetch(db=self.database,
                                  id=IDs,
                                  rettype="fasta",
                                  retmode="text",
                                  retmax=b_size)
        try:
            return id_handle.read()
        finally:
            # Close the connection even if reading fails.
            id_handle.close()
Example #15
0
    def fetch_by_id(self, IDs, b_size):
        """
        Fetches NCBI data based on the IDs, rather than a search query.

        :param IDs: identifier(s) passed to ``Entrez.efetch`` as ``id``.
        :param b_size: batch size passed as ``retmax``.
        :return: the FASTA text read from the response.
        """
        id_handle = Entrez.efetch(db=self.database,
                                  id=IDs,
                                  rettype="fasta",
                                  retmode="text",
                                  retmax=b_size)
        try:
            return id_handle.read()
        finally:
            # Close the connection even if reading fails.
            id_handle.close()
    def fetch_by_history(self, start, b_size, webenv, query_key):
        """
        Fetches NCBI data for a previous search through its history session.

        :param start: index of the first record, passed as ``retstart``.
        :param b_size: batch size passed as ``retmax``.
        :param webenv: history session identifier of the search.
        :param query_key: query key of the search within that session.
        :return: the FASTA text read from the response.
        """
        hist_handle = Entrez.efetch(db=self.database,
                                    retstart=start,
                                    rettype="fasta",
                                    retmode="text",
                                    retmax=b_size,
                                    webenv=webenv,
                                    query_key=query_key)
        try:
            return hist_handle.read()
        finally:
            # Close the connection even if reading fails.
            hist_handle.close()
Example #17
0
    def fetch_by_history(self, start, b_size, webenv, query_key):
        """
        Fetches NCBI data for a previous search through its history session.

        :param start: index of the first record, passed as ``retstart``.
        :param b_size: batch size passed as ``retmax``.
        :param webenv: history session identifier of the search.
        :param query_key: query key of the search within that session.
        :return: the FASTA text read from the response.
        """
        hist_handle = Entrez.efetch(db=self.database,
                                    retstart=start,
                                    rettype="fasta",
                                    retmode="text",
                                    retmax=b_size,
                                    webenv=webenv,
                                    query_key=query_key)
        try:
            return hist_handle.read()
        finally:
            # Close the connection even if reading fails.
            hist_handle.close()
Example #18
0
 def runTest(self):
     """Check that the source sequence id contains the last EMBL cross-reference.

     Reads one Swiss-Prot record from ``test.dat.bgz`` and asserts that the
     accession of its last EMBL dbxref appears in the id of the sequence
     returned by ``Entrez.get_source_seq``.
     """
     # Close the compressed file once the record has been parsed.
     with gzip.open('test.dat.bgz') as handle:
         p = SeqIO.read(handle, 'swiss')
     embl = [xref.split(':')[1] for xref in p.dbxrefs if 'EMBL' in xref][-1]
     # The second returned value (the feature) is not needed here.
     source_seq, _ = Entrez.get_source_seq(p)
     self.assertIn(embl, source_seq.id)
Example #19
0
def main(argv):
    """Count non-strand-specific reads on exons and introns of Entrez genes.

    Parses the command line, loads the pickled Entrez gene annotation, calls
    ``calculate_non_strandspecific_rc_on_ExonIntrons`` and writes a
    tab-separated per-gene summary to the ``-o`` file plus four pickle files
    named ``<outfile>_shared_exons.pkl``, ``<outfile>_shared_introns.pkl``,
    ``<outfile>_merged_transcripts.pkl`` and ``<outfile>_summary.pkl``.

    :param argv: argument list handed to ``OptionParser.parse_args``; the
        help text is printed and the program exits when it has fewer than
        10 items.
    """
    parser = OptionParser()
    parser.add_option("-r",
                      "--readfile",
                      action="store",
                      type="string",
                      dest="Reads",
                      help="input bed file for non-strand specific raw reads",
                      metavar="<file>")
    parser.add_option(
        "-g",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms:
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # entrez_gene_collection is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # test module (manual debugging switch, off by default)
    test = 0
    if test == 1:
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Suffix of the per-chromosome read files: "-<lib>.<ext>1".
    rawreadslibName1 = (opt.Reads).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    # NOTE(review): totalcount is computed (and printed per chromosome) but
    # is not passed on to anything below; kept because of its output.
    totalcount = 0
    if Utility_extended.fileExists(opt.Reads) == 1:
        totalcount = get_total_tag_counts.get_total_tag_counts(opt.Reads)
    else:  # if the all file exist, then use the all file, otherwise use the chrom separated file
        for chrom in chroms:
            chrombed = chrom + rawreadsextension1
            totalcount1 = get_total_tag_counts.get_total_tag_counts(chrombed)
            print("%s %s" % (chrom, totalcount1))
            totalcount += totalcount1

    (reads_on_shared_exons, reads_on_shared_introns,
     reads_on_merged_transcripts,
     summary) = calculate_non_strandspecific_rc_on_ExonIntrons(
         entrez_gene_collection, opt.Reads, chroms, opt.fragment_size)

    # Columns taken from summary[entrez_id], in output order.
    summary_fields = (
        "merged_exons_rc",
        "merged_exons_total_length",
        "merged_exon_RPKM",
        "shared_exons_rc",
        "shared_exons_total_length",
        "shared_exon_RPKM",
        "shared_introns_rc",
        "shared_introns_total_length",
        "shared_intron_RPKM",
        "merged_transcript_rc",
        "merged_transcript_length",
        "merged_transcript_RPKM",
    )

    # Opening with 'w' clears any previous content of the file.
    with open(opt.outfile, 'w') as outf:
        outf.write(
            "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
        )
        for entrez_id in entrez_gene_collection.entrez_ids:
            gene = (entrez_gene_collection.entrez_genes)[entrez_id]

            # Distinct gene symbols in first-seen order.
            gene_symbol = []
            for transcript in gene.transcripts:
                symbol = transcript.additional_annotations[0]
                if symbol not in gene_symbol:
                    gene_symbol.append(symbol)

            gene_summary = summary[entrez_id]
            columns = [str(entrez_id)]
            columns.extend(str(gene_summary[field]) for field in summary_fields)
            columns.append(','.join(
                [transcript.name for transcript in gene.transcripts]))
            columns.append(','.join(gene_symbol))
            outf.write('\t'.join(columns) + '\n')

    # Store the detailed results in pickle files.
    # The first three have the layout {entrezID:[((start, end), read_count)]}.
    pickled_outputs = (
        ("_shared_exons.pkl", reads_on_shared_exons),
        ("_shared_introns.pkl", reads_on_shared_introns),
        ("_merged_transcripts.pkl", reads_on_merged_transcripts),
        ("_summary.pkl", summary),
    )
    for name_suffix, payload in pickled_outputs:
        with open(opt.outfile + name_suffix, 'wb') as output:
            pickle.dump(payload, output)

    print("it took %s seconds." % (time.time() - startTime))
def assign_AluElements_to_intronexons_by_chrom(my_entrez_genes,
                                               Alufile_by_chrom, chrom):
    """
    Assign the Alu elements on one chromosome to the shared exons, shared
    introns and merged transcript of every Entrez gene on that chromosome.

    Each Alu element is represented by the integer midpoint of its genomic
    start and end. The Alu file needs to conform to bed format.

    :param my_entrez_genes: KnownEntrezGenes collection.
    :param Alufile_by_chrom: file with the Alu elements of ``chrom``.
    :param chrom: chromosome name.
    :return: ``(shared_intron_Alus, shared_exon_Alus,
        merged_transcript_Alus)``, each ``{entrezID: [(region,
        [Alu_positions])]}``. All three are empty when the file does not
        exist or ``chrom`` is not in ``my_entrez_genes.chroms``.
    """
    # Bound up front so the return below also works when the chromosome is
    # skipped; previously that path raised UnboundLocalError.
    shared_exon_Alus = {}  #{entrezID:[(region, [Alu_positions])]}
    shared_intron_Alus = {}  #{entrezID:[(region, [Alu_positions])]}
    merged_transcript_Alus = {}  #{entrezID:[(region, [Alu_positions])]}

    if Utility_extended.fileExists(Alufile_by_chrom) and (
            chrom in my_entrez_genes.chroms):
        print(chrom)
        # set up a KnownEntrezGenes Instance for entrez_genes on this particular chrom
        entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
            [chrom], my_entrez_genes.subset_by_chrom(chrom))

        # load in the Alus on chrom
        Alus = RepElements.KnownRepElements.initiate_from_file(
            [chrom], Alufile_by_chrom)
        print("There are %d of elements on %s" % (Alus.number, chrom))
        # Use the mid point of an Alu Element to represent its position, each element is a tuple of (position, id)
        Alu_positions = []
        for myid in Alus.rep_elements.keys():
            element = Alus.rep_elements[myid]
            position = int((element.genoStart + element.genoEnd) / 2.0)
            Alu_positions.append((position, myid))
        Alu_positions.sort(key=itemgetter(0))

        if entrez_genes_by_chrom.num_genes > 0:
            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                gene = (entrez_genes_by_chrom.entrez_genes)[entrez_id]

                shared_exons = gene.shared_exonic_regions  #sorted in absolute coordinate
                if shared_exons == []:  # No shared exons
                    shared_exon_Alus[entrez_id] = []
                else:
                    # element in result has structure: ((start, end), [Alu_positions in range])
                    shared_exon_Alus[entrez_id] = (
                        Utility_extended.associate_tags_with_regions(
                            Alu_positions, shared_exons))

                shared_introns = gene.shared_intronic_regions  #sorted
                if shared_introns == []:  # No shared introns
                    shared_intron_Alus[entrez_id] = []
                else:
                    # element in result has structure: ((start, end), [Alus in range])
                    shared_intron_Alus[entrez_id] = (
                        Utility_extended.associate_tags_with_regions(
                            Alu_positions, shared_introns))

                merged_transcript = gene.boundaries  #[(start, end)]
                # element in result has structure: ((start, end), [Alus in range])
                merged_transcript_Alus[entrez_id] = (
                    Utility_extended.associate_tags_with_regions(
                        Alu_positions, merged_transcript))

    return (shared_intron_Alus, shared_exon_Alus, merged_transcript_Alus)
def main(argv):
    """Assign Alu elements to introns/exons of Entrez genes and pickle the results.

    Parses the command line, loads the pickled Entrez gene annotation,
    splits the Alu annotation file by chromosome when needed, assigns the
    Alu elements chromosome by chromosome and writes five pickle files whose
    names start with the ``-o`` value.

    :param argv: argument list handed to ``OptionParser.parse_args``; the
        help text is printed and the program exits when it has fewer than
        8 items.
    """
    parser = OptionParser()
    parser.add_option(
        "-a",
        "--AluElementsFile",
        action="store",
        type="string",
        dest="Alus",
        help="input Alu annotation file for non-strand specific analysis",
        metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_collection",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms:
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # entrez_collection is a dic (keyed by entrez_id) of lists of EntrezGene object
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.entrez_collection, 'rb') as annotation:
        my_entrez_genes = Entrez.KnownEntrezGenes(chroms,
                                                  pickle.load(annotation))

    # Suffix of the per-chromosome Alu files: "-<lib>.<ext>1".
    lib_name = (opt.Alus).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.Alus):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            # Separate by chrom and sort by start
            print("%s %s  files do not exist, separate by chroms. " %
                  (chroms, extension))
            SeparateByChrom.separateByChrom(chroms, opt.Alus, extension)
    else:
        print("%s  is not found" % (opt.Alus,))
        sys.exit(1)

    Alus_in_shared_intron = {}
    Alus_in_shared_exon = {}
    Alus_in_merged_transcript = {}

    for chrom in chroms:
        (shared_intron_Alus, shared_exon_Alus,
         merged_transcript_Alus) = assign_AluElements_to_intronexons_by_chrom(
             my_entrez_genes, chrom + extension, chrom)
        # Sanity check on one gene of the first chromosome. Skipped when that
        # chromosome yielded no genes (indexing the keys used to fail there).
        if chrom == chroms[0] and shared_intron_Alus:
            myid = next(iter(shared_intron_Alus))
            test(my_entrez_genes, shared_intron_Alus, myid)
        Alus_in_shared_intron.update(shared_intron_Alus)
        Alus_in_shared_exon.update(shared_exon_Alus)
        Alus_in_merged_transcript.update(merged_transcript_Alus)

    def dump_and_report(payload, name_suffix, gene_count):
        # Pickle ``payload`` to <outfile><name_suffix> and report the count.
        outname = opt.outfile + name_suffix
        with open(outname, 'wb') as output:
            pickle.dump(payload, output)
        print("The number of genes output to %s is %d " %
              (outname, gene_count))

    #{entrezID:[(region=(start, end), Alu_count)]}
    Alus_in_shared_intron_dist = {}
    for myid, regions_on_this_gene in Alus_in_shared_intron.items():
        Alus_in_shared_intron_dist[myid] = [
            (region_coord, len(Alu_positions))
            for region_coord, Alu_positions in regions_on_this_gene
        ]
    dump_and_report(Alus_in_shared_intron_dist,
                    "_Alu_distribution_in_shared_intron.pkl",
                    len(Alus_in_shared_intron))

    #{entrezID:[(region, Alu_positions)]}
    dump_and_report(Alus_in_shared_intron, "_Alus_in_shared_intron.pkl",
                    len(Alus_in_shared_intron))

    #{entrezID:[(region, Alu_positions)]}
    dump_and_report(Alus_in_shared_exon, "_Alus_in_shared_exon.pkl",
                    len(Alus_in_shared_exon))

    #Though in this case the structure can be simpler: {entrezID:(region, Alu_count)}, it is better to make the interface uniform.{entrezID:[(region, Alu_count)]}
    Alus_in_merged_transcript_dist = {}
    for myid in Alus_in_merged_transcript.keys():
        assert len(Alus_in_merged_transcript[myid]) == 1
        region_coord, Alu_positions = (Alus_in_merged_transcript[myid])[0]
        Alus_in_merged_transcript_dist[myid] = [(region_coord,
                                                 len(Alu_positions))]
    dump_and_report(Alus_in_merged_transcript_dist,
                    "_Alu_distribution_in_merged_transcript.pkl",
                    len(Alus_in_merged_transcript))

    #{entrezID:[(region, Alu_positions)]}
    dump_and_report(Alus_in_merged_transcript,
                    "_Alus_in_merged_transcript.pkl",
                    len(Alus_in_merged_transcript))

    print("it took %s seconds." % (time.time() - startTime))
def Calculate3UTRUsage(entrez_genes, bedfile, column_index, chroms,
                       fragment_size, downstream_extension, outfile):
    """
	entrez genes are made sure to be on one strand, 
	the bed file are reads for that strand
	
	entrez_genes is a KnownEntrezGenes class object
	The raw read file needs to conform to bed format
	
	column_index: column in bed file for sorting
	
	"""
    # Separate reads by chrom
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                    rawreadsextension1,
                                                    [column_index])
    else:
        print bedfile, " is not found"
        sys.exit(1)

    # Here the output is 'a'
    outf = open(outfile, 'a')
    for chrom in chroms:
        if chrom in entrez_genes.chroms:
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            # this_chrom_length = chrom_lengths[chrom]
            # Get the read locations
            if Utility_extended.fileExists(chrom + rawreadsextension1):
                f = open(chrom + rawreadsextension1, 'r')
                tag_positions = []
                for line in f:
                    line = line.strip()
                    sline = line.split()
                    tag_positions.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
                if not Utility_extended.is_list_sorted(tag_positions):
                    tag_positions.sort()
                f.close()

                for entrez_id in entrez_genes_by_chrom.entrez_ids:
                    gene = entrez_genes_by_chrom.entrez_genes[
                        entrez_id]  # an EntrezGene class object
                    three_UTRs = gene.get_3UTRs(downstream_extension)
                    print three_UTRs
                    union = Utility_extended.union(
                        three_UTRs
                    )  # Find the union of 3UTRs [(start, end)], returns a [(start,end)]
                    if len(union) > 1:
                        print "There are disjoint 3UTRs in %s" % (
                            str(entrez_id))
                    else:
                        # returns [((start, end), [tag_positions])], [tag_positions] = return[0][1]
                        inside_reads = (Utility_extended.
                                        associate_simple_tags_with_regions(
                                            tag_positions, union))[0][1]
                        total_read_count = len(inside_reads)
                        RUD = CUTR_vs_AUTR(three_UTRs, inside_reads,
                                           gene.strand)

                        ## For the set of genes, use the distal 3UTR at the designated representative 3UTR
                        #myindex = Calculate3UTRUsageIndexFromCuratedGenes.find_distal_3UTR(genes)
                        #gene = genes[myindex]
                        #results = ThreeUTRCharacteristics(gene, inside_reads)

                        gene_symbol = []
                        for mytranscript in gene.transcripts:
                            if mytranscript.additional_annotations[
                                    0] not in gene_symbol:
                                gene_symbol.append(
                                    mytranscript.additional_annotations[0])

                        union_length = union[0][1] - union[0][0] + 1
                        outline = str(entrez_id) + "\t" + str(
                            union_length) + "\t" + str(RUD) + "\t" + str(
                                total_read_count) + "\t" + ','.join([
                                    transcript.name
                                    for transcript in gene.transcripts
                                ]) + "\t" + ','.join(gene_symbol) + "\n"

                    outf.write(outline)
    outf.close()
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-f",
        "--forwardreadfile",
        action="store",
        type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r",
        "--reversereadfile",
        action="store",
        type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option(
        "-g",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="outfile name",
                      metavar="<file>")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-d",
                      "--3UTRdownstreamextension",
                      action="store",
                      type="int",
                      dest="downstream_extension",
                      help="3UTR down stream extension",
                      metavar="<int>")

    (opt, args) = parser.parse_args(argv)

    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    allowance = 10

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_genes, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms,
                                                     pickle.load(annotation))
    annotation.close()

    # test module
    test = 0
    if test == 1:
        print "Testing gene structure"
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd
    entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd(
    )
    print "There are ", len(entrez_ids_with_unique_cdsEnd
                            ), " Entrez IDs each of which has a unique cdsEnd."

    #get total read count
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print totalcount_F, totalcount_R

    #Clear the file and write the first line, needs to be modified
    outf = open(opt.outfile, 'w')
    #outline = "# Entrez ID \t Main Refseq ID \t 3UTR union length \t Length Index \t PA Multiplicity Index \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n"
    outline = "# Entrez ID \t 3UTR Union length \t RUD \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n"
    outf.write(outline)
    outf.close()

    #index: column in bed file for sorting
    index = 2

    print "Process genes on forward strand"
    entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids(
        "+", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_forward_strand), " Entrez IDs on forward strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, index,
                       chroms, opt.fragment_size, opt.downstream_extension,
                       opt.outfile)

    print "Process genes on reverse strand"
    entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids(
        "-", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, index,
                       chroms, opt.fragment_size, opt.downstream_extension,
                       opt.outfile)

    print "it took", time.time() - startTime, "seconds."
Example #24
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-u",
                      "--annotation_pickle_file",
                      action="store",
                      type="string",
                      dest="annotation",
                      metavar="<file>",
                      help="annotation for strand information")
    parser.add_option(
        "-a",
        "--rnaseq_intron_pickle_file",
        action="store",
        type="string",
        dest="RNA_Seq_intron_pickle",
        metavar="<file>",
        help=
        "read densities for individual shared intronic regions in pickle format"
    )
    parser.add_option(
        "-b",
        "--rnaseq_exon_pickle_file",
        action="store",
        type="string",
        dest="RNA_Seq_exon_pickle",
        metavar="<file>",
        help=
        "read densities for individual shared exonic regions in pickle format")
    parser.add_option(
        "-d",
        "--alu_distribution_intron_pkl",
        action="store",
        type="string",
        dest="alu_distribution_intron_pkl",
        metavar="<file>",
        help=
        "Alu densities for individual shared intronic regions in pickle format"
    )
    parser.add_option(
        "-e",
        "--alu_distribution_in_merged_transcript_pkl",
        action="store",
        type="string",
        dest="alu_distribution_in_merged_transcript_pkl",
        metavar="<file>",
        help="Alu counts for transcript regions in pickle format")
    parser.add_option("-f",
                      "--ids",
                      action="store",
                      type="string",
                      dest="id_subset_file",
                      metavar="<file>",
                      help="file that records ids of interest",
                      default="")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    print "Read in RNASeq information"
    # read in from intron pickle resulted in {entrezID:[rpkms]}
    intron_read_density_distribution = readin_RNASeq(opt.RNA_Seq_intron_pickle)
    # read in from exon pickle resulted in {entrezID:[rpkms]}
    exon_read_density_distribution = readin_RNASeq(opt.RNA_Seq_exon_pickle)

    # read in annotation file for strand and gene cluster information
    print "Read in annotation"
    annotation = open(opt.annotation, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms,
                                                     pickle.load(annotation))
    annotation.close()

    # read in the Alus
    print "Read in Alu information"
    Alu_distribution_on_introns = readin_Alu(opt.alu_distribution_intron_pkl)
    Alu_distribution_on_transcripts = readin_Alu(
        opt.alu_distribution_in_merged_transcript_pkl)

    print "Load in ids"
    if opt.id_subset_file != "":
        id_set = []
        f = open(opt.id_subset_file, 'r')
        for line in f:
            if not comment.match(line):
                line = line.strip()
                sline = line.split('\t')
                id_set.append(int(sline[0]))
        f.close()
        print "There are %d ids in %s" % (len(id_set), opt.id_subset_file)

    pc = 0.000000001
    exon_read_density_cutoff = 1
    combine_iri_Alu_genic_level(intron_read_density_distribution,
                                exon_read_density_distribution,
                                Alu_distribution_on_transcripts, id_set,
                                exon_read_density_cutoff)

    combine_iri_Alu_intron_level(intron_read_density_distribution,
                                 exon_read_density_distribution,
                                 Alu_distribution_on_introns, id_set,
                                 exon_read_density_cutoff, pc)
def main(argv):
    parser = OptionParser()
    parser.add_option("-f",
                      "--forwardalufile",
                      action="store",
                      type="string",
                      dest="AlusOnForwardStrand",
                      help="input file for Alus on forward strand",
                      metavar="<file>")
    parser.add_option("-r",
                      "--reversealufile",
                      action="store",
                      type="string",
                      dest="AlusOnReverseStrand",
                      help="input file for Alus on reverse strand",
                      metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_genes is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_genes, 'rb')
    entrez_genes = Entrez.EntrezGenes(chroms, pickle.load(annotation))
    annotation.close()

    #test module
    Entrez.test(entrez_genes)

    #Clear the file.
    outf = open(opt.outfile, 'w')
    outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
    outf.write(outline)
    outf.close()

    # The RNA seq data are strand specific. Only use + reads on genes on forward strand, and - reads on genes on reverse strand.
    print "Process genes on forward strand"
    (forward_shared_exon_count,
     forward_shared_intron_count) = CalculateExonIntrons(
         entrez_genes_on_forward_strand, opt.ReadsOnForwardStrand, chroms,
         opt.fragment_size, totalcount, opt.outfile)
    print "Process genes on reverse strand"
    (reverse_shared_exon_count,
     reverse_shared_intron_count) = CalculateExonIntrons(
         entrez_genes_on_reverse_strand, opt.AlusOnReverseStrand, chroms,
         opt.fragment_size, totalcount, opt.outfile)

    #combine the densities
    shared_exon_count = {}
    shared_intron_count = {}
    for chrom in chroms:
        # exon
        if chrom in forward_shared_exon_count.keys():
            shared_exon_count[chrom] = forward_shared_exon_count[chrom]
        if chrom in reverse_shared_exon_count.keys():
            shared_exon_count[chrom].update(reverse_shared_exon_count[chrom])
        # intron
        if chrom in forward_shared_intron_count.keys():
            shared_intron_count[chrom] = forward_shared_intron_count[chrom]
        if chrom in reverse_shared_intron_count.keys():
            shared_intron_count[chrom].update(
                reverse_shared_intron_count[chrom])
    #store the info in a pickle file
    name = opt.outfile + "_shared_exon_RPKMS.pkl"
    output = open(name, 'wb')
    pickle.dump(shared_exon_count, output)
    output.close()
    name = opt.outfile + "_shared_intron_RPKMS.pkl"
    output = open(name, 'wb')
    pickle.dump(shared_intron_count, output)
    output.close()

    print "it took", time.time() - startTime, "seconds."
def main(argv):
	parser = OptionParser()
	parser.add_option("-f", "--forwardreadfile", action="store", type="string", dest="ReadsOnForwardStrand", help="input bed file for RNASeq raw reads on forward strand", metavar="<file>")
	parser.add_option("-r", "--reversereadfile", action="store", type="string", dest="ReadsOnReverseStrand", help="input bed file for RNASeq raw reads on reverse strand", metavar="<file>")
	parser.add_option("-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help="file with curated known genes clustered by entrez ID in pickle format")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>")
	parser.add_option("-s", "--species", action="store", type="string", dest="species",help="species, mm8, hg18, etc", metavar="<str>")
	parser.add_option("-p", "--PAfile", action="store", type="string", dest="PAfile", help="input bed3 file", metavar="<file>")	
	parser.add_option("-e", "--extension", action="store", type="int", dest="extension",help="integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end", metavar="<float>")
		

	(opt, args) = parser.parse_args(argv)

	if len(argv) < 14:
		parser.print_help()
		sys.exit(1)

	startTime = time.time()

	allowance = 10

	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species]
		chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
	else:
		print "This species is not recognized, exiting"
		sys.exit(1)

	# entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
	annotation = open(opt.entrez_genes, 'rb')
	entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation))
	annotation.close()

	# test module
	test = 0
	if test == 1:
		print "Testing gene structure"
		test_id = 79947
		Entrez.test_gene_structure(entrez_gene_collection, test_id)


	# Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd
	entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd()
	print "There are ", len(entrez_ids_with_unique_cdsEnd), " Entrez IDs each of which has a unique cdsEnd."


	#get total read count
	totalcount_F = get_total_tag_counts.get_total_tag_counts(opt.ReadsOnForwardStrand)
	totalcount_R = get_total_tag_counts.get_total_tag_counts(opt.ReadsOnReverseStrand)
	totalcount = totalcount_F + totalcount_R
	print totalcount_F, totalcount_R

	#Clear the file and write the first line
	outf = open(opt.outfile, 'w')
	
	#outline to use to output polyA information for a species	
	#outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "UTRstart" + "\t" + "PolyAsites" + "\n"
	#outline to use to output RUDs
	outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "Basic_RUD" + "\t" + "List_of_subRUDs" + "\n"
	outf.write(outline)
	outf.close()

	#index: column in bed file for sorting
	index = 2

	print "Process genes on forward strand"
	entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids("+", entrez_ids_with_unique_cdsEnd)
	print "There are ", len(entrez_ids_on_forward_strand), " Entrez IDs on forward strand."
	entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

	Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index)


	print "Process genes on reverse strand"
	entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids("-", entrez_ids_with_unique_cdsEnd)
	print "There are ", len(entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand."
	entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

	Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, chroms, opt.outfile, allowance, opt.PAfile, opt.extension, index)

	print "it took", time.time() - startTime, "seconds."
Example #27
0
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-f",
        "--forwardreadfile",
        action="store",
        type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r",
        "--reversereadfile",
        action="store",
        type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="outfile name",
                      metavar="<file>")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-p",
                      "--PAfile",
                      action="store",
                      type="string",
                      dest="PAfile",
                      help="input bed3 file",
                      metavar="<file>")
    parser.add_option(
        "-e",
        "--extension",
        action="store",
        type="int",
        dest="extension",
        help=
        "integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end",
        metavar="<float>")

    (opt, args) = parser.parse_args(argv)

    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    allowance = 10

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_genes, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms,
                                                     pickle.load(annotation))
    annotation.close()

    # test module
    test = 0
    if test == 1:
        print "Testing gene structure"
        test_id = 79947
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd
    entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd(
    )
    print "There are ", len(entrez_ids_with_unique_cdsEnd
                            ), " Entrez IDs each of which has a unique cdsEnd."

    #get total read count
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print totalcount_F, totalcount_R

    #Clear the file and write the first line
    outf = open(opt.outfile, 'w')

    #outline to use to output polyA information for a species
    #outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "UTRstart" + "\t" + "PolyAsites" + "\n"
    #outline to use to output RUDs
    outline = "# Entrez ID" + "\t" + "Chrom" + "\t" + "Strand" + "\t" + "Basic_RUD" + "\t" + "List_of_subRUDs" + "\n"
    outf.write(outline)
    outf.close()

    #index: column in bed file for sorting
    index = 2

    print "Process genes on forward strand"
    entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids(
        "+", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_forward_strand), " Entrez IDs on forward strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, chroms,
                       opt.outfile, allowance, opt.PAfile, opt.extension,
                       index)

    print "Process genes on reverse strand"
    entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids(
        "-", entrez_ids_with_unique_cdsEnd)
    print "There are ", len(
        entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand."
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

    Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, chroms,
                       opt.outfile, allowance, opt.PAfile, opt.extension,
                       index)

    print "it took", time.time() - startTime, "seconds."
Example #28
0
def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold,
                       PAfile, extension, index):
    """
	entrez genes are made sure to be on one strand, 
	the bed file are reads for that strand

	entrez_genes is a KnownEntrezGenes class object
	The raw read file needs to conform to bed format

	column_index: column in bed file for sorting

	"""
    # Separate reads by chrom
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                    rawreadsextension1,
                                                    str(index))
    else:
        print bedfile, " is not found"
        sys.exit(1)

    #This part is to access the polyadenylation sites
    PA1 = open(PAfile, 'r')

    PAsiteslist = []
    PA2 = 'i'
    while PA2 != '':
        PA2 = PA1.readline()
        if PA2 != '':
            PA3 = PA2.strip('\n')
            PA4 = PA3.split('\t')
            PAsiteslist.append((PA4[0], PA4[1]))

    PA1.close()

    # Here the output is 'a', i.e. the output is appended to an existing file instead of creating one
    outf = open(outfile, 'a')
    for chrom in chroms:
        if chrom in entrez_genes.chroms:
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            # Get the read locations
            if Utility_extended.fileExists(chrom + rawreadsextension1):
                f = open(chrom + rawreadsextension1, 'r')
                tag_positions = []
                for line in f:
                    line = line.strip()
                    sline = line.split()
                    #make sure the extension is always 0, otherwise the rest of the program might not work as intended
                    tag_positions.append(
                        associate_tags_with_regions.tag_position(sline, 0))

                f.close()
                if not Utility_extended.is_list_sorted(tag_positions):
                    tag_positions.sort()
                #By this point tag_positions is a sorted list of all the reads located on the strand and chromosome the code is currently dealing with

                for entrez_id in entrez_genes_by_chrom.entrez_ids:
                    gene = entrez_genes_by_chrom.entrez_genes[
                        entrez_id]  # an EntrezGene class object
                    # get_3UTRs gets the ENTREZ 3'UTR, which appears to generally give the beginning of the 3'UTR and a site very close to the most distal polyadenylation site
                    three_UTRs = gene.get_3UTRs()
                    # Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites given to create the true data for the 3'UTR needed for CUTR_vs_AUTR to work
                    true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end, UTRbeginning = Mastertuplemaker(
                        three_UTRs, PAsiteslist, chrom, gene.strand, extension)
                    #value should always be 1 as only 3'UTR with more than 1 polyA site need be considered
                    if len(true3UTRends) > 1:
                        #find all reads inside the 3'UTR
                        inside_reads = associate_tags_with_3UTR(
                            tag_positions, UTRregion_start, UTRregion_end)
                        #finds reads in each region of the 3'UTR and calculates aUTR/cUTR for each of them
                        #PolyAsites potentially useful for output
                        RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(
                            true3UTRstarts, true3UTRends, inside_reads,
                            gene.strand, threshold)

                        #important if one wants to output gene_symbol information
                        gene_symbol = []
                        for mytranscript in gene.transcripts:
                            if mytranscript.additional_annotations[
                                    0] not in gene_symbol:
                                gene_symbol.append(
                                    mytranscript.additional_annotations[0])

                        #outline to use to output RUDs
                        outline = str(
                            entrez_id
                        ) + "\t" + chrom + "\t" + gene.strand + "\t" + str(
                            basic_RUD) + "\t" + ",".join(map(str, RUDs)) + "\n"

                        #outline to use to output polyA information for a species
                        #outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n"

                        outf.write(outline)
    outf.close()
Example #29
0
def calculateExonIntrons(entrez_genes,
                         bedfile,
                         column_index,
                         chroms,
                         fragment_size,
                         totalcount,
                         out_file=None):
    """
    Gather per-gene read distributions and summaries, one chromosome at a time.

    The read file is split into per-chromosome files (unless they already
    exist), each chromosome present in both chroms and entrez_genes.chroms is
    handed to calculateExonIntrons_by_chrom, and the per-chromosome results
    are merged. The per-chromosome files are removed before returning.

    entrez_genes: object providing .chroms and .subset_by_chrom(chrom).
        Per the original note it is an EntrezGenes class object whose core
        is a dict keyed by entrez_id.
    bedfile: path of the read file; the program exits with status 1 when
        it is not found.
    column_index: column handed (wrapped in a list) to
        Utility_extended.separate_by_chrom_sort as the sort column.
    chroms: chromosome names to process.
    fragment_size, totalcount, out_file: forwarded untouched to
        calculateExonIntrons_by_chrom.

    return: a 4-tuple of dicts
        all_reads_on_shared_exons       {entrezID:[((start, end), read_count)]}
        all_reads_on_shared_introns     {entrezID:[((start, end), read_count)]}
        all_reads_on_merged_transcripts {entrezID:[((start, end), read_count)]}
        all_summary                     {entrezID:{attribute:value}}
    The summary attributes are filled in by calculateExonIntrons_by_chrom
    (not shown here); the original documentation lists them as
        merged_exons_rc, merged_exon_RPKM, merged_exons_total_length,
        shared_exons_rc, shared_exon_RPKM, shared_exons_total_length,
        shared_introns_rc, shared_intron_RPKM, shared_introns_total_length,
        merged_transcript_rc, merged_transcript_RPKM, merged_transcript_length
    """
    # Per-chromosome files are named <chrom>-<stem>.<suffix>1, where stem and
    # suffix are the first and last dot-separated pieces of the file name.
    file_name = bedfile.split('/')[-1]  # drop the directory part
    name_pieces = file_name.split('.')
    extension = "-" + name_pieces[0] + '.' + name_pieces[-1] + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, extension) != 1:
        # Split the reads by chromosome, sorted on the requested column.
        print("%s %s %s" % (chroms, extension,
                            " files do not exist, separate by chroms. "))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension,
                                                [column_index])

    all_reads_on_shared_exons = {}
    all_reads_on_shared_introns = {}
    all_reads_on_merged_transcripts = {}
    all_summary = {}

    for chrom in chroms:
        chrombed = chrom + extension
        if chrom not in entrez_genes.chroms:
            continue

        genes_on_chrom = Entrez.KnownEntrezGenes(
            [chrom], entrez_genes.subset_by_chrom(chrom))
        per_chrom = calculateExonIntrons_by_chrom(genes_on_chrom, chrombed,
                                                  fragment_size, totalcount,
                                                  out_file)
        exon_reads, intron_reads, transcript_reads, chrom_summary = per_chrom

        all_reads_on_shared_exons.update(exon_reads)
        all_reads_on_shared_introns.update(intron_reads)
        all_reads_on_merged_transcripts.update(transcript_reads)
        all_summary.update(chrom_summary)
        # Running total of genes summarised so far.
        print(len(all_summary))

    SeparateByChrom.cleanup(chroms, extension)
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)
Example #30
0
def AssignPeaksToEntrez3UTRs(entrez_genes, peakfile, chroms, chrom_lengths, peak_threshold, downstream_extension):
    """
    Assign strand-matched peaks to the representative 3'UTR of every Entrez gene.

    For each gene the transcript with the longest 3'UTR is used. The region
    searched runs from the CDS end to the transcript end plus
    downstream_extension (clipped to the chromosome length) on the forward
    strand, and from the transcript start minus downstream_extension
    (clipped to 0) to the CDS start on the reverse strand.

    entrez_genes: object providing .chroms and .subset_by_chrom(chrom).
    peakfile: whitespace-separated peak file (pseudo ucsc format according
        to the original comment). The code reads the strand from column
        index 2, the forward-strand location from index 3, the
        reverse-strand location from index 4 and the read count from
        index 10. The program exits with status 1 if the file is not found.
    chroms: chromosome names to process.
    chrom_lengths: mapping chrom -> chromosome length.
    peak_threshold: peaks with a read count below this value are ignored.
    downstream_extension: number of bases the 3'UTR is extended downstream.

    Returns {entrez_id: (gene, ThreeUTR_length, peaks_on_3UTR)}
        gene: entrez_genes_by_chrom.entrez_genes[entrez_id]
        ThreeUTR_length: longest 3UTR length; includes the downstream extension
        peaks_on_3UTR: [(location, read_count)] sorted by location
    Genes on a chromosome without a per-chromosome peak file, and genes whose
    representative transcript has an unrecognised strand, are left out.
    """
    peaks_on_entrez_3UTRs = {}

    if not Utility_extended.fileExists(peakfile):
        print("%s %s" % (peakfile, " is not found"))
        sys.exit(1)

    # Split the peaks into per-chromosome files named <chrom><island_extension1>.
    island_libName1 = peakfile.split('/')[-1]
    island_suffix1 = island_libName1.split('.')[-1]
    island_libName1 = island_libName1.split('.')[0]
    island_extension1 = "-" + island_libName1 + '.' + island_suffix1 + "1"
    SeparateByChrom.separateByChrom(chroms, peakfile, island_extension1)

    for chrom in chroms:
        if chrom not in entrez_genes.chroms:
            continue
        entrez_genes_by_chrom = Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
        this_chrom_length = chrom_lengths[chrom]

        chrom_peakfile = chrom + island_extension1
        if not Utility_extended.fileExists(chrom_peakfile):
            continue

        # Read the peaks, keeping forward- and reverse-strand peaks apart.
        five_peaks = []   # peaks on forward strand, element (location, read_count)
        three_peaks = []  # peaks on reverse strand, element (location, read_count)
        with open(chrom_peakfile, 'r') as inf:
            for line in inf:
                sline = line.split()
                if not sline:
                    # Blank line: nothing to parse.
                    continue
                strand = sline[2]
                if plus.match(strand):
                    location_column, strand_peaks = 3, five_peaks
                elif minus.match(strand):
                    location_column, strand_peaks = 4, three_peaks
                else:
                    continue
                read_count = float(sline[10])
                if read_count >= peak_threshold:
                    strand_peaks.append((int(sline[location_column]), read_count))

        # Sort by location so the region lookups below can use bisection.
        five_peaks.sort(key=itemgetter(0))
        five_peaks_location = [item[0] for item in five_peaks]
        three_peaks.sort(key=itemgetter(0))
        three_peaks_location = [item[0] for item in three_peaks]

        for entrez_id in entrez_genes_by_chrom.entrez_ids:
            gene = entrez_genes_by_chrom.entrez_genes[entrez_id]  # an EntrezGene class object

            # The transcript with the longest 3UTR represents the gene.
            transcript_with_longest_3UTR = gene.identify_transcript_with_longest_3UTR()

            if plus.match(transcript_with_longest_3UTR.strand):
                start = transcript_with_longest_3UTR.cdsEnd
                end = min(transcript_with_longest_3UTR.txEnd + downstream_extension, this_chrom_length)
                strand_peaks, strand_locations = five_peaks, five_peaks_location
            elif minus.match(transcript_with_longest_3UTR.strand):
                start = max(transcript_with_longest_3UTR.txStart - downstream_extension, 0)
                end = transcript_with_longest_3UTR.cdsStart
                strand_peaks, strand_locations = three_peaks, three_peaks_location
            else:
                # Unknown strand: skip the gene instead of reusing the
                # region and peaks computed for the previous gene.
                continue

            start_ind = bisect.bisect_left(strand_locations, start)
            end_ind = bisect.bisect_right(strand_locations, end)
            peaks_on_3UTR = strand_peaks[start_ind:end_ind]  # [(mode_location, readcount)]
            ThreeUTR_length = end - start + 1  # length includes the downstream extension
            peaks_on_entrez_3UTRs[entrez_id] = (gene, ThreeUTR_length, peaks_on_3UTR)

    SeparateByChrom.cleanup(chroms, island_extension1)
    return peaks_on_entrez_3UTRs
Example #31
0
def _dump_pickle(obj, path):
    """Pickle obj into the file at path, closing the file even on failure."""
    with open(path, 'wb') as output:
        pickle.dump(obj, output)


def _merge_strands(forward, reverse):
    """Return a new dict with the forward-strand entries updated by the reverse-strand ones."""
    merged = {}
    merged.update(forward)
    merged.update(reverse)
    return merged


def main(argv):
    """
    Command-line entry point: strand-specific read counts on Entrez genes.

    Forward-strand reads are counted only on forward-strand genes and
    reverse-strand reads only on reverse-strand genes, via
    calculateExonIntrons. All six options (-f, -r, -g, -u, -o, -s) are
    required; when one is missing the help text is printed and the program
    exits with status 1. It also exits with status 1 when the species is not
    listed in GenomeData.species_chroms.

    Files written:
        <outfile>                          header line (opt.outfile is also
                                           passed on to calculateExonIntrons,
                                           which presumably appends the
                                           per-gene rows - confirm there)
        <outfile>_shared_exons.pkl         {entrezID:[((start, end), read_count)]}
        <outfile>_shared_introns.pkl       {entrezID:[((start, end), read_count)]}
        <outfile>_merged_transcripts.pkl   merged-transcript read distributions
        <outfile>_summary.pkl              per-gene summary dict

    argv: command-line tokens handed to OptionParser.parse_args.
    """
    parser = OptionParser()
    parser.add_option(
        "-f",
        "--forwardreadfile",
        action="store",
        type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r",
        "--reversereadfile",
        action="store",
        type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-g",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)

    # Check the parsed options rather than the number of argv tokens, so that
    # "--outfile=x" style arguments work and a missing option is always caught.
    required_options = ("ReadsOnForwardStrand", "ReadsOnReverseStrand",
                        "fragment_size", "entrez_genes", "outfile", "species")
    if any(getattr(opt, name) is None for name in required_options):
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    ##################################################################3
    #The column numbers are 1 based instead of 0 based!
    #For positive strand
    start_index_P = 2
    #For negative strand
    start_index_N = 3
    ##################################################################3

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    # entrez_gene_collection is a EntrezGenes class object. The core is a
    # entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of
    # EntrezGene object
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only pass annotation files from a trusted source.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # test module; set test to 1 to enable the debugging hooks below
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print("%s %s" % (totalcount_F, totalcount_R))

    #Clear the file and write the header.
    outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t  Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
    with open(opt.outfile, 'w') as outf:
        outf.write(outline)

    # The RNA seq data are strand specific. Only use + reads on genes on forward strand, and - reads on genes on reverse strand.
    print("Process genes on forward strand")
    entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids(
        "+")
    print("%s %s %s" % ("There are ", len(entrez_ids_on_forward_strand),
                        " Entrez IDs on forward strand."))
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))

    (forward_reads_on_shared_exons, forward_reads_on_shared_introns,
     forward_reads_on_merged_transcripts,
     forward_summary) = calculateExonIntrons(entrez_gene_subset,
                                             opt.ReadsOnForwardStrand,
                                             start_index_P, chroms,
                                             opt.fragment_size, totalcount,
                                             opt.outfile)

    print("Process genes on reverse strand")
    entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids(
        "-")
    print("%s %s %s" % ("There are ", len(entrez_ids_on_reverse_strand),
                        " Entrez IDs on reverse strand."))
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))

    (reverse_reads_on_shared_exons, reverse_reads_on_shared_introns,
     reverse_reads_on_merged_transcripts,
     reverse_summary) = calculateExonIntrons(entrez_gene_subset,
                                             opt.ReadsOnReverseStrand,
                                             start_index_N, chroms,
                                             opt.fragment_size, totalcount,
                                             opt.outfile)

    # Combine both strands and store each result in its own pickle file.
    # {entrezID:[((start, end), read_count)]}
    reads_on_shared_exons = _merge_strands(forward_reads_on_shared_exons,
                                           reverse_reads_on_shared_exons)
    _dump_pickle(reads_on_shared_exons, opt.outfile + "_shared_exons.pkl")

    if test == 1:
        test_distribution_dic(reads_on_shared_exons, test_id)

    # {entrezID:[((start, end), read_count)]}
    reads_on_shared_introns = _merge_strands(forward_reads_on_shared_introns,
                                             reverse_reads_on_shared_introns)
    _dump_pickle(reads_on_shared_introns, opt.outfile + "_shared_introns.pkl")

    if test == 1:
        test_distribution_dic(reads_on_shared_introns, test_id)

    reads_on_merged_transcripts = _merge_strands(
        forward_reads_on_merged_transcripts,
        reverse_reads_on_merged_transcripts)
    _dump_pickle(reads_on_merged_transcripts,
                 opt.outfile + "_merged_transcripts.pkl")

    summary = _merge_strands(forward_summary, reverse_summary)
    _dump_pickle(summary, opt.outfile + "_summary.pkl")

    print("%s %s %s" % ("it took", time.time() - startTime, "seconds."))