def genbank_entries_from_accession(accessions, read_out=False, email='*****@*****.**'):
    """Look up the GI numbers of nucleotide records by accession.

    The accessions are posted with ``Entrez.epost`` and the matching records
    are then fetched as XML with ``Entrez.efetch`` using the returned
    ``WebEnv`` / ``QueryKey``.

    :param accessions: iterable of identifiers; each is converted with
        ``str`` and the values are comma-joined for the request.
    :param read_out: when exactly ``True``, print one summary line per
        fetched record (GI, primary accession, definition and the first 15
        characters of the sequence).
    :param email: value assigned to ``Entrez.email`` before any request.
    :return: list of integer GI numbers, one for every record that carries
        a parsable ``gi|<number>`` entry in ``GBSeq_other-seqids``.
    """
    import Entrez

    entries = []
    Entrez.email = email
    request = Entrez.epost('nucleotide', id=','.join(map(str, accessions)))
    result = Entrez.read(request)
    handle = Entrez.efetch(db='nucleotide',
                           retmode='xml',
                           webenv=result['WebEnv'],
                           query_key=result['QueryKey'])
    try:
        for r in Entrez.parse(handle):
            try:
                entry = [x for x in r['GBSeq_other-seqids'] if 'gi' in x][0]
                gi = int(entry.split('|')[1])
                entries.append(gi)
            except (IndexError, KeyError, ValueError):
                # No seqid containing 'gi', no seqid list at all, or a
                # malformed value: report the record without a GI instead
                # of aborting the whole batch.
                gi = None
            if read_out is True:
                print(">GI {0} {1} {2}\n{3}".format(gi,
                                                    r['GBSeq_primary-accession'],
                                                    r['GBSeq_definition'],
                                                    r['GBSeq_sequence'][:15]))
    finally:
        # Release the connection even if parsing stops early.
        handle.close()
    return entries
def erratum_check(PMID, comments):
    """
    Does this PMID have an erratum?

    I can't do the erratum formatting automatically.
    But I can warn user that there is an erratum.

    The PubMed record is fetched as XML and every entry of its
    ``CommentsCorrectionsList`` whose ``RefType`` attribute is
    ``'ErratumIn'`` adds a message to ``comments``: a long one for the first
    erratum and a short one for each later erratum.

    :param PMID: identifier passed to ``Entrez.efetch`` as ``id``.
    :param comments: list of messages; it is appended to in place.
    :return: the same ``comments`` list.
    """
    Entrez.email = app.config['EMAIL']
    handle = Entrez.efetch(db="pubmed", id=PMID, rettype="gb", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Close the handle even when the response cannot be parsed.
        handle.close()

    erratum_count = 0
    try:
        corrections = records['PubmedArticle'][0]['MedlineCitation'][
            'CommentsCorrectionsList']
        for correction in corrections:
            if correction.attributes['RefType'] == 'ErratumIn':
                if erratum_count == 0:
                    comments.append(
                        "I smell an erratum: " + correction['RefSource'] +
                        ". \nAdd it to the end of your citation: [Erratum in Journal, Issue(Volume): page. DOI: #. Accessed date.]"
                    )
                else:
                    # Is this the second (or later) erratum we're reporting
                    # for this article? Then shorten report.
                    comments.append("There's another erratum! What a mess." +
                                    correction['RefSource'])
                erratum_count += 1
    except (KeyError, IndexError, AttributeError, TypeError):
        # No errata (or a record without the expected structure)? Do nothing.
        # Only lookup-shaped failures are ignored, so unrelated errors are
        # no longer hidden by a bare except.
        pass
    return comments
def ncbi_search(self, database, term):
    """
    Submit search to NCBI and return the records.

    Runs ``Entrez.esearch`` with the history server enabled
    (``usehistory="y"``) and ``retmax=100000000``. The handle and the parsed
    result are also stored on the instance as ``self.handle`` and
    ``self.record``.

    :param database: value passed to esearch as ``db``.
    :param term: value passed to esearch as ``term``.
    :return: the result of ``Entrez.read`` on the search handle.
    """
    self.handle = Entrez.esearch(db=database,
                                 term=term,
                                 usehistory="y",
                                 retmax=100000000)
    try:
        self.record = Entrez.read(self.handle)
    finally:
        # Close the handle even when parsing the response fails.
        self.handle.close()
    return self.record
def ncbi_search(self, database, term):
    """
    Submit search to NCBI and return the records.

    Runs ``Entrez.esearch`` with the history server enabled
    (``usehistory="y"``), ``retmax=10`` and ``idtype="acc"``. The handle and
    the parsed result are also stored on the instance as ``self.handle`` and
    ``self.record``.

    :param database: value passed to esearch as ``db``.
    :param term: value passed to esearch as ``term``.
    :return: the result of ``Entrez.read`` on the search handle.
    """
    self.handle = Entrez.esearch(db=database,
                                 term=term,
                                 usehistory="y",
                                 retmax=10,
                                 idtype="acc")
    try:
        self.record = Entrez.read(self.handle)
    finally:
        # Close the handle even when parsing the response fails.
        self.handle.close()
    return self.record
def PMID_to_formatted_citation(PMID, comments):
    """
    Take a PMID, retrieve the PubMed data, and format it based on our style guide.

    The summary record is fetched with ``Entrez.esummary`` and each part of
    the citation is produced by a dedicated formatter (authors, title,
    journal, volume/issue, pages, DOI, accessed date). Finally the erratum
    check is run for the same PMID.

    :param PMID: PubMed ID
    :param comments: list of user-facing messages; formatters and error
        paths append to it.
    :return: ``(citation, comments)``; ``citation`` is ``''`` when the
        record could not be fetched.
    """
    Entrez.email = app.config['EMAIL']

    try:
        handle = Entrez.esummary(db="pubmed", id=PMID)
        try:
            record = Entrez.read(handle)[0]
        finally:
            # The record is fully parsed at this point, so the handle can be
            # released on both the success and the failure path.
            handle.close()
    except IOError:
        comments.append(
            "Is there a network problem? Unleash me please!")  # Network error
        return '', comments
    except Exception:
        comments.append("I can't fetch an article with that ID."
                        )  # DOI not on PubMed? Or Bad PMID?
        comments.append(
            "Try looking up the article on www.pubmed.gov. (Note: PubMed may not have the DOI.) If the article is there, copy its PMID and bring it to me. If the article isn't on PubMed, I can't fetch a citation for you. Sorry. :( "
        )
        return '', comments

    author_names = authors.format_authors(record)

    try:
        year = record['PubDate'][0:4]
    except Exception:
        comments.append("What year is it?")
        year = '20??'

    title, comments = article_title.format_title(record, PMID, comments)
    journal, comments = journal_name.format_journal(record, comments)
    vol_iss, comments = volume_issue.format_volume_issue(record, comments)
    pages, comments = page_numbers.format_pages(record, comments)
    DOI_value, comments = DOI.format_DOI(record, comments)
    accessed = accessed_date.format_accessed_date()

    # When there's no author: Title (Year)...
    if author_names == '':
        # endswith() is also safe for an empty title, where title[-1]
        # would raise IndexError.
        if title.endswith('.'):
            title = title[0:-1]
        citation = title + ' (' + year + '). ' + journal + vol_iss + pages + '. ' + DOI_value + accessed
    # Otherwise: Authors (Year). Title...
    else:
        citation = author_names + ' (' + year + '). ' + title + ' ' + journal + vol_iss + pages + '. ' + DOI_value + accessed

    comments = errata.erratum_check(PMID, comments)
    return citation, comments
def main(argv): parser = OptionParser() parser.add_option("-p", "--peakfile", action="store", type="string", dest="peakfile", help="input ucsc file for PA peaks ", metavar="<file>") parser.add_option("-u", "--annotationfile", action="store", type="string", dest="annotationfile", help="pickle file for annotations ", metavar="<file>") parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option("-s", "--species", action="store", type="string", dest="species",help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-t", "--peak_threshold", action="store", type="int", dest="peak_threshold",help="Peak threshold", metavar="<int>") parser.add_option("-d", "--3UTRdownstreamextension", action="store", type="int", dest="downstream_extension",help="3UTR down stream extension", metavar="<int>") (opt, args) = parser.parse_args(argv) if len(argv) < 12: parser.print_help() sys.exit(1) if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting"; sys.exit(1); # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes, a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: print "Testing gene structure" test_id = 54 Entrez.test_gene_structure(entrez_gene_collection, test_id) # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_unique_cdsEnd() print "There are ", len(entrez_ids_with_unique_cdsEnd), " Entrez IDs each of which has a unique cdsEnd." 
# Additional filter to remove clusters with intron-containing 3UTRs allowance=0 ids=entrez_ids_with_unique_cdsEnd entrez_ids_with_intronless_3UTRs = entrez_gene_collection.get_ids_with_intronless_3UTR(allowance, ids) print "There are %d Entrez_ids with additional requirement of intronless 3UTR: ", %(len(entrez_ids_with_intronless_3UTRs)) entrez_gene_subset = Entrez.KnownEntrezGenes(chroms, entrez_gene_collection.subset(entrez_ids_with_intronless_3UTRs)) peaks_on_entrez_3UTRs = AssignPeaksToEntrez3UTRs(entrez_gene_subset, opt.peakfile, chroms, chrom_lengths, opt.peak_threshold, opt.downstream_extension) output = open(libName + "_PA_Peaks_associated_with_Annotations.pkl", 'wb') pickle.dump(peaks_on_entrez_3UTRs, output) output.close() Calculate3UTRUsage(peaks_on_entrez_3UTRs, final_entrez_id_collection, opt.outfile)
def validate_and_convert_DOI_or_PMID_to_PMID(lookupID, comments):
    """
    Look up a DOI -- or PMID -- and return a PMID.

    The identifier is cleaned (leading whitespace, a leading ``DOI:`` /
    ``doi:`` / ``PMID:`` / ``pmid:`` tag and all spaces are removed) and
    searched on PubMed with ``Entrez.esearch``. Exactly one hit is required.

    :param lookupID: either a DOI or a PMID.
    :param comments: list of user-facing messages; appended to on failure.
    :return: ``(pmid, comments)``; ``pmid`` is ``''`` when nothing usable
        was found or the lookup failed.
    """
    # Format as string and strip any leading white spaces. Do now so we can reach DOI/PMIDs.
    lookupID = str(lookupID).lstrip()

    # Remove any prefacing text that might've come through. Only if it's at the start of the lookupID.
    preface_tags = ['DOI:', 'doi:', 'PMID:', 'pmid:']
    for tag in preface_tags:
        if lookupID.startswith(tag):
            # Slice off the leading tag only; a substitution over the whole
            # string would also delete the tag text inside the identifier.
            lookupID = lookupID[len(tag):]

    # Drop any white spaces that remain.
    lookupID = lookupID.replace(" ", "")

    try:
        Entrez.email = app.config['EMAIL']
        handle = Entrez.esearch(db="pubmed", retmax=10, term=lookupID)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when the response cannot be parsed.
            handle.close()

        if int(record['Count']) == 0:
            comments.append("I can't fetch an article with that ID."
                            )  # DOI not on PubMed? Or Bad PMID?
            comments.append(
                "Try looking up the article on www.pubmed.gov. (Note: PubMed may not have the DOI.) If the article is there, copy its PMID and bring it to me. If the article isn't on PubMed, I can't fetch a citation for you. Sorry. :( "
            )
            return '', comments
        elif int(record['Count']) > 1:
            comments.append(
                'I found more than one article. Are there characters missing from the ID?'
            )
            return '', comments
        else:
            # Only 1 result, perfect!
            return (record['IdList'][0]), comments
    except IOError:
        # Network error
        comments.append(
            "Is there a network problem? Unleash me please!")  # Network error
        return '', comments
    except Exception:
        comments.append("I can't fetch an article with ID " + lookupID +
                        '. Can you double check it?')  # Bad PMID?
        comments.append(
            "Try looking up the article on www.pubmed.gov. (Note: PubMed may not have the DOI.) If the article is there, copy its PMID and bring it to me. If the article isn't on PubMed, I can't fetch a citation for you. Sorry. :( "
        )
        return '', comments
def main(argv):
    """Write the RefSeq transcripts that belong to a list of Entrez ids.

    Reads Entrez ids from the first tab-separated column of the id file
    (lines matching ``comment`` are skipped), asks the pickled
    ``KnownEntrezGenes`` collection for their transcript ids and writes the
    matching subset of the RefSeq UCSC file to the output file.

    :param argv: argument list handed to ``OptionParser.parse_args``; the
        help text is printed and the script exits with status 1 when it has
        fewer than 8 entries.
    """
    parser = OptionParser()
    parser.add_option(
        "-r",
        "--refseqfile",
        action="store",
        type="string",
        dest="refseq_ucsc_file",
        help="input ucsc file for annotated genes, eg, refFlat_hg19_EntrezID.ucsc",
        metavar="<file>")
    parser.add_option("-i",
                      "--entrezIDfile",
                      action="store",
                      type="string",
                      dest="entrez_ids_file",
                      help="file for entrez ids",
                      metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_genes",
        metavar="<file>",
        help="file with curated known genes clustered by entrez ID in pickle format")
    # The long name used to be "--refseqfile" as well, which optparse
    # rejects with OptionConflictError as soon as the option is added.
    parser.add_option(
        "-o",
        "--refseq_subset_file",
        action="store",
        type="string",
        dest="refseq_subset_file",
        help="ucsc file for refseq transcripts belonging to those entrez_ids",
        metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
    # NOTE(review): chroms is not defined in this function and there is no
    # species option; it must come from module scope - confirm.
    # NOTE: pickle.load executes arbitrary code from the file; only use
    # trusted annotation files.
    annotation = open(opt.entrez_genes, 'rb')
    entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation))
    annotation.close()

    entrez_ids = []
    f = open(opt.entrez_ids_file, 'r')
    for line in f:
        if not comment.match(line):
            line = line.strip()
            sline = line.split('\t')
            entrez_ids.append(sline[0])
    f.close()

    refseq_ids = entrez_gene_collection.get_transcript_ids(entrez_ids)
    gene_set_manipulation.output_UCSCsubset_in_file(opt.refseq_ucsc_file,
                                                    refseq_ids,
                                                    opt.refseq_subset_file)
def record_processor(self, record, database):
    """
    Unpack an Entrez search record and download the matching accessions.

    The accessions are fetched from the history server in batches of
    10000 using the record's ``WebEnv`` and ``QueryKey``.

    Returns ``(count, IDs, webenv, query_key)``. When the search had no
    hits the process exits if ``self.gui == 0``; otherwise the message is
    emitted through ``self.no_match`` and ``None`` is returned.
    """
    total = int(record["Count"])  # Int
    history_env = record["WebEnv"]  # String
    history_key = record["QueryKey"]  # String

    accessions = []
    batch_start = 0
    while batch_start < total:
        batch = Entrez.efetch(db=database,
                              webenv=history_env,
                              query_key=history_key,
                              retmax=10000,
                              rettype="acc",
                              retstart=batch_start)
        accessions.extend([line.rstrip() for line in batch])
        batch.close()
        batch_start += 10000

    assert total == len(accessions)

    if total == 0:
        if self.gui == 0:
            sys.exit("Your serch query returned no results!")
        self.no_match.emit("Your serch query returned no results!")
        return None

    return total, accessions, history_env, history_key
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms, fragment_size, totalcount, out_file): lib_name = (bedfile).split('/')[-1] # remove directory suffix = lib_name.split('.')[-1] # txt lib_name = lib_name.split('.')[0] extension = "-" + lib_name +'.' + suffix +"1" if Utility_extended.fileExists(bedfile): if Utility_extended.chrom_files_exist(chroms, extension) != 1: # Separate by chrom and sort by start print chroms, extension, " files do not exist, separate by chroms. " Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension, [column_index]) else: print bedfile, " is not found"; sys.exit(1) all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]} all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]} all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]} all_summary = {} for chrom in chroms: chrombed = chrom + extension entrez_genes_by_chrom = Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom)) (reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) = calculateExonIntrons_by_chrom (entrez_genes_by_chrom, chrombed, fragment_size, totalcount, out_file) #if chrom == chroms[0]: #myid = reads_on_shared_exons.keys()[0] #test(entrez_genes_by_chrom, reads_on_shared_introns, myid) all_reads_on_shared_exons.update(reads_on_shared_exons) all_reads_on_shared_introns.update(reads_on_shared_introns) all_reads_on_merged_transcripts.update(reads_on_merged_transcripts) all_summary.update(summary) SeparateByChrom.cleanup(chroms, extension) return (all_reads_on_shared_exons, all_reads_on_shared_introns, all_reads_on_merged_transcripts, summary)
def retrieve_abstract(PMID):
    """Return the abstract of a PubMed article, or '' when it has none.

    The article is fetched in MEDLINE text format and parsed with
    ``Medline.read``; the abstract is the record's ``'AB'`` field.

    :param PMID: identifier passed to ``Entrez.efetch`` as ``id``.
    :return: the ``'AB'`` value of the record, or ``''`` if it is missing.
    """
    Entrez.email = app.config['EMAIL']
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=PMID)
    try:
        record = Medline.read(handle)
    finally:
        # Close the handle even when the response cannot be parsed.
        handle.close()
    try:
        abstract = record['AB']
    except (KeyError, TypeError):
        # Record without an abstract field.
        abstract = ''
    return abstract
def get_description(mail, ID):
    """Used by xml write for searching the definition of the hit using the
    accession number from NCBI which appears in the rapsearch output.

    Fetches the protein record as GenBank text and takes the second line of
    the response from column 12 onwards, with leading/trailing '.'
    characters removed.

    :param mail: value assigned to ``Entrez.email``.
    :param ID: identifier passed to ``Entrez.efetch`` as ``id``.
    :return: the extracted definition text.
    """
    Entrez.email = mail
    handle = Entrez.efetch(db="protein", id=ID, rettype="gb", retmode="text")
    try:
        entry = handle.read().strip()
    finally:
        # Close the handle even when reading the response fails.
        handle.close()
    complete = entry.split("\n")
    definition = complete[1][12:]
    definition2 = definition.strip(".")
    return definition2
def get_description(mail, ID):
    """Used by make description for searching the definition of the hit
    using the accession number from NCBI which appears in the annotation
    output. Adapted from the RapsearchToXml.py file.

    Fetches the protein record as GenBank text, takes the second line of
    the response from column 12 onwards and keeps only the part before the
    first '['.

    :param mail: value assigned to ``Entrez.email``.
    :param ID: identifier passed to ``Entrez.efetch`` as ``id``.
    :return: the extracted definition text.
    """
    Entrez.email = mail
    handle = Entrez.efetch(db="protein", id=ID, rettype="gb", retmode="text")
    try:
        entry = handle.read().strip()
    finally:
        # Close the handle even when reading the response fails.
        handle.close()
    complete = entry.split("\n")
    definition = complete[1][12:]  # get the definition field
    definition2 = definition.split("[")  # removes the species information
    return definition2[0]
def fetch_by_id(self, IDs, b_size):
    """
    Download FASTA text for an explicit set of IDs (no search query).

    ``IDs`` is sent as the efetch ``id`` argument and ``b_size`` as
    ``retmax``; the database is ``self.database``. Returns everything read
    from the response handle.
    """
    request = dict(db=self.database,
                   id=IDs,
                   rettype="fasta",
                   retmode="text",
                   retmax=b_size)
    fasta_handle = Entrez.efetch(**request)
    fasta_text = fasta_handle.read()
    fasta_handle.close()
    return fasta_text
def fetch_by_history(self, start, b_size, webenv, query_key):
    """
    Download FASTA text for a stored search on the history server.

    ``start`` is sent as ``retstart`` and ``b_size`` as ``retmax``;
    ``webenv`` and ``query_key`` identify the stored search. The database
    is ``self.database``. Returns everything read from the response handle.
    """
    request = dict(db=self.database,
                   retstart=start,
                   rettype="fasta",
                   retmode="text",
                   retmax=b_size,
                   webenv=webenv,
                   query_key=query_key)
    fasta_handle = Entrez.efetch(**request)
    fasta_text = fasta_handle.read()
    fasta_handle.close()
    return fasta_text
def runTest(self):
    """Check that the source sequence id contains the record's last EMBL xref.

    Reads the 'swiss' record from ``test.dat.bgz``, takes the part after the
    first ':' of the last cross-reference containing 'EMBL', and asserts it
    occurs in the id of the sequence returned by ``Entrez.get_source_seq``.
    """
    record = SeqIO.read(gzip.open('test.dat.bgz'), 'swiss')
    embl_ids = []
    for xref in record.dbxrefs:
        if 'EMBL' in xref:
            embl_ids.append(xref.split(':')[1])
    expected_id = embl_ids[-1]
    source_seq, feature = Entrez.get_source_seq(record)
    self.assertIn(expected_id, source_seq.id)
def main(argv): parser = OptionParser() parser.add_option("-r", "--readfile", action="store", type="string", dest="Reads", help="input bed file for non-strand specific raw reads", metavar="<file>") parser.add_option( "-g", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option( "-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help= "file with curated known genes clustered by entrez ID in pickle format" ) parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", metavar="<file>", help="output file name for genes and tag numbers") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") (opt, args) = parser.parse_args(argv) if len(argv) < 10: parser.print_help() sys.exit(1) startTime = time.time() if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_gene_collection is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: test_id = 54 Entrez.test_gene_structure(entrez_gene_collection, test_id) rawreadslibName1 = (opt.Reads).split('/')[-1] rawreadssuffix1 = rawreadslibName1.split('.')[-1] rawreadslibName1 = rawreadslibName1.split('.')[0] rawreadsextension1 = "-" + rawreadslibName1 + '.' 
+ rawreadssuffix1 + "1" totalcount = 0 if Utility_extended.fileExists(opt.Reads) == 1: totalcount = get_total_tag_counts.get_total_tag_counts(opt.Reads) else: # if the all file exist, then use the all file, otherwise use the chrom separated file for chrom in chroms: chrombed = chrom + rawreadsextension1 totalcount1 = get_total_tag_counts.get_total_tag_counts(chrombed) print chrom, totalcount1 totalcount += totalcount1 (reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) = calculate_non_strandspecific_rc_on_ExonIntrons( entrez_gene_collection, opt.Reads, chroms, opt.fragment_size) #Clear the file. outf = open(opt.outfile, 'w') outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n" outf.write(outline) for entrez_id in entrez_gene_collection.entrez_ids: gene = (entrez_gene_collection.entrez_genes)[entrez_id] gene_symbol = [] for transcript in gene.transcripts: if transcript.additional_annotations[0] not in gene_symbol: gene_symbol.append(transcript.additional_annotations[0]) outline = str(entrez_id) + '\t' + str( summary[entrez_id]["merged_exons_rc"] ) + '\t' + str( summary[entrez_id]["merged_exons_total_length"] ) + '\t' + str(summary[entrez_id]["merged_exon_RPKM"]) + '\t' + str( summary[entrez_id]["shared_exons_rc"] ) + '\t' + str( summary[entrez_id]["shared_exons_total_length"] ) + '\t' + str(summary[entrez_id]["shared_exon_RPKM"]) + '\t' + str( summary[entrez_id]["shared_introns_rc"] ) + '\t' + str( summary[entrez_id]["shared_introns_total_length"] ) + '\t' + str(summary[entrez_id]["shared_intron_RPKM"]) + '\t' + str( summary[entrez_id]["merged_transcript_rc"]) + '\t' + str( summary[entrez_id]["merged_transcript_length"]) + '\t' + str( 
summary[entrez_id] ["merged_transcript_RPKM"]) + '\t' + ','.join([ transcript.name for transcript in gene.transcripts ]) + '\t' + ','.join(gene_symbol) + '\n' outf.write(outline) outf.close() # {entrezID:[((start, end), read_count)]} name = opt.outfile + "_shared_exons.pkl" output = open(name, 'wb') pickle.dump(reads_on_shared_exons, output) output.close() # {entrezID:[((start, end), read_count)]} name = opt.outfile + "_shared_introns.pkl" output = open(name, 'wb') pickle.dump(reads_on_shared_introns, output) output.close() #store the info in a pickle file name = opt.outfile + "_merged_transcripts.pkl" output = open(name, 'wb') pickle.dump(reads_on_merged_transcripts, output) output.close() name = opt.outfile + "_summary.pkl" output = open(name, 'wb') pickle.dump(summary, output) output.close() print "it took", time.time() - startTime, "seconds."
def assign_AluElements_to_intronexons_by_chrom(my_entrez_genes, Alufile_by_chrom, chrom): """ entrez genes are made sure to be on one chrom, and the bed file are reads for that strand The raw read file needs to conform to bed format """ # Separate by chrom reads if Utility_extended.fileExists(Alufile_by_chrom) and ( chrom in my_entrez_genes.chroms): print chrom # set up a KnownEntrezGenes Instance for entrez_genes on this particular chrom entrez_genes_by_chrom = Entrez.KnownEntrezGenes( [chrom], my_entrez_genes.subset_by_chrom(chrom)) # load in the Alus on chrom Alus = RepElements.KnownRepElements.initiate_from_file( [chrom], Alufile_by_chrom) print "There are %d of elements on %s" % (Alus.number, chrom) # Use the mid point of an Alu Element to represent its position, each element is a tuple of (position, id) Alu_positions = [] for myid in Alus.rep_elements.keys(): element = Alus.rep_elements[myid] position = int((element.genoStart + element.genoEnd) / 2.0) Alu_positions.append((position, myid)) Alu_positions.sort(key=itemgetter(0)) shared_exon_Alus = {} #{entrezID:[(region, [Alu_positions])]} shared_intron_Alus = {} #{entrezID:[(region, [Alu_positions])]} merged_transcript_Alus = {} #{entrezID:[(region, [Alu_positions])]} if entrez_genes_by_chrom.num_genes > 0: for entrez_id in entrez_genes_by_chrom.entrez_ids: gene = (entrez_genes_by_chrom.entrez_genes)[entrez_id] shared_exons = gene.shared_exonic_regions #sorted in absolute coordinate if shared_exons == []: # No shared extrons shared_exon_Alus[entrez_id] = [] else: # element in result_list has structure: [((start, end), [Alu_positions in range])] result_list = Utility_extended.associate_tags_with_regions( Alu_positions, shared_exons ) #returns a list, [region, [Alu_positions]] shared_exon_Alus[entrez_id] = result_list shared_introns = gene.shared_intronic_regions #sorted if shared_introns == []: # No shared introns shared_intron_Alus[entrez_id] = [] else: # element in result_list has structure: [((start, end), 
[Alus in range])] result_list = Utility_extended.associate_tags_with_regions( Alu_positions, shared_introns ) #returns a list, [region, [Alu_elements]] shared_intron_Alus[entrez_id] = result_list merged_transcript = gene.boundaries #[(start, end)] # element in result_list has structure: [((start, end), [Alus in range])] result_list = Utility_extended.associate_tags_with_regions( Alu_positions, merged_transcript) merged_transcript_Alus[entrez_id] = result_list return (shared_intron_Alus, shared_exon_Alus, merged_transcript_Alus)
def main(argv): parser = OptionParser() parser.add_option( "-a", "--AluElementsFile", action="store", type="string", dest="Alus", help="input Alu annotation file for non-strand specific analysis", metavar="<file>") parser.add_option( "-u", "--entrez_genes_file", action="store", type="string", dest="entrez_collection", metavar="<file>", help= "file with curated known genes clustered by entrez ID in pickle format" ) parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", metavar="<file>", help="output file name for genes and tag numbers") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") (opt, args) = parser.parse_args(argv) if len(argv) < 8: parser.print_help() sys.exit(1) startTime = time.time() if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_collection is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_collection, 'rb') temp = pickle.load(annotation) my_entrez_genes = Entrez.KnownEntrezGenes(chroms, temp) annotation.close() #test entrez, checks out #id = my_entrez_genes.entrez_genes.keys()[0] #print id #for i in my_entrez_genes.entrez_genes[id].transcripts: #print i.getAll() lib_name = (opt.Alus).split('/')[-1] # remove directory suffix = lib_name.split('.')[-1] # txt lib_name = lib_name.split('.')[0] extension = "-" + lib_name + '.' + suffix + "1" if Utility_extended.fileExists(opt.Alus): if Utility_extended.chrom_files_exist(chroms, extension) != 1: # Separate by chrom and sort by start print chroms, extension, " files do not exist, separate by chroms. 
" SeparateByChrom.separateByChrom(chroms, opt.Alus, extension) else: print opt.Alus, " is not found" sys.exit(1) Alus_in_shared_intron = {} Alus_in_shared_exon = {} Alus_in_merged_transcript = {} for chrom in chroms: (shared_intron_Alus, shared_exon_Alus, merged_transcript_Alus) = assign_AluElements_to_intronexons_by_chrom( my_entrez_genes, chrom + extension, chrom) if chrom == chroms[0]: myid = shared_intron_Alus.keys()[0] test(my_entrez_genes, shared_intron_Alus, myid) Alus_in_shared_intron.update(shared_intron_Alus) Alus_in_shared_exon.update(shared_exon_Alus) Alus_in_merged_transcript.update(merged_transcript_Alus) #{entrezID:[(region=(start, end), Alu_count)]} Alus_in_shared_intron_dist = {} for myid in Alus_in_shared_intron.keys(): shared_intronic_regions_on_this_gene = Alus_in_shared_intron[myid] Alus_on_shared_intronic_regions_on_this_gene = [] for region in shared_intronic_regions_on_this_gene: region_coord, Alu_positions = region number_of_Alus = len(Alu_positions) Alus_on_shared_intronic_regions_on_this_gene.append( (region_coord, number_of_Alus)) Alus_in_shared_intron_dist[ myid] = Alus_on_shared_intronic_regions_on_this_gene outname = opt.outfile + "_Alu_distribution_in_shared_intron.pkl" output = open(outname, 'wb') pickle.dump(Alus_in_shared_intron_dist, output) print "The number of genes output to %s is %d " % ( outname, len(Alus_in_shared_intron.keys())) output.close() #total_intronic_regions = 0 #for myid in Alus_in_shared_intron.keys(): # total_intronic_regions += len(Alus_in_shared_intron[myid]) #print "There are %d genes with %d shared intronic regions " % (len(Alus_in_shared_intron.keys()), total_intronic_regions) #{entrezID:[(region, Alu_positions)]} outname = opt.outfile + "_Alus_in_shared_intron.pkl" output = open(outname, 'wb') pickle.dump(Alus_in_shared_intron, output) print "The number of genes output to %s is %d " % ( outname, len(Alus_in_shared_intron.keys())) output.close() #{entrezID:[(region, Alu_positions)]} outname = opt.outfile + 
"_Alus_in_shared_exon.pkl" output = open(outname, 'wb') pickle.dump(Alus_in_shared_exon, output) print "The number of genes output to %s is %d " % ( outname, len(Alus_in_shared_exon.keys())) output.close() #Though in this case the structure can be simpler: {entrezID:(region, Alu_count)}, it is better to make the interface uniform.{entrezID:[(region, Alu_count)]} Alus_in_merged_transcript_dist = {} for myid in Alus_in_merged_transcript.keys(): assert len(Alus_in_merged_transcript[myid]) == 1 region = (Alus_in_merged_transcript[myid])[0] region_coord, Alu_positions = region number_of_Alus = len(Alu_positions) Alus_in_merged_transcript_dist[myid] = [(region_coord, number_of_Alus)] outname = opt.outfile + "_Alu_distribution_in_merged_transcript.pkl" output = open(outname, 'wb') pickle.dump(Alus_in_merged_transcript_dist, output) print "The number of genes output to %s is %d " % ( outname, len(Alus_in_merged_transcript.keys())) output.close() #{entrezID:[(region, Alu_positions)]} outname = opt.outfile + "_Alus_in_merged_transcript.pkl" output = open(outname, 'wb') pickle.dump(Alus_in_merged_transcript, output) print "The number of genes output to %s is %d " % ( outname, len(Alus_in_merged_transcript.keys())) output.close() print "it took", time.time() - startTime, "seconds."
def Calculate3UTRUsage(entrez_genes, bedfile, column_index, chroms, fragment_size, downstream_extension, outfile):
    """
    Compute 3'UTR read usage per gene and append one line per gene to outfile.

    entrez genes are made sure to be on one strand, the bed file are reads for that strand
    entrez_genes is a KnownEntrezGenes class object
    The raw read file needs to conform to bed format
    column_index: column in bed file for sorting

    For every chromosome present in ``entrez_genes.chroms`` whose
    per-chromosome read file exists, the read positions are loaded (and
    sorted if needed). For each gene the union of its 3'UTRs (extended by
    ``downstream_extension``) is built; genes whose union has more than one
    interval are only reported on stdout. Otherwise the reads inside the
    union are collected, ``CUTR_vs_AUTR`` is evaluated and a tab-separated
    line (entrez id, union length, RUD, read count, transcript names, gene
    symbols) is appended to ``outfile``.

    Exits with status 1 when ``bedfile`` does not exist.
    """
    # Separate reads by chrom
    # Per-chromosome files are named <chrom>-<library>.<suffix>1
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile, rawreadsextension1, [column_index])
    else:
        print bedfile, " is not found"
        sys.exit(1)

    # Here the output is 'a': results are appended, so any header already in
    # the file is kept.
    outf = open(outfile, 'a')
    for chrom in chroms:
        if chrom in entrez_genes.chroms:
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            # this_chrom_length = chrom_lengths[chrom]
            # Get the read locations
            if Utility_extended.fileExists(chrom + rawreadsextension1):
                f = open(chrom + rawreadsextension1, 'r')
                tag_positions = []
                for line in f:
                    line = line.strip()
                    sline = line.split()
                    tag_positions.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
                # Sort only when needed; the association below is handed the
                # sorted list.
                if not Utility_extended.is_list_sorted(tag_positions):
                    tag_positions.sort()
                f.close()

                # NOTE(review): the gene loop is placed inside the
                # file-exists branch because tag_positions is only defined
                # there; the original indentation was lost - confirm.
                for entrez_id in entrez_genes_by_chrom.entrez_ids:
                    gene = entrez_genes_by_chrom.entrez_genes[
                        entrez_id]  # an EntrezGene class object
                    three_UTRs = gene.get_3UTRs(downstream_extension)
                    print three_UTRs
                    union = Utility_extended.union(
                        three_UTRs
                    )  # Find the union of 3UTRs [(start, end)], returns a [(start,end)]
                    if len(union) > 1:
                        # Disjoint 3UTRs: report the gene and write no line.
                        print "There are disjoint 3UTRs in %s" % (
                            str(entrez_id))
                    else:
                        # returns [((start, end), [tag_positions])], [tag_positions] = return[0][1]
                        inside_reads = (Utility_extended.
                                        associate_simple_tags_with_regions(
                                            tag_positions, union))[0][1]
                        total_read_count = len(inside_reads)
                        RUD = CUTR_vs_AUTR(three_UTRs, inside_reads,
                                           gene.strand)

                        ## For the set of genes, use the distal 3UTR at the designated representative 3UTR
                        #myindex = Calculate3UTRUsageIndexFromCuratedGenes.find_distal_3UTR(genes)
                        #gene = genes[myindex]
                        #results = ThreeUTRCharacteristics(gene, inside_reads)

                        # Unique gene symbols in first-seen order.
                        gene_symbol = []
                        for mytranscript in gene.transcripts:
                            if mytranscript.additional_annotations[
                                    0] not in gene_symbol:
                                gene_symbol.append(
                                    mytranscript.additional_annotations[0])

                        # Inclusive length of the single union interval.
                        union_length = union[0][1] - union[0][0] + 1
                        outline = str(entrez_id) + "\t" + str(
                            union_length) + "\t" + str(RUD) + "\t" + str(
                                total_read_count) + "\t" + ','.join([
                                    transcript.name
                                    for transcript in gene.transcripts
                                ]) + "\t" + ','.join(gene_symbol) + "\n"
                        outf.write(outline)
    outf.close()
def main(argv): parser = OptionParser() parser.add_option( "-f", "--forwardreadfile", action="store", type="string", dest="ReadsOnForwardStrand", help="input bed file for RNASeq raw reads on forward strand", metavar="<file>") parser.add_option( "-r", "--reversereadfile", action="store", type="string", dest="ReadsOnReverseStrand", help="input bed file for RNASeq raw reads on reverse strand", metavar="<file>") parser.add_option( "-u", "--entrez_genes_file", action="store", type="string", dest="entrez_genes", metavar="<file>", help= "file with curated known genes clustered by entrez ID in pickle format" ) parser.add_option( "-g", "--fragment_size", action="store", type="int", dest="fragment_size", help= "fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps", metavar="<int>") parser.add_option("-o", "--outfile", action="store", type="string", dest="outfile", help="outfile name", metavar="<file>") parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>") parser.add_option("-d", "--3UTRdownstreamextension", action="store", type="int", dest="downstream_extension", help="3UTR down stream extension", metavar="<int>") (opt, args) = parser.parse_args(argv) if len(argv) < 14: parser.print_help() sys.exit(1) startTime = time.time() allowance = 10 if opt.species in GenomeData.species_chroms.keys(): chroms = GenomeData.species_chroms[opt.species] chrom_lengths = GenomeData.species_chrom_lengths[opt.species] else: print "This species is not recognized, exiting" sys.exit(1) # entrez_gene_collection is a KnownEntrezGenes class object. 
The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object annotation = open(opt.entrez_genes, 'rb') entrez_gene_collection = Entrez.KnownEntrezGenes(chroms, pickle.load(annotation)) annotation.close() # test module test = 0 if test == 1: print "Testing gene structure" test_id = 54 Entrez.test_gene_structure(entrez_gene_collection, test_id) # Filter cluster of refseq_ids (keyed by entrez_id) according to the criterion of identical cdsEnd entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd( ) print "There are ", len(entrez_ids_with_unique_cdsEnd ), " Entrez IDs each of which has a unique cdsEnd." #get total read count totalcount_F = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnForwardStrand) totalcount_R = get_total_tag_counts.get_total_tag_counts( opt.ReadsOnReverseStrand) totalcount = totalcount_F + totalcount_R print totalcount_F, totalcount_R #Clear the file and write the first line, needs to be modified outf = open(opt.outfile, 'w') #outline = "# Entrez ID \t Main Refseq ID \t 3UTR union length \t Length Index \t PA Multiplicity Index \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n" outline = "# Entrez ID \t 3UTR Union length \t RUD \t 3UTR Read Count \t RefSeq IDs \t Gene symbols \n" outf.write(outline) outf.close() #index: column in bed file for sorting index = 2 print "Process genes on forward strand" entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids( "+", entrez_ids_with_unique_cdsEnd) print "There are ", len( entrez_ids_on_forward_strand), " Entrez IDs on forward strand." 
entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnForwardStrand, index, chroms, opt.fragment_size, opt.downstream_extension, opt.outfile) print "Process genes on reverse strand" entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids( "-", entrez_ids_with_unique_cdsEnd) print "There are ", len( entrez_ids_on_reverse_strand), " Entrez IDs on reverse strand." entrez_gene_subset = Entrez.KnownEntrezGenes( chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand)) Calculate3UTRUsage(entrez_gene_subset, opt.ReadsOnReverseStrand, index, chroms, opt.fragment_size, opt.downstream_extension, opt.outfile) print "it took", time.time() - startTime, "seconds."
def main(argv):
    """Combine RNA-Seq read densities with Alu distributions for selected genes.

    Parses ``argv``, loads the intron/exon read densities, the Entrez gene
    annotation and the Alu distributions from pickle files, reads the ids of
    interest, and passes everything to ``combine_iri_Alu_genic_level`` and
    ``combine_iri_Alu_intron_level``.

    Args:
        argv: command-line tokens handed to ``OptionParser.parse_args``.

    Exits with status 1 when fewer than 14 tokens are given, when the species
    is not in ``GenomeData.species_chroms``, or when no id file is supplied.
    """
    parser = OptionParser()
    parser.add_option("-u", "--annotation_pickle_file", action="store",
                      type="string", dest="annotation", metavar="<file>",
                      help="annotation for strand information")
    parser.add_option(
        "-a", "--rnaseq_intron_pickle_file", action="store", type="string",
        dest="RNA_Seq_intron_pickle", metavar="<file>",
        help="read densities for individual shared intronic regions in pickle format")
    parser.add_option(
        "-b", "--rnaseq_exon_pickle_file", action="store", type="string",
        dest="RNA_Seq_exon_pickle", metavar="<file>",
        help="read densities for individual shared exonic regions in pickle format")
    parser.add_option(
        "-d", "--alu_distribution_intron_pkl", action="store", type="string",
        dest="alu_distribution_intron_pkl", metavar="<file>",
        help="Alu densities for individual shared intronic regions in pickle format")
    parser.add_option(
        "-e", "--alu_distribution_in_merged_transcript_pkl", action="store",
        type="string", dest="alu_distribution_in_merged_transcript_pkl",
        metavar="<file>",
        help="Alu counts for transcript regions in pickle format")
    parser.add_option("-f", "--ids", action="store", type="string",
                      dest="id_subset_file", metavar="<file>",
                      help="file that records ids of interest", default="")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    # Seven option/value pairs are expected, hence 14 tokens.
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms:
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    print("Read in RNASeq information")
    # read in from intron pickle resulted in {entrezID:[rpkms]}
    intron_read_density_distribution = readin_RNASeq(opt.RNA_Seq_intron_pickle)
    # read in from exon pickle resulted in {entrezID:[rpkms]}
    exon_read_density_distribution = readin_RNASeq(opt.RNA_Seq_exon_pickle)

    # read in annotation file for strand and gene cluster information
    print("Read in annotation")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.annotation, 'rb') as annotation:
        # The collection is built but not used further in this function.
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # read in the Alus
    print("Read in Alu information")
    Alu_distribution_on_introns = readin_Alu(opt.alu_distribution_intron_pkl)
    Alu_distribution_on_transcripts = readin_Alu(
        opt.alu_distribution_in_merged_transcript_pkl)

    print("Load in ids")
    if opt.id_subset_file == "":
        # Without an id file there is no id_set to hand to the combine steps;
        # stop with a clear message instead of failing on an unbound name.
        print("No id file given via -f/--ids, exiting")
        sys.exit(1)

    id_set = []
    with open(opt.id_subset_file, 'r') as id_file:
        for line in id_file:
            if comment.match(line):
                continue
            line = line.strip()
            if not line:
                continue  # a blank line carries no id
            # The id is the first tab-separated column.
            id_set.append(int(line.split('\t')[0]))
    print("There are %d ids in %s" % (len(id_set), opt.id_subset_file))

    pc = 0.000000001
    exon_read_density_cutoff = 1

    combine_iri_Alu_genic_level(intron_read_density_distribution,
                                exon_read_density_distribution,
                                Alu_distribution_on_transcripts, id_set,
                                exon_read_density_cutoff)
    combine_iri_Alu_intron_level(intron_read_density_distribution,
                                 exon_read_density_distribution,
                                 Alu_distribution_on_introns, id_set,
                                 exon_read_density_cutoff, pc)
def main(argv):
    """Run ``CalculateExonIntrons`` on the Alu files of both strands and pickle the results.

    Parses ``argv``, loads the Entrez gene annotation pickle, writes the header
    line to the output file, calls ``CalculateExonIntrons`` once per strand,
    merges the two per-chromosome results and stores them in
    ``<outfile>_shared_exon_RPKMS.pkl`` and ``<outfile>_shared_intron_RPKMS.pkl``.

    Args:
        argv: command-line tokens handed to ``OptionParser.parse_args``.

    Exits with status 1 when fewer than 10 tokens are given or the species is
    not in ``GenomeData.species_chroms``.
    """
    parser = OptionParser()
    parser.add_option("-f", "--forwardalufile", action="store", type="string",
                      dest="AlusOnForwardStrand",
                      help="input file for Alus on forward strand",
                      metavar="<file>")
    parser.add_option("-r", "--reversealufile", action="store", type="string",
                      dest="AlusOnReverseStrand",
                      help="input file for Alus on reverse strand",
                      metavar="<file>")
    parser.add_option(
        "-u", "--entrez_genes_file", action="store", type="string",
        dest="entrez_genes", metavar="<file>",
        help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms:
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # entrez_genes is a EntrezGenes class object. The core is
    # entrez_genes.entrez_genes, a dic (keyed by entrez_id) of lists of
    # EntrezGene objects.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_genes = Entrez.EntrezGenes(chroms, pickle.load(annotation))

    # test module
    Entrez.test(entrez_genes)

    # Clear the file and write the header line.
    outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
    with open(opt.outfile, 'w') as outf:
        outf.write(outline)

    # NOTE(review): entrez_genes_on_forward_strand,
    # entrez_genes_on_reverse_strand and totalcount are not assigned anywhere
    # in this function, so they must come from module scope. No option with
    # dest "fragment_size" is registered above, so opt.fragment_size raises
    # AttributeError. Confirm the intended inputs before relying on this path.

    # The data are strand specific. Only use + entries on genes on forward
    # strand, and - entries on genes on reverse strand.
    print("Process genes on forward strand")
    # The forward call uses the registered forward-strand option, mirroring
    # the reverse call below.
    (forward_shared_exon_count,
     forward_shared_intron_count) = CalculateExonIntrons(
         entrez_genes_on_forward_strand, opt.AlusOnForwardStrand, chroms,
         opt.fragment_size, totalcount, opt.outfile)

    print("Process genes on reverse strand")
    (reverse_shared_exon_count,
     reverse_shared_intron_count) = CalculateExonIntrons(
         entrez_genes_on_reverse_strand, opt.AlusOnReverseStrand, chroms,
         opt.fragment_size, totalcount, opt.outfile)

    # combine the densities
    shared_exon_count = {}
    shared_intron_count = {}
    merge_plan = (
        (shared_exon_count, forward_shared_exon_count,
         reverse_shared_exon_count),
        (shared_intron_count, forward_shared_intron_count,
         reverse_shared_intron_count),
    )
    for combined, forward, reverse in merge_plan:
        for chrom in chroms:
            if chrom in forward:
                combined[chrom] = forward[chrom]
            if chrom in reverse:
                if chrom in combined:
                    combined[chrom].update(reverse[chrom])
                else:
                    # A chromosome with reverse-strand data only has no
                    # forward entry to update.
                    combined[chrom] = reverse[chrom]

    # store the info in pickle files
    with open(opt.outfile + "_shared_exon_RPKMS.pkl", 'wb') as output:
        pickle.dump(shared_exon_count, output)
    with open(opt.outfile + "_shared_intron_RPKMS.pkl", 'wb') as output:
        pickle.dump(shared_intron_count, output)

    print("it took %s seconds." % (time.time() - startTime))
def main(argv):
    """Drive the 3'UTR usage (RUD) calculation for both strands.

    Parses ``argv``, loads the Entrez gene annotation pickle, keeps the Entrez
    ids returned by ``get_ids_with_unique_cdsEnd``, writes the header line to
    the output file and calls ``Calculate3UTRUsage`` once with the forward
    strand reads and once with the reverse strand reads.

    Args:
        argv: command-line tokens handed to ``OptionParser.parse_args``.

    Exits with status 1 when fewer than 14 tokens are given or the species is
    not in ``GenomeData.species_chroms``.
    """
    parser = OptionParser()
    parser.add_option("-f", "--forwardreadfile", action="store", type="string",
                      dest="ReadsOnForwardStrand",
                      help="input bed file for RNASeq raw reads on forward strand",
                      metavar="<file>")
    parser.add_option("-r", "--reversereadfile", action="store", type="string",
                      dest="ReadsOnReverseStrand",
                      help="input bed file for RNASeq raw reads on reverse strand",
                      metavar="<file>")
    parser.add_option("-u", "--entrez_genes_file", action="store",
                      type="string", dest="entrez_genes", metavar="<file>",
                      help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="outfile name", metavar="<file>")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-p", "--PAfile", action="store", type="string",
                      dest="PAfile", help="input bed3 file", metavar="<file>")
    # The option is parsed as an int, so the placeholder says <int>.
    parser.add_option("-e", "--extension", action="store", type="int",
                      dest="extension",
                      help="integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end",
                      metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    # Seven option/value pairs are expected, hence 14 tokens.
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    # Passed to Calculate3UTRUsage as its threshold argument.
    allowance = 10

    if opt.species in GenomeData.species_chroms:
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # test module (developer switch, off by default)
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 79947
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter cluster of refseq_ids (keyed by entrez_id) according to the
    # criterion of identical cdsEnd
    entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd()
    print("There are  %d  Entrez IDs each of which has a unique cdsEnd."
          % len(entrez_ids_with_unique_cdsEnd))

    # get total read count (reported only)
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    print("%s %s" % (totalcount_F, totalcount_R))

    # Clear the file and write the first line (RUD output layout).
    header_columns = ["# Entrez ID", "Chrom", "Strand", "Basic_RUD",
                      "List_of_subRUDs"]
    with open(opt.outfile, 'w') as outf:
        outf.write("\t".join(header_columns) + "\n")

    # index: column in bed file for sorting
    index = 2

    strand_jobs = (
        ("+", "forward", opt.ReadsOnForwardStrand),
        ("-", "reverse", opt.ReadsOnReverseStrand),
    )
    for strand, label, reads_file in strand_jobs:
        print("Process genes on %s strand" % label)
        strand_ids = entrez_gene_collection.get_strand_specific_ids(
            strand, entrez_ids_with_unique_cdsEnd)
        print("There are  %d  Entrez IDs on %s strand."
              % (len(strand_ids), label))
        entrez_gene_subset = Entrez.KnownEntrezGenes(
            chroms, entrez_gene_collection.subset(strand_ids))
        Calculate3UTRUsage(entrez_gene_subset, reads_file, chroms,
                           opt.outfile, allowance, opt.PAfile, opt.extension,
                           index)

    print("it took %s seconds." % (time.time() - startTime))
def main(argv):
    """Drive the 3'UTR usage (RUD) calculation for both strands.

    Parses ``argv``, loads the Entrez gene annotation pickle, keeps the Entrez
    ids returned by ``get_ids_with_unique_cdsEnd``, writes the header line to
    the output file and calls ``Calculate3UTRUsage`` once with the forward
    strand reads and once with the reverse strand reads.

    Args:
        argv: command-line tokens handed to ``OptionParser.parse_args``.

    Exits with status 1 when fewer than 14 tokens are given or the species is
    not in ``GenomeData.species_chroms``.
    """
    parser = OptionParser()
    parser.add_option(
        "-f", "--forwardreadfile", action="store", type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r", "--reversereadfile", action="store", type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-u", "--entrez_genes_file", action="store", type="string",
        dest="entrez_genes", metavar="<file>",
        help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="outfile name", metavar="<file>")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-p", "--PAfile", action="store", type="string",
                      dest="PAfile", help="input bed3 file", metavar="<file>")
    # The option is parsed as an int, so the placeholder says <int>.
    parser.add_option(
        "-e", "--extension", action="store", type="int", dest="extension",
        help="integer value denoting how far downstream the program should look for polyadenylation sites past the Entrez given 3'UTR end",
        metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    # Seven option/value pairs are expected, hence 14 tokens.
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    # Passed to Calculate3UTRUsage as its threshold argument.
    allowance = 10

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    # entrez_gene_collection is a KnownEntrezGenes class object.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # test module (developer switch, off by default)
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 79947
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    # Filter cluster of refseq_ids (keyed by entrez_id) according to the
    # criterion of identical cdsEnd
    entrez_ids_with_unique_cdsEnd = entrez_gene_collection.get_ids_with_unique_cdsEnd()
    print("There are  %d  Entrez IDs each of which has a unique cdsEnd."
          % len(entrez_ids_with_unique_cdsEnd))

    # get total read count (reported only)
    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    print("%s %s" % (totalcount_F, totalcount_R))

    # Clear the file and write the first line (RUD output layout).
    outline = "\t".join(["# Entrez ID", "Chrom", "Strand", "Basic_RUD",
                         "List_of_subRUDs"]) + "\n"
    with open(opt.outfile, 'w') as outf:
        outf.write(outline)

    # index: column in bed file for sorting
    index = 2

    strand_jobs = (
        ("+", "forward", opt.ReadsOnForwardStrand),
        ("-", "reverse", opt.ReadsOnReverseStrand),
    )
    for strand, label, reads_file in strand_jobs:
        print("Process genes on %s strand" % label)
        strand_ids = entrez_gene_collection.get_strand_specific_ids(
            strand, entrez_ids_with_unique_cdsEnd)
        print("There are  %d  Entrez IDs on %s strand."
              % (len(strand_ids), label))
        entrez_gene_subset = Entrez.KnownEntrezGenes(
            chroms, entrez_gene_collection.subset(strand_ids))
        Calculate3UTRUsage(entrez_gene_subset, reads_file, chroms,
                           opt.outfile, allowance, opt.PAfile, opt.extension,
                           index)

    print("it took %s seconds." % (time.time() - startTime))
def _read_polyA_sites(PAfile):
    """Return the first two tab-separated columns of each PAfile line as string pairs."""
    sites = []
    with open(PAfile, 'r') as pa_file:
        for line in pa_file:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                continue  # blank or truncated line: no site to record
            sites.append((fields[0], fields[1]))
    return sites


def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold,
                       PAfile, extension, index):
    """Append one RUD line per multi-polyA-site gene to ``outfile``.

    entrez_genes are made sure to be on one strand; the bed file holds the
    reads for that strand.

    Args:
        entrez_genes: a KnownEntrezGenes class object.
        bedfile: raw read file; needs to conform to bed format. It is split
            into per-chromosome files when those do not exist yet.
        chroms: chromosome names to process.
        outfile: output path, opened in append mode so the header written by
            the caller is kept.
        threshold: passed through to ``CUTR_vs_AUTR``.
        PAfile: tab-separated file of polyadenylation sites; the first two
            columns of each line are used.
        extension: passed through to ``Mastertuplemaker``.
        index: column in the bed file used for sorting.

    Exits with status 1 when ``bedfile`` is not found.
    """
    # Separate reads by chrom
    rawreadslibName1 = bedfile.split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s  is not found" % bedfile)
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s  files do not exist, separate by chroms and sort each file according to the second column. "
              % (chroms, rawreadsextension1))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                rawreadsextension1, str(index))

    # This part is to access the polyadenylation sites
    PAsiteslist = _read_polyA_sites(PAfile)

    # Here the output is 'a', i.e. the output is appended to an existing file
    # instead of creating one
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))

            # Get the read locations
            chrom_read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(chrom_read_file):
                continue
            with open(chrom_read_file, 'r') as reads:
                # make sure the extension is always 0, otherwise the rest of
                # the program might not work as intended
                tag_positions = [
                    associate_tags_with_regions.tag_position(
                        line.strip().split(), 0)
                    for line in reads
                ]
            if not Utility_extended.is_list_sorted(tag_positions):
                tag_positions.sort()
            # By this point tag_positions is a sorted list of all the reads
            # located on the strand and chromosome currently being handled.

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                # get_3UTRs gets the ENTREZ 3'UTR, which appears to generally
                # give the beginning of the 3'UTR and a site very close to the
                # most distal polyadenylation site
                three_UTRs = gene.get_3UTRs()
                # Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites
                # given to create the 3'UTR data needed by CUTR_vs_AUTR
                (true3UTRstarts, true3UTRends, UTRregion_start,
                 UTRregion_end, UTRbeginning) = Mastertuplemaker(
                     three_UTRs, PAsiteslist, chrom, gene.strand, extension)

                # only 3'UTRs with more than 1 polyA site need be considered
                if len(true3UTRends) <= 1:
                    continue

                # find all reads inside the 3'UTR
                inside_reads = associate_tags_with_3UTR(
                    tag_positions, UTRregion_start, UTRregion_end)
                # finds reads in each region of the 3'UTR and calculates
                # aUTR/cUTR for each of them; PolyAsites is kept for the
                # alternative polyA output layout
                RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(
                    true3UTRstarts, true3UTRends, inside_reads, gene.strand,
                    threshold)

                # Collected for optional gene-symbol output; not written in
                # the RUD layout below.
                gene_symbol = []
                for mytranscript in gene.transcripts:
                    symbol = mytranscript.additional_annotations[0]
                    if symbol not in gene_symbol:
                        gene_symbol.append(symbol)

                # outline to use to output RUDs
                outline = "\t".join([
                    str(entrez_id), chrom, gene.strand, str(basic_RUD),
                    ",".join(map(str, RUDs)),
                ]) + "\n"
                # Alternative layout for polyA information:
                # entrez_id, chrom, strand, UTRbeginning, PolyAsites.
                outf.write(outline)
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms,
                         fragment_size, totalcount, out_file=None):
    """Collect exon/intron read statistics chromosome by chromosome.

    entrez_genes is a EntrezGenes class object whose core,
    entrez_genes.entrez_genes, is a dic (keyed by entrez_id) of lists of
    EntrezGene objects. The bed file is split into per-chromosome files when
    needed, ``calculateExonIntrons_by_chrom`` is run for every chromosome
    present in ``entrez_genes.chroms``, and the per-chromosome files are
    cleaned up afterwards.

    Returns a 4-tuple:
        all_reads_on_shared_exons        {entrezID:[((start, end), read_count)]}
        all_reads_on_shared_introns      {entrezID:[((start, end), read_count)]}
        all_reads_on_merged_transcripts  {entrezID:[((start, end), read_count)]}
        all_summary                      {entrezID:{attribute:value}} with the
            attributes merged_exons_rc, merged_exon_RPKM,
            merged_exons_total_length, shared_exons_rc, shared_exon_RPKM,
            shared_exons_total_length, shared_introns_rc, shared_intron_RPKM,
            shared_introns_total_length, merged_transcript_rc,
            merged_transcript_RPKM, merged_transcript_length

    Exits with status 1 when ``bedfile`` is not found.
    """
    # Per-chromosome file suffix: "-<name>.<suffix>1", directory removed.
    name_parts = bedfile.split('/')[-1].split('.')
    extension = "-" + name_parts[0] + '.' + name_parts[-1] + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s  is not found" % bedfile)
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, extension) != 1:
        # Separate by chrom and sort by start
        print("%s %s  files do not exist, separate by chroms. "
              % (chroms, extension))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension,
                                                [column_index])

    all_reads_on_shared_exons = {}
    all_reads_on_shared_introns = {}
    all_reads_on_merged_transcripts = {}
    all_summary = {}

    for chrom in chroms:
        if chrom not in entrez_genes.chroms:
            continue
        genes_on_chrom = Entrez.KnownEntrezGenes(
            [chrom], entrez_genes.subset_by_chrom(chrom))
        per_chrom = calculateExonIntrons_by_chrom(
            genes_on_chrom, chrom + extension, fragment_size, totalcount,
            out_file)
        exon_reads, intron_reads, transcript_reads, chrom_summary = per_chrom

        all_reads_on_shared_exons.update(exon_reads)
        all_reads_on_shared_introns.update(intron_reads)
        all_reads_on_merged_transcripts.update(transcript_reads)
        all_summary.update(chrom_summary)
        # Running count of genes summarised so far.
        print(len(all_summary))

    SeparateByChrom.cleanup(chroms, extension)
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)
def AssignPeaksToEntrez3UTRs(entrez_genes, peakfile, chroms, chrom_lengths,
                             peak_threshold, downstream_extension):
    """Assign peaks to the representative 3'UTR of each Entrez gene.

    Returns {entrez_id:(gene, ThreeUTR_length, peaks_on_3UTR)}
        gene: entrez_genes_by_chrom.entrez_genes[entrez_id]
        ThreeUTR_length: longest 3UTR length; length includes the downstream
            extension
        peaks_on_3UTR: [(location, read_count)]

    Args:
        entrez_genes: gene collection providing ``chroms`` and
            ``subset_by_chrom``.
        peakfile: peak file, assumed to have the pseudo ucsc format. Per line
            the strand is read from column 2, the location from column 3
            (forward) or column 4 (reverse) and the read count from column 10
            (0-based, whitespace separated).
        chroms: chromosome names to process.
        chrom_lengths: mapping from chromosome name to its length.
        peak_threshold: minimum read count for a peak to be kept.
        downstream_extension: number of bases added past the transcript end.

    Genes whose representative transcript has a strand matching neither
    ``plus`` nor ``minus`` are skipped. Exits with status 1 when ``peakfile``
    is not found.
    """
    # store the peaks for each 3UTR of the entrez cluster.
    peaks_on_entrez_3UTRs = {}

    if not Utility_extended.fileExists(peakfile):
        print("%s  is not found" % peakfile)
        sys.exit(1)

    # Split the peak file into one file per chromosome.
    island_libName1 = peakfile.split('/')[-1]
    island_suffix1 = island_libName1.split('.')[-1]
    island_libName1 = island_libName1.split('.')[0]
    island_extension1 = "-" + island_libName1 + '.' + island_suffix1 + "1"
    SeparateByChrom.separateByChrom(chroms, peakfile, island_extension1)

    for chrom in chroms:
        if chrom not in entrez_genes.chroms:
            continue
        entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
            [chrom], entrez_genes.subset_by_chrom(chrom))
        this_chrom_length = chrom_lengths[chrom]

        # Load in the PA peak information
        chrom_peak_file = chrom + island_extension1
        if not Utility_extended.fileExists(chrom_peak_file):
            continue

        five_peaks = []   # peaks on forward strand, (location, read_count)
        three_peaks = []  # peaks on reverse strand, (location, read_count)
        with open(chrom_peak_file, 'r') as inf:
            for line in inf:
                sline = line.split()
                if not sline:
                    continue  # blank line
                strand = sline[2]
                if plus.match(strand):
                    location_column, strand_peaks = 3, five_peaks
                elif minus.match(strand):
                    location_column, strand_peaks = 4, three_peaks
                else:
                    continue
                read_count = float(sline[10])
                if read_count >= peak_threshold:
                    strand_peaks.append(
                        (int(sline[location_column]), read_count))

        # sort according to location so bisect can be used below
        five_peaks = sorted(five_peaks, key=itemgetter(0))
        five_peaks_location = [item[0] for item in five_peaks]
        three_peaks = sorted(three_peaks, key=itemgetter(0))
        three_peaks_location = [item[0] for item in three_peaks]

        for entrez_id in entrez_genes_by_chrom.entrez_ids:
            # an EntrezGene class object
            gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
            # For the set of transcripts, use the longest 3UTR as the
            # designated representative 3UTR (a UCSC class object)
            transcript_with_longest_3UTR = gene.identify_transcript_with_longest_3UTR()
            strand = transcript_with_longest_3UTR.strand

            if plus.match(strand):
                start = transcript_with_longest_3UTR.cdsEnd
                end = min(
                    transcript_with_longest_3UTR.txEnd + downstream_extension,
                    this_chrom_length)
                strand_peaks = five_peaks
                strand_locations = five_peaks_location
            elif minus.match(strand):
                start = max(
                    transcript_with_longest_3UTR.txStart - downstream_extension,
                    0)
                end = transcript_with_longest_3UTR.cdsStart
                strand_peaks = three_peaks
                strand_locations = three_peaks_location
            else:
                # Unknown strand: do not reuse the previous gene's region.
                continue

            start_ind = bisect.bisect_left(strand_locations, start)
            end_ind = bisect.bisect_right(strand_locations, end)
            peaks_on_3UTR = strand_peaks[start_ind:end_ind]

            # length includes the downstream extension
            ThreeUTR_length = end - start + 1
            peaks_on_entrez_3UTRs[entrez_id] = (gene, ThreeUTR_length,
                                                peaks_on_3UTR)

    SeparateByChrom.cleanup(chroms, island_extension1)
    return peaks_on_entrez_3UTRs
def _dump_pickle(obj, path):
    """Pickle obj to path, closing the file even if pickling fails."""
    with open(path, 'wb') as output:
        pickle.dump(obj, output)


def main(argv):
    """Compute exon/intron read statistics for both strands and pickle them.

    Parses ``argv``, loads the Entrez gene annotation pickle, writes the
    header line to the output file, runs ``calculateExonIntrons`` once for the
    forward strand genes and reads and once for the reverse strand, merges the
    two results and stores them in ``<outfile>_shared_exons.pkl``,
    ``<outfile>_shared_introns.pkl``, ``<outfile>_merged_transcripts.pkl`` and
    ``<outfile>_summary.pkl``.

    Args:
        argv: command-line tokens handed to ``OptionParser.parse_args``.

    Exits with status 1 when fewer than 12 tokens are given or the species is
    not in ``GenomeData.species_chroms``.
    """
    parser = OptionParser()
    parser.add_option(
        "-f", "--forwardreadfile", action="store", type="string",
        dest="ReadsOnForwardStrand",
        help="input bed file for RNASeq raw reads on forward strand",
        metavar="<file>")
    parser.add_option(
        "-r", "--reversereadfile", action="store", type="string",
        dest="ReadsOnReverseStrand",
        help="input bed file for RNASeq raw reads on reverse strand",
        metavar="<file>")
    parser.add_option(
        "-g", "--fragment_size", action="store", type="int",
        dest="fragment_size",
        help="fragment_size determines the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option(
        "-u", "--entrez_genes_file", action="store", type="string",
        dest="entrez_genes", metavar="<file>",
        help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    # Six option/value pairs are expected, hence 12 tokens.
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    # The column numbers are 1 based instead of 0 based!
    start_index_P = 2  # for positive strand
    start_index_N = 3  # for negative strand

    if opt.species in GenomeData.species_chroms:
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # entrez_gene_collection is a KnownEntrezGenes class object.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(opt.entrez_genes, 'rb') as annotation:
        entrez_gene_collection = Entrez.KnownEntrezGenes(
            chroms, pickle.load(annotation))

    # test module (developer switch, off by default)
    test = 0
    if test == 1:
        print("Testing gene structure")
        test_id = 54
        Entrez.test_gene_structure(entrez_gene_collection, test_id)

    totalcount_F = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnForwardStrand)
    totalcount_R = get_total_tag_counts.get_total_tag_counts(
        opt.ReadsOnReverseStrand)
    totalcount = totalcount_F + totalcount_R
    print("%s %s" % (totalcount_F, totalcount_R))

    # Clear the file and write the header line.
    outline = "# Entrez ID \t Merged Exon Read Count \t Merged Exon Length \t Merged Exon RPKM \t Shared Exon Read Count \t Shared Exon Length \t Shared Exon RPKM \t Shared Intron Read Count \t Share Intron Length \t Shared Intron RPKM \t Merged Transcript Read Count \t Merged Transcript Length \t Merged Transcript RPKM \t RefSeq IDs \t Gene Symbols \n"
    with open(opt.outfile, 'w') as outf:
        outf.write(outline)

    # The RNA seq data are strand specific. Only use + reads on genes on
    # forward strand, and - reads on genes on reverse strand.
    print("Process genes on forward strand")
    entrez_ids_on_forward_strand = entrez_gene_collection.get_strand_specific_ids("+")
    print("There are  %d  Entrez IDs on forward strand."
          % len(entrez_ids_on_forward_strand))
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_forward_strand))
    (forward_reads_on_shared_exons, forward_reads_on_shared_introns,
     forward_reads_on_merged_transcripts,
     forward_summary) = calculateExonIntrons(
         entrez_gene_subset, opt.ReadsOnForwardStrand, start_index_P, chroms,
         opt.fragment_size, totalcount, opt.outfile)

    print("Process genes on reverse strand")
    entrez_ids_on_reverse_strand = entrez_gene_collection.get_strand_specific_ids("-")
    print("There are  %d  Entrez IDs on reverse strand."
          % len(entrez_ids_on_reverse_strand))
    entrez_gene_subset = Entrez.KnownEntrezGenes(
        chroms, entrez_gene_collection.subset(entrez_ids_on_reverse_strand))
    (reverse_reads_on_shared_exons, reverse_reads_on_shared_introns,
     reverse_reads_on_merged_transcripts,
     reverse_summary) = calculateExonIntrons(
         entrez_gene_subset, opt.ReadsOnReverseStrand, start_index_N, chroms,
         opt.fragment_size, totalcount, opt.outfile)

    # combine the densities and store each result in its own pickle file
    # {entrezID:[((start, end), read_count)]}
    reads_on_shared_exons = {}
    reads_on_shared_exons.update(forward_reads_on_shared_exons)
    reads_on_shared_exons.update(reverse_reads_on_shared_exons)
    _dump_pickle(reads_on_shared_exons, opt.outfile + "_shared_exons.pkl")
    if test == 1:
        test_distribution_dic(reads_on_shared_exons, test_id)

    # {entrezID:[((start, end), read_count)]}
    reads_on_shared_introns = {}
    reads_on_shared_introns.update(forward_reads_on_shared_introns)
    reads_on_shared_introns.update(reverse_reads_on_shared_introns)
    _dump_pickle(reads_on_shared_introns, opt.outfile + "_shared_introns.pkl")
    if test == 1:
        test_distribution_dic(reads_on_shared_introns, test_id)

    reads_on_merged_transcripts = {}
    reads_on_merged_transcripts.update(forward_reads_on_merged_transcripts)
    reads_on_merged_transcripts.update(reverse_reads_on_merged_transcripts)
    _dump_pickle(reads_on_merged_transcripts,
                 opt.outfile + "_merged_transcripts.pkl")

    summary = {}
    summary.update(forward_summary)
    summary.update(reverse_summary)
    _dump_pickle(summary, opt.outfile + "_summary.pkl")

    print("it took %s seconds." % (time.time() - startTime))