def load_genbank(seqfile):
    """Load single-record GenBank file.

    Args:
        seqfile: Path of the GenBank file to read.

    Returns:
        The record object returned by ``GenBank.FeatureParser().parse``.
    """
    parser = GenBank.FeatureParser()
    # The context manager guarantees the handle is closed even when the
    # parser raises on a malformed file.
    # NOTE: the 'U' mode flag was removed in Python 3.11; on Python 3 a plain
    # 'r' already applies universal-newline handling.
    with open(seqfile, 'rU') as input_handle:
        return parser.parse(input_handle)
def pLonk(plasmids):
    """Measure each plasmid sequence and collect the lengths.

    Args:
        plasmids: Iterable of ``(pName, seq_infile, offset, order)`` tuples.
            ``order`` is unpacked but not used by this function.

    Returns:
        A tuple ``(pLenks, pLen_MAX, pLasmids)`` where ``pLenks`` is the list
        of sequence lengths sorted in descending order, ``pLen_MAX`` is the
        largest length, and ``pLasmids`` is a list of
        ``(pName, seq_infile, length, offset)`` tuples with ``length`` and
        ``offset`` converted to ``int``.

    Raises:
        ValueError: If a FASTA-style file yields no sequence record.
        IndexError: If no length was collected at all (empty input, or the
            first file has an unrecognized format).
    """
    pLenks = []
    pLasmids = []
    for (pName, seq_infile, offset, order) in plasmids:
        # Reset per plasmid so a file without records cannot silently reuse
        # the length measured for the previous plasmid.
        pLen = None
        # 'with' closes the file on every exit path, including the 'break'
        # taken for an unrecognized format.
        with open(seq_infile, 'r') as fhandle:
            # evaluate file name to detect format using Quixote [ filename ]
            seq_format = Quixote(seq_infile)
            if seq_format == 'genbank':
                gb_entry = GenBank.FeatureParser().parse(fhandle)
                pLen = len(gb_entry.seq)
                print("%s %s" % (pName, pLen))
            elif seq_format == 'fasta' or seq_format == 'seq':
                # With several records in the file, the last one wins.
                for fa_entry in SeqIO.parse(fhandle, "fasta"):
                    pLen = len(fa_entry.seq)
            else:
                print("TERMINAL ERROR : file format not recognized for "
                      + pName + " !!!")
                break
        if pLen is None:
            raise ValueError("no sequence record found in " + str(seq_infile))
        pLenks.append(pLen)
        pLasmids.append((pName, seq_infile, int(pLen), int(offset)))
    pLenks.sort(reverse=True)
    pLen_MAX = pLenks[0]
    return pLenks, pLen_MAX, pLasmids
def setUp(self):
    """Prepare a fresh "biosql-test" sub-database and a GenBank record iterator.

    Stores the new sub-database on ``self.db`` and an iterator over the
    records of ``GenBank/cor6_6.gb`` on ``self.iterator``.
    """
    # Build TESTDB before connecting to it.
    create_database()

    target_name = "biosql-test"
    connection_settings = dict(driver=DBDRIVER, user=DBUSER,
                               passwd=DBPASSWD, host=DBHOST, db=TESTDB)
    server = BioSeqDatabase.open_database(**connection_settings)

    # Drop a left-over sub-database of the same name; a KeyError from the
    # lookup simply means there is nothing to remove.
    try:
        server[target_name]
        server.remove_database(target_name)
    except KeyError:
        pass
    self.db = server.new_database(target_name)

    # The handle is deliberately left open: the iterator reads from it
    # lazily after setUp returns.
    gb_path = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
    gb_handle = open(gb_path, "r")
    feature_parser = GenBank.FeatureParser()
    self.iterator = GenBank.Iterator(gb_handle, feature_parser)
def search(self):
    """Run a PubMed search and fill ``self.items`` with the hits.

    Does nothing unless ``self.database`` is ``'PubMed'``.  For every id
    returned by ``PubMed.search_for`` the Medline record and the GenBank
    record are fetched and wrapped in a ``DBItem`` stored under that id.
    """
    if self.database != 'PubMed':
        return

    from Bio import PubMed
    from Bio import GenBank
    hit_ids = PubMed.search_for(self.searchTerm, max_ids=self.maxResults)

    genbank_lookup = GenBank.NCBIDictionary(
        self.type, 'genbank', parser=GenBank.FeatureParser())

    from Bio import Medline
    medline_lookup = PubMed.Dictionary(delay=1.0,
                                       parser=Medline.RecordParser())

    for hit_id in hit_ids:
        # Medline is queried before GenBank for each hit.
        medline_record = medline_lookup[hit_id]
        genbank_record = genbank_lookup[hit_id]
        self.items[hit_id] = DBItem(self.project,
                                    seq=genbank_record.seq,
                                    descript=genbank_record.description,
                                    id=hit_id,
                                    record=medline_record)
def plot_unique_genome_diagram(gbk, unique_loci):
    """Draw a circular diagram of the CDS features in a GenBank file.

    CDS features whose first ``locus_tag`` qualifier is in ``unique_loci``
    are coloured "#93341F"; all other CDS features are coloured "#058F45".
    Every CDS feature has its ``strand`` set to 1 before it is added.

    Args:
        gbk: Path of the GenBank file; also used as the diagram name.
        unique_loci: Container of locus tags to highlight.

    Returns:
        The drawn ``GenomeDiagram.Diagram``.

    Raises:
        KeyError: If a CDS feature has no ``locus_tag`` qualifier.
    """
    # Close the file even when parsing fails.
    with open(gbk, 'r') as fhandle:
        genbank_entry = GenBank.FeatureParser().parse(fhandle)

    gdd = GenomeDiagram.Diagram(gbk)
    gd_track_for_features = gdd.new_track(
        1, name="CDS", scale_smalltick_interval=100000)
    gdfs = gd_track_for_features.new_set()

    for feature in genbank_entry.features:
        if feature.type != 'CDS':
            continue
        feature.strand = 1
        if feature.qualifiers['locus_tag'][0] in unique_loci:
            gdfs.add_feature(feature, color=rcolors.HexColor("#93341F"))
        else:
            gdfs.add_feature(feature, color=rcolors.HexColor("#058F45"))

    gdd.draw(format='circular', orientation='landscape', tracklines=0,
             pagesize='A5', fragments=5, circular=1)
    return gdd
def BaseDraw(plasmids):
    """Draw the baseline, label and annotation features of each plasmid.

    Relies on names defined outside this function: the drawing helpers
    ``BaseL``, ``LabeL``, ``Off7``, ``ORFeus``, ``Snippit``, ``IRFlag`` and
    ``Quixote``, and the settings ``pNs``, ``dBL``, ``canvas_main``, ``u``,
    ``bFont``, ``NfSize``, ``da``, ``osym`` and ``SFX``.

    Args:
        plasmids: Iterable of ``(pName, seq_infile, pLen, offset)`` tuples.
    """
    ordN = 0
    for (pName, seq_infile, pLen, offset) in plasmids:
        # set Y axis once and for all for the plasmid being processed
        y0 = (pNs - ordN) * dBL  # starts from the top
        pLeni = int(pLen)
        print("offset %s" % (offset,))
        offset = int(offset)
        # Start every plasmid at zero so the summary line below never reads
        # an unbound or stale count when the file is not GenBank.
        ORFcnt = 0
        # draw plasmid baseline
        BaseL(ordN, pName, pLeni, y0, canvas_main)
        # label the baseline with plasmid name and size
        LabeL(ordN, pName, pLeni, y0, canvas_main)
        # evaluate file name to detect format using Quixote [filename]
        seq_format = Quixote(seq_infile)
        # mark up sequence origin if there is an offset
        if offset < -1 or offset > 1:
            Zs, _direction = Off7(1, pLeni, offset)
            xs = Zs * u
            canvas_main.setFont(bFont, NfSize)
            canvas_main.drawString(xs, y0 + da / 2, osym)
        # filter and draw annotation features
        if seq_format == 'genbank':
            # load GB file to filter features
            parser = GenBank.FeatureParser()
            # 'with' closes the file even if parsing or drawing fails
            with open(seq_infile, 'r') as fhandle:
                gb_entry = parser.parse(fhandle)
                for feature in gb_entry.features:
                    if feature.type == 'CDS' or feature.type == 'cds':
                        # draw CDS using ORFeus
                        ORFcnt += 1
                        ORFeus(feature, pLeni, offset, y0, ORFcnt)
                    elif SFX == 'on':
                        if feature.type == 'SNP':
                            # draw asterisk at feature location
                            Snippit(feature, pLeni, offset, y0)
                        if feature.type == 'IR':
                            # draw flag at feature location
                            IRFlag(feature, pLeni, offset, y0)
                        # need other functions for other features
                        # (with conditional, default switch off)
            print(" got a GenBank-style file for " + pName + " with "
                  + str(ORFcnt) + " ORFs")
        else:
            # no features so just skip this step
            print(" got a non-genbank-style file for " + pName
                  + "; no features to draw")
        # increment plasmid ordinal count
        ordN = ordN + 1
        print(" " + pName + " (" + str(pLeni) + " bp) drawn with "
              + str(ORFcnt) + " ORFs")
    print(" OK")
def loadData(self, data, dbtype): if (dbtype == "GenBank"): # get the GenBank file we are going to put into it parser = GenBank.FeatureParser() iterator = GenBank.Iterator(data, parser) # finally put it in the database try: self.getDatabase().load(iterator) except: self.getBioSQLRoot().getDBServer().adaptor.conn.rollback() return traceback.format_exc() self.getBioSQLRoot().getDBServer().adaptor.conn.commit() return "" else: raise "Unknown dbtype: %r" % (dbtype)
def t_cleaning_features():
    """Test the ability to clean up feature values.

    Parses the first record of ``GenBank/arab1.gb`` with a
    ``FeatureValueCleaner`` attached and checks that the translation
    qualifier of the second feature contains no spaces or newlines.
    """
    gb_parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # The context manager closes the file even when parsing fails; the
    # original only closed it after both assertions had passed.
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, gb_parser)
        first_record = next(iterator)

    # test for cleaning of translation
    translation_feature = first_record.features[1]
    test_trans = translation_feature.qualifiers["translation"][0]
    assert " " not in test_trans, "Did not clean spaces out of the translation"
    assert "\012" not in test_trans, \
        "Did not clean newlines out of the translation"
def t_cleaning_features():
    """Test the ability to clean up feature values.

    Parses the first record of ``GenBank/arab1.gb`` with a
    ``FeatureValueCleaner`` attached and checks that the translation
    qualifier of the second feature contains no spaces or newlines.
    """
    parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # Close the file deterministically; it was previously never closed.
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, parser)
        # The next() builtin works on Python 2.6+ and Python 3, unlike the
        # Python-2-only iterator.next() method.
        first_record = next(iterator)

    # test for cleaning of translation
    translation_feature = first_record.features[1]
    test_trans = translation_feature.qualifiers["translation"][0]
    assert test_trans.find(" ") == -1, \
        "Did not clean spaces out of the translation"
    assert test_trans.find("\012") == -1, \
        "Did not clean newlines out of the translation"
def load_database(gb_handle):
    """Load a GenBank file into a BioSQL database.

    This is useful for running tests against a newly created database.

    Args:
        gb_handle: Open handle on the GenBank data to load.  The caller
            remains responsible for closing it.
    """
    create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER,
                                          passwd=DBPASSWD, host=DBHOST,
                                          db=TESTDB)
    try:
        db = server.new_database(db_name)
        # get the GenBank file we are going to put into it
        parser = GenBank.FeatureParser()
        iterator = GenBank.Iterator(gb_handle, parser)
        # finally put it in the database
        db.load(iterator)
        server.adaptor.conn.commit()
    finally:
        # Always release the connection, even when the load or commit fails.
        server.adaptor.conn.close()
# don't test dbsource_wrap because it is a junky RefSeq file files_to_parse = [] for file in test_files: files_to_parse.append(os.path.join(gb_file_dir, file)) # parse the bioperl test files # comment this out for now -- there are a bunch of junky records in here # that no longer exist in GenBank -- do we really need to support those? # files_to_parse = [os.path.join(os.getcwd(), 'GenBank', 'bioperl_test.gb')] # parse the biojava test files # files_to_parse += [os.path.join(os.getcwd(), 'GenBank', 'biojava_test.gb')] # test the parsers feature_parser = GenBank.FeatureParser(debug_level=0) record_parser = GenBank.RecordParser(debug_level=0) all_parsers = [feature_parser, record_parser] print("Testing parsers...") for parser in all_parsers: for filename in files_to_parse: if not os.path.isfile(filename): print("Missing test input file: %s" % filename) continue handle = open(filename, 'r') iterator = GenBank.Iterator(handle, parser) while True: with warnings.catch_warnings():
from Bio import GenBank
from BioSQL import BioSeqDatabase

# Time how long it takes to load a GenBank file into a BioSQL sub-database.
server = BioSeqDatabase.open_database(host="192.168.0.192", user="******",
                                      passwd="", db="pythonloadtest")

# remove the database if it already exists
db_name = "testload"
try:
    server[db_name]
    server.remove_database(db_name)
except KeyError:
    pass
db = server.new_database(db_name)

input_file = "/home/hack/install/biopython/Tests/GenBank/cor6_6.gb"
# Keep the file open only for the duration of the load and close it even if
# the load fails.
with open(input_file, "r") as handle:
    parser = GenBank.FeatureParser()
    iterator = GenBank.Iterator(handle, parser)

    # -- do the timing part
    start_time = time.time()
    num_records = db.load(iterator)
    end_time = time.time()

elapsed_time = end_time - start_time
# A coarse clock can report zero elapsed time for a tiny input; avoid the
# resulting ZeroDivisionError.
if elapsed_time > 0:
    records_per_second = float(num_records) / float(elapsed_time)
else:
    records_per_second = float("inf")

print("Loading")
print("\tDid %s records in %s seconds for\n\t%f records per second" %
      (num_records, elapsed_time, records_per_second))
org = rec.annotations.get('organism', '') date = rec.annotations.get('date', '') head = '>gi:%s, id:%s, org:%s, date:%s\n' % (gi, rec.id, org, date) body = '\n'.join(textwrap.wrap(rec.seq.data, width=80)) return head, body if __name__ == '__main__': mode = sys.argv[1] text = sys.argv[2] output_file = sys.argv[3] print('Searching for %s <br>' % text) # check if inputs are all numbers try: gi_list = text.split() [int(_) for _ in gi_list] except ValueError: gi_list = GenBank.search_for(text, max_ids=10) fp = open(output_file, 'wt') record_parser = GenBank.FeatureParser() ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank', parser=record_parser) for gid in gi_list: res = ncbi_dict[gid] head, body = make_fasta(res) fp.write(head + body + '\n') print(head) fp.close()
def run(self):
    """Search for ``self.query_string`` and store the chosen GenBank record.

    A query that starts with ``GI:`` or ``gi:`` has that prefix stripped and
    is searched as is.  Any other query is first searched with a
    "mycobacterium phage" prefix and a "Hatfull GF[AUTH]" filter, then with
    progressively looser queries while ``self.search`` returns nothing.
    When ``self.allowRefSeqs`` is false every query also carries
    "NOT srcdb_refseq[prop]".

    If several ids come back they are listed and the user is asked (via
    ``raw_input``) to pick one by its 1-based number.  The selected record
    is fetched through ``GenBank.NCBIDictionary`` and stored on
    ``self.result``.

    Outcomes other than a fetched record:
      * ``self.result = None`` when RefSeqs are refused and nothing else
        was found.
      * ``self.result = 0`` when the selection ends up as -1 (the user
        typed 0).
      * ``self.results`` (plural) is set to the empty id list when nothing
        was found at all.
    """
    if not self.allowRefSeqs:
        print 'NOT ALLOWING REFSEQS'
        if self.query_string.startswith(
                'GI:') or self.query_string.startswith('gi:'):
            # strip the 3-character "GI:" prefix and search the bare id
            self.query_string = self.query_string[3:]
            q = self.query_string
            gi_list = self.search(q)
        else:
            q = "mycobacterium phage " + self.query_string + " AND Hatfull GF[AUTH] NOT srcdb_refseq[prop]"
            print "search query:", q
            gi_list = self.search(q)
            print 'gi_list:', gi_list
            if len(gi_list) == 0:
                # retry without the "mycobacterium phage" prefix
                print 'Got no results. Changing search criteria'
                q = self.query_string + " AND Hatfull GF[AUTH] NOT srcdb_refseq[prop]"
                print "search query:", q
                gi_list = self.search(q)
                if len(gi_list) == 0:
                    # retry without the author filter as well
                    print 'Got no results. Changing search criteria'
                    q = self.query_string + " NOT srcdb_refseq[prop]"
                    print "search query:", q
                    gi_list = self.search(q)
                    if len(gi_list) != 0:
                        print 'found GenBank Direct Submission(s)'
                        print gi_list
                    else:
                        print 'found no results other than refSeq(s), which you refused'
                        self.result = None
                        return
    else:  # allowing refSeqs
        print 'ALLOWING REFSEQS'
        if self.query_string.startswith(
                'GI:') or self.query_string.startswith('gi:'):
            # strip the 3-character "GI:" prefix and search the bare id
            self.query_string = self.query_string[3:]
            q = self.query_string
            gi_list = self.search(q)
        else:
            q = "mycobacterium phage " + self.query_string + " AND Hatfull GF[AUTH]"
            print "search query:", q
            gi_list = self.search(q)
            if len(gi_list) == 0:
                # retry without the "mycobacterium phage" prefix
                q = self.query_string + " AND Hatfull GF[AUTH]"
                gi_list = self.search(q)
                if len(gi_list) == 0:
                    # last resort: the raw query string with no filters
                    print 'Got no results. Changing search criteria'
                    print 'search query:', self.query_string
                    gi_list = self.search(self.query_string)
    if len(gi_list) == 0:
        print 'no results found'
        # NOTE(review): this writes self.results (plural) while every other
        # exit path writes self.result -- possibly a typo; confirm.
        self.results = gi_list
        return
    if len(gi_list) > 1:
        selection = -1
        for i in range(len(gi_list)):
            print i + 1, '\t', gi_list[i]
        selection = raw_input(
            "Your search returned multiple results. Please type the number for your selection: "
        )
        # convert the 1-based answer to a 0-based index; an answer of 0
        # becomes -1 and is handled below
        selection = int(selection) - 1
    else:
        selection = 0
    print 'creating parser...'
    feature_parser = GenBank.FeatureParser()
    print 'creating dict'
    ncbi_dict = GenBank.NCBIDictionary('nucleotide',
                                       'genbank',
                                       parser=feature_parser)
    if selection == -1:  ## Accounts for non-existent phage query
        print 'non-existent phage query'
        self.result = 0
    else:
        print 'got result'
        # indexing the dictionary fetches and parses the selected record
        self.result = ncbi_dict[gi_list[selection]]
def loadDB(catalog):
    """Load the non-plasmid records of each GenBank file into BioSQL.

    Prompts for the database user name and password, connects to the
    "rnammer" database and, for every record whose description does not
    mention "plasmid", loads it into the sub-database named after one of its
    taxonomy entries: entry 2 when entry 1 is "Proteobacteria", entry 1
    otherwise.  A missing sub-database is created on demand.

    Args:
        catalog: Iterable of GenBank file paths.
    """
    from BioSQL import BioSeqDatabase
    username = raw_input("Please enter user name: ")
    password = raw_input("and password: ")
    host = "dbpg-ifi-utv.uio.no"
    db_name = "rnammer"
    server = BioSeqDatabase.open_database(driver="psycopg2", user=username,
                                          passwd=password, host=host,
                                          db=db_name)
    biodb_name = "empty"  # GenBank problem? check the spelling
    gi_rep = 1
    for gbff in catalog:
        print(gi_rep)
        print(gbff)
        parser = GenBank.FeatureParser()
        # Close each GenBank file once its records have been consumed.
        with open(gbff) as gb_handle:
            records = GenBank.Iterator(gb_handle, parser)
            for x in records:
                if re.search("plasmid", x.description, re.IGNORECASE):
                    continue
                print("Record name:")
                print(x.id)
                taxonomy = x.annotations["taxonomy"]
                if "Proteobacteria" == taxonomy[1]:
                    print(taxonomy[1])
                    print(taxonomy[2])
                    biodb_name = taxonomy[2]
                else:
                    print(taxonomy[1])
                    biodb_name = taxonomy[1]
                # Look the sub-database up, creating it first when missing.
                while True:
                    try:
                        db = server[biodb_name]
                        break
                    except KeyError:
                        server.new_database(biodb_name)
                        server.commit()
                db.load([x])
        gi_rep = gi_rep + 1
        server.adaptor.commit()