def check_config(dbdriver, dbtype, dbhost, dbuser, dbpasswd, testdb):
    """Verify the database settings work for connecting.

    Stores the supplied settings in the module-level globals, checks the
    DB-API driver module is importable, then attempts (and closes) a
    trial connection.

    Raises MissingExternalDependencyError if the driver is missing or
    the connection fails.
    """
    global DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB, DBSCHEMA, SQL_FILE
    DBDRIVER = dbdriver
    DBTYPE = dbtype
    DBHOST = dbhost
    DBUSER = dbuser
    DBPASSWD = dbpasswd
    TESTDB = testdb

    # Check the database driver is installed:
    try:
        __import__(DBDRIVER)
    except ImportError:
        message = "Install %s if you want to use %s with BioSQL " % (DBDRIVER, DBTYPE)
        raise MissingExternalDependencyError(message)

    try:
        # sqlite3 only needs a file path; the other drivers need credentials.
        if DBDRIVER in ["sqlite3"]:
            server = BioSeqDatabase.open_database(driver=DBDRIVER, db=TESTDB)
        else:
            server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                                  user=DBUSER,
                                                  passwd=DBPASSWD,
                                                  host=DBHOST)
        server.close()
        del server
    except Exception as e:  # fix: was Python 2 "except Exception, e" syntax
        message = "Connection failed, check settings if you plan to use BioSQL: %s" % str(e)
        raise MissingExternalDependencyError(message)
def check_config(dbdriver, dbtype, dbhost, dbuser, dbpasswd, testdb):
    """Verify the database settings work for connecting.

    Stores the settings in module globals, checks the driver (or, under
    Jython, the JDBC driver class) is importable, makes a trial
    connection, and locates the matching SQL schema file.

    Raises MissingExternalDependencyError on any missing prerequisite.
    """
    global DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB, DBSCHEMA
    global SYSTEM, SQL_FILE
    DBDRIVER = dbdriver
    DBTYPE = dbtype
    DBHOST = dbhost
    DBUSER = dbuser
    DBPASSWD = dbpasswd
    TESTDB = testdb

    if not DBDRIVER or not DBTYPE or not DBUSER:
        # No point going any further...
        raise MissingExternalDependencyError("Incomplete BioSQL test settings")

    # Check the database driver is installed:
    if SYSTEM == "Java":
        # Jython uses JDBC driver classes rather than Python DB-API modules.
        try:
            if DBDRIVER in ["MySQLdb"]:
                import com.mysql.jdbc.Driver
            elif DBDRIVER in ["psycopg2", "pgdb"]:
                import org.postgresql.Driver
        except ImportError:
            message = "Install the JDBC driver for %s to use BioSQL " % DBTYPE
            raise MissingExternalDependencyError(message)
    else:
        try:
            __import__(DBDRIVER)
        except ImportError:
            if DBDRIVER in ["MySQLdb"]:
                message = "Install MySQLdb or mysqlclient if you want to use %s with BioSQL " % (DBTYPE)
            else:
                message = "Install %s if you want to use %s with BioSQL " % (DBDRIVER, DBTYPE)
            raise MissingExternalDependencyError(message)

    # Attempt a trial connection (sqlite3 only needs a file path).
    try:
        if DBDRIVER in ["sqlite3"]:
            server = BioSeqDatabase.open_database(driver=DBDRIVER, db=TESTDB)
        else:
            server = BioSeqDatabase.open_database(driver=DBDRIVER, host=DBHOST,
                                                  user=DBUSER, passwd=DBPASSWD)
        server.close()
        del server
    except Exception as e:
        message = "Connection failed, check settings if you plan to use BioSQL: %s" % e
        raise MissingExternalDependencyError(message)

    # The schema file shipped with BioSQL is named after the database type.
    DBSCHEMA = "biosqldb-" + DBTYPE + ".sql"
    SQL_FILE = os.path.join(os.getcwd(), "BioSQL", DBSCHEMA)

    if not os.path.isfile(SQL_FILE):
        message = "Missing SQL schema file: %s" % SQL_FILE
        raise MissingExternalDependencyError(message)
def _do_db_cleanup():
    """Cleanup everything from TESTDB.

    Relevant for MySQL and PostgreSQL.
    """
    if DBDRIVER in ["psycopg2", "pgdb"]:
        # first open a connection the database
        # notice that postgres doesn't have createdb privileges, so
        # the TESTDB must exist
        server = BioSeqDatabase.open_database(driver=DBDRIVER, host=DBHOST,
                                              user=DBUSER, passwd=DBPASSWD,
                                              db=TESTDB)

        # The pgdb postgres driver does not support autocommit, so here we
        # commit the current transaction so that 'drop database' query will
        # be outside a transaction block
        server.adaptor.cursor.execute("COMMIT")
        # drop anything in the database
        # with Postgres, can get errors about database still being used.
        # Wait briefly to be sure previous tests are done with it.
        time.sleep(1)
        # drop anything in the database
        sql = r"DROP OWNED BY " + DBUSER
        server.adaptor.cursor.execute(sql, ())
        server.close()
    else:
        # first open a connection to create the database
        server = BioSeqDatabase.open_database(driver=DBDRIVER, host=DBHOST,
                                              user=DBUSER, passwd=DBPASSWD)
        # Auto-commit
        try:
            server.adaptor.autocommit()
        except AttributeError:
            pass
        # drop the database
        try:
            sql = r"DROP DATABASE " + TESTDB
            server.adaptor.cursor.execute(sql, ())
        except (server.module.OperationalError, server.module.Error,
                server.module.DatabaseError) as e:
            # the database doesn't exist
            pass
        except (server.module.IntegrityError,
                server.module.ProgrammingError) as e:
            # ditto--perhaps
            if str(e).find('database "%s" does not exist' % TESTDB) == -1:
                server.close()
                raise
        # create a new database
        sql = r"CREATE DATABASE " + TESTDB
        server.adaptor.execute(sql, ())
        server.close()
def test_add_from_gff_with_taxonomy(self):
    """Add in sequences from a gff + fasta file given taxonomy."""
    data_dir = os.path.join(os.path.dirname(__file__), 'test_files')
    gff = os.path.join(data_dir, 'GCF_000005845.2_ASM584v2_genomic.gff')
    fasta = os.path.join(data_dir, 'GCF_000005845.2_ASM584v2_genomic.fna')
    runner = CliRunner()
    result = runner.invoke(cli.main, self.common_params +
                           ['-t', '-T', 511145, '-g', gff, '-f', fasta, '-D', 'test'])
    self.assertEqual(result.exit_code, 0)
    server = BioSeqDatabase.open_database(driver=self.dbdriver,
                                          user=self.dbuser,
                                          passwd=self.dbpassword,
                                          host=self.dbhost,
                                          db=self.dbname)
    rows = server.adaptor.execute_and_fetchall("SELECT name FROM taxon_name where name_class = 'scientific name'")
    found = {row[0] for row in rows}
    expected = {'cellular organisms', 'Bacteria', 'Proteobacteria',
                'Gammaproteobacteria', 'Enterobacterales',
                'Enterobacteriaceae', 'Escherichia', 'Escherichia coli',
                'Escherichia coli K-12',
                'Escherichia coli str. K-12 substr. MG1655'}
    self.assertCountEqual(found, expected)
    server.close()
def loop(self, filename, format):
    """Load records from *filename* and round-trip them through BioSQL.

    Loads the parsed records into a fresh namespace, looks them up
    again, and checks they survive being written back out as GenBank
    and re-parsed.
    """
    # Let SeqIO open/close the file itself: the old open(filename, "rU")
    # leaked the handle, and "U" mode was removed in Python 3.11.
    original_records = list(SeqIO.parse(filename, format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST,
                                          db=TESTDB)
    db_name = "test_loop_%s" % filename  # new namespace!
    db = server.new_database(db_name)
    count = db.load(original_records)
    self.assertEqual(count, len(original_records))
    server.commit()
    # Now read them back...
    biosql_records = [db.lookup(name=rec.name)
                      for rec in original_records]
    # And check they agree
    self.assertTrue(compare_records(original_records, biosql_records))
    # Now write to a handle...
    handle = StringIO()
    SeqIO.write(biosql_records, handle, "gb")
    # Now read them back...
    handle.seek(0)
    new_records = list(SeqIO.parse(handle, "gb"))
    # And check they still agree
    self.assertEqual(len(new_records), len(original_records))
    for old, new in zip(original_records, new_records):
        # TODO - remove this hack because we don't yet write these (yet):
        for key in ["comment", "references", "db_source"]:
            if key in old.annotations and key not in new.annotations:
                del old.annotations[key]
        self.assertTrue(compare_record(old, new))
    # Done
    server.close()
def _do_db_create():
    """Do the actual work of database creation.

    Relevant for MySQL and PostgreSQL: drops any stale TESTDB before a
    fresh one can be created.
    """
    # first open a connection to create the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST)

    if DBDRIVER == "pgdb":
        # The pgdb postgres driver does not support autocommit, so here we
        # commit the current transaction so that 'drop database' query will
        # be outside a transaction block
        server.adaptor.cursor.execute("COMMIT")
    else:
        # Auto-commit: postgresql cannot drop database in a transaction
        try:
            server.adaptor.autocommit()
        except AttributeError:
            pass

    # drop anything in the database
    try:
        # with Postgres, can get errors about database still being used and
        # not able to be dropped. Wait briefly to be sure previous tests are
        # done with it.
        import time
        time.sleep(1)
        sql = r"DROP DATABASE " + TESTDB
        server.adaptor.cursor.execute(sql, ())
    except (server.module.OperationalError, server.module.Error,
            server.module.DatabaseError) as e:  # fix: was py2 ", e" syntax
        # the database doesn't exist
        pass
def main(args):
    """Load GFF+FASTA or GenBank input into a BioSQL sub-database.

    Creates the sub-database named by args.database_name if it is
    missing, loads the sequences, optionally attaches a newly created
    taxonomy node to each loaded bioentry, and commits.  Rolls back and
    re-raises on any load failure.
    """
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database,
                                          user=args.user, host=args.host,
                                          passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]
    try:
        if args.gff is not None and args.fasta is not None:
            load_gff(db, args.gff, args.fasta, args.tax_lookup, args.taxid)
        elif args.genbank is not None:
            load_genbank(db, args.genbank, args.tax_lookup, args.taxid)
    except:
        # Undo the partial load, then propagate the original error.
        server.adaptor.rollback()
        raise
    if args.new_taxons:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)
        # Re-parse the input purely to recover the record names so the
        # matching bioentry rows can be pointed at the new taxon.
        if args.fasta is not None:
            gen = SeqIO.parse(args.fasta, 'fasta')
        elif args.genbank is not None:
            gen = SeqIO.parse(args.genbank, 'genbank')
        for rec in gen:
            server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s',(taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec.name)))
    server.commit()
def create_database():
    """Delete any existing BioSQL test database, then (re)create an empty
    BioSQL database."""
    if DBDRIVER in ["sqlite3"]:
        global TESTDB
        if os.path.exists(TESTDB):
            # Best-effort removal: on Windows a lingering open handle can
            # make the first os.remove fail, so retry once after a pause.
            try:
                os.remove(TESTDB)
            except:
                time.sleep(1)
                try:
                    os.remove(TESTDB)
                except:
                    # Seen this with PyPy 2.1 (and older) on Windows -
                    # which suggests an open handle still exists?
                    print("Could not remove %r" % TESTDB)
                    pass
        # Now pick a new filename - just in case there is a stale handle
        # (which might be happening under Windows...)
        TESTDB = temp_db_filename()
    else:
        _do_db_create()

    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST,
                                          db=TESTDB)
    try:
        server.load_database_sql(SQL_FILE)
        server.commit()
        server.close()
    except:
        # Failed, but must close the handle...
        server.close()
        raise
def load_multi_database(gb_filename_or_handle, gb_filename_or_handle2):
    """Load two GenBank files into a new BioSQL database as different subdatabases.

    This is useful for running tests against a newly created database.

    Returns the total number of records loaded across both namespaces.
    """
    # NOTE(review): this binds a *local* TESTDB that shadows the module
    # global, and relies on create_database() returning the database
    # name/path — confirm against the create_database() in use here.
    TESTDB = create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    db_name2 = "biosql-test2"
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST,
                                          db=TESTDB)
    db = server.new_database(db_name)
    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle, "gb")
    count = db.load(iterator)
    db = server.new_database(db_name2)
    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle2, "gb")
    # finally put it in the database
    count2 = db.load(iterator)
    server.commit()
    server.close()
    return count + count2
def setUp(self):
    """Recreate the test database and prepare a record iterator to load."""
    global DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB, DBSCHEMA
    global SYSTEM, SQL_FILE
    # Email appears redacted in this copy of the source.
    Entrez.email = "*****@*****.**"
    # create TESTDB
    TESTDB = create_database()
    # load the database
    db_name = "biosql-test"
    self.server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                               user=DBUSER,
                                               passwd=DBPASSWD,
                                               host=DBHOST,
                                               db=TESTDB)
    # remove the database if it already exists
    try:
        self.server[db_name]
        self.server.remove_database(db_name)
    except KeyError:
        pass
    self.db = self.server.new_database(db_name)
    # get the GenBank file we are going to put into it
    self.iterator = SeqIO.parse("GenBank/cor6_6.gb", "gb")
def main(gbfile, length=10000):
    """Load records longer than *length* bp from a GenBank file into BioSQL.

    Records are filtered by length, sorted by name, then loaded into a
    hard-coded local MySQL "bioseqdb" database, replacing any existing
    namespace of the same name.  Rolls back on failure.
    """
    driver = "MySQLdb"
    user = "******"
    passwd = ""
    host = "localhost"
    dbname = "bioseqdb"
    # fix: the original referenced an undefined name "biodb_name";
    # bind it explicitly to the target namespace.
    biodb_name = dbname
    print("Parsing Genbank file sequence file....")
    with open(gbfile) as gb_handle:
        records = list(SeqIO.parse(gb_handle, "genbank"))
    print("Sorting by size and name.......")
    longrecords = [record for record in records if len(record) > length]
    longrecords.sort(key=lambda x: x.name)  # sort by name
    print("Writing to BioSQL database...")
    server = BioSeqDatabase.open_database(driver=driver, user=user,
                                          passwd=passwd, host=host,
                                          db=dbname)
    try:
        if biodb_name not in server.keys():
            server.new_database(biodb_name)
        else:
            # Replace the existing namespace wholesale.
            server.remove_database(biodb_name)
            server.adaptor.commit()
            server.new_database(biodb_name)  # fix: was "new_databse" (typo)
        db = server[biodb_name]
        db.load(longrecords)
        server.adaptor.commit()
    except Exception:
        server.adaptor.rollback()
        raise  # fix: was "raide" (typo, would have raised NameError)
def main(args):
    """Set (or clear) the taxon of the named bioentries.

    Record names come from a FASTA/GenBank file or a plain text file
    (one name per line).  With args.remove the taxon_id is set to NULL;
    otherwise a taxonomy node is created/looked up first and applied.
    """
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database,
                                          user=args.user, host=args.host,
                                          passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]
    # Collect the record display names to update.
    gen = []
    if args.fasta is not None:
        for rec in SeqIO.parse(args.fasta, 'fasta'):
            gen.append(rec.name)
    elif args.genbank is not None:
        for rec in SeqIO.parse(args.genbank, 'genbank'):
            gen.append(rec.name)
    elif args.input is not None:
        with open(args.input) as fp:
            for line in fp:
                gen.append(line.rstrip())
    if args.remove:
        # NULL out the taxon link.
        taxon_id = None
    else:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)
    for rec in gen:
        server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s',(taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec)))
    server.commit()
def create_database():
    """Create an empty BioSQL database, dropping any stale TESTDB first."""
    # first open a connection to create the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST)
    # Auto-commit: postgresql cannot drop database in a transaction
    try:
        server.adaptor.autocommit()
    except AttributeError:
        pass
    # drop anything in the database
    try:
        # with Postgres, can get errors about database still being used and
        # not able to be dropped. Wait briefly to be sure previous tests are
        # done with it.
        import time
        time.sleep(1)
        sql = r"DROP DATABASE " + TESTDB
        server.adaptor.cursor.execute(sql, ())
    except server.module.OperationalError:
        # the database doesn't exist
        pass
    except (server.module.IntegrityError,
            server.module.ProgrammingError) as e:  # fix: was py2 ", e" syntax
        # ditto--perhaps
        if str(e).find('database "%s" does not exist' % TESTDB) == -1:
            raise
def trans(self, filename, format):
    """Load records, then copy them between two BioSQL namespaces.

    Checks records loaded into one namespace can themselves be loaded
    into a second namespace and still match the originals.
    """
    # SeqIO accepts a filename directly: the old open(filename, "rU")
    # leaked the handle, and "U" mode was removed in Python 3.11.
    original_records = list(SeqIO.parse(filename, format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST,
                                          db=TESTDB)
    db_name = "test_trans1_%s" % filename  # new namespace!
    db = server.new_database(db_name)
    count = db.load(original_records)
    self.assertEqual(count, len(original_records))
    server.commit()
    # Now read them back...
    biosql_records = [db.lookup(name=rec.name)
                      for rec in original_records]
    # And check they agree
    self.assertTrue(compare_records(original_records, biosql_records))
    # Now write to a second name space...
    db_name = "test_trans2_%s" % filename  # new namespace!
    db = server.new_database(db_name)
    count = db.load(biosql_records)
    self.assertEqual(count, len(original_records))
    # Now read them back again,
    biosql_records2 = [db.lookup(name=rec.name)
                       for rec in original_records]
    # And check they also agree
    self.assertTrue(compare_records(original_records, biosql_records2))
    # Done
    server.close()
def main(args):
    """Read seqfeature ids (one per line) from args.input and print their CSV."""
    connection = BioSeqDatabase.open_database(driver=args.driver,
                                              db=args.database,
                                              user=args.user,
                                              host=args.host,
                                              passwd=args.password)
    with open(args.input) as handle:
        feature_ids = [line.rstrip() for line in handle]
    print_feature_qv_csv(connection, feature_ids)
def setUp(self):
    """Load cor6_6.gb into the test database and connect to it."""
    genbank_path = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
    load_database(genbank_path)
    self.server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                               user=DBUSER,
                                               passwd=DBPASSWD,
                                               host=DBHOST,
                                               db=TESTDB)
    self.db = self.server["biosql-test"]
def get_database():
    """Perform a connection with the database.

    XXX The info here shouldn't be hard coded and should be specified
    on the commandline.
    """
    connection = BioSeqDatabase.open_database(host="192.168.0.192",
                                              user="******",
                                              passwd="",
                                              db="biosql_new")
    return connection["embl_rod"]
def setUp(self):
    """Recreate the test database and open a fresh namespace in it."""
    # Drop any old database and build an empty replacement.
    create_database()
    self.server = BioSeqDatabase.open_database(
        driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD,
        host=DBHOST, db=TESTDB)
    # A brand-new namespace inside the empty database.
    self.db = self.server.new_database("biosql-test")
def setUp(self):
    """Connect to the database, creating the SeqIO test namespace if needed."""
    db_name = "biosql-test-seqio"
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST,
                                          db=TESTDB)
    self.server = server
    if db_name not in server:
        # fix: the original assigned new_database()'s result to self.db
        # only to overwrite it on the next line; create and commit only.
        server.new_database(db_name)
        server.commit()
    self.db = self.server[db_name]
def setUp(self):
    """Populate the test database from cor6_6.gb and fetch one entry."""
    load_database("GenBank/cor6_6.gb")
    self.server = BioSeqDatabase.open_database(
        driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD,
        host=DBHOST, db=TESTDB)
    self.db = self.server["biosql-test"]
    self.item = self.db.lookup(accession="X62281")
def setUp(self):
    """Recreate the test database, connect, and build the taxonomy fixture."""
    # Drop any old database and create a new one:
    db_file, driver_name, user, password, host = connection_parameters(create=True)
    # Connect to the new database:
    self.server = BioSeqDatabase.open_database(
        driver=driver_name, user=user, passwd=password,
        host=host, db=db_file)
    self._create_taxonomy()
    self.taxon_tree = TaxonTree(self.server.adaptor)
    self.testdb = db_file
def setUp(self):
    """Connect to and load up the database."""
    load_database("GenBank/cor6_6.gb")
    self.server = BioSeqDatabase.open_database(
        driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD,
        host=DBHOST, db=TESTDB)
    self.db = self.server["biosql-test"]
def test_change_taxonomy(self):
    """Check that taxonomy can be properly changed."""
    runner = CliRunner()
    infile = os.path.join(os.path.dirname(__file__), 'test_files', 'modify_header.txt')
    result = runner.invoke(cli.main, self.common_params + ['-i', infile, '-T', '112040', '--key', 'accession'])
    self.assertEqual(result.exit_code, 0)
    print(result.output)
    server = BioSeqDatabase.open_database(driver=self.dbdriver,
                                          user=self.dbuser,
                                          passwd=self.dbpassword,
                                          host=self.dbhost,
                                          db=self.dbname)
    rows = server.adaptor.execute_and_fetchall("select ncbi_taxon_id from taxon join bioentry using(taxon_id) where bioentry.accession = 'NC_000913'")
    taxid = rows[0][0]
    self.assertEqual(taxid, 112040)
    # fix: close the connection, as the sibling tests do.
    server.close()
def setUp(self):
    """Load a database."""
    gb_file = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
    # Context manager closes the handle even if loading raises part-way.
    with open(gb_file, "r") as gb_handle:
        load_database(gb_handle)
    self.server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                               user=DBUSER,
                                               passwd=DBPASSWD,
                                               host=DBHOST,
                                               db=TESTDB)
    self.db = self.server["biosql-test"]
    self.item = self.db.lookup(accession="X62281")
def setUp(self):
    """Connect to and load up the database."""
    gb_file = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
    # Context manager closes the handle even if loading raises part-way.
    with open(gb_file, "r") as gb_handle:
        load_database(gb_handle)
    self.server = BioSeqDatabase.open_database(
        driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD,
        host=DBHOST, db=TESTDB)
    self.db = self.server["biosql-test"]
def main(args):
    """Load an IMG directory into the named sub-database; roll back on failure."""
    server = BioSeqDatabase.open_database(driver=args.driver,
                                          db=args.database,
                                          user=args.user,
                                          host=args.host,
                                          passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    sub_db = server[args.database_name]
    try:
        load_img(sub_db, args.directory, args.tax_lookup, args.taxid)
        server.adaptor.commit()
    except:
        # Leave the database untouched, then propagate the error.
        server.adaptor.rollback()
        raise
def test_add_from_genbank(self):
    """Add in sequences from a Genbank file."""
    data_dir = os.path.join(os.path.dirname(__file__), 'test_files')
    infile = os.path.join(data_dir, 'GCF_000005845.2_ASM584v2_genomic.gbff')
    runner = CliRunner()
    result = runner.invoke(cli.main, self.common_params + ['-G', infile, '-D', 'test'])
    self.assertEqual(result.exit_code, 0)
    server = BioSeqDatabase.open_database(driver=self.dbdriver,
                                          user=self.dbuser,
                                          passwd=self.dbpassword,
                                          host=self.dbhost,
                                          db=self.dbname)
    rows = server.adaptor.execute_and_fetchall("SELECT name FROM taxon_name where name_class = 'scientific name'")
    self.assertEqual(rows, [('Escherichia coli str. K-12 substr. MG1655',)])
    server.close()
def test_backwards_compatibility(self):
    """Check can re-use an old BioSQL SQLite3 database."""
    expected = list(SeqIO.parse("GenBank/cor6_6.gb", "gb"))
    # Open the pre-built legacy database shipped with the tests.
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          db="BioSQL/cor6_6.db")
    db = server["OLD"]
    self.assertEqual(len(db), len(expected))
    # Read each record back by name and compare against the file.
    retrieved = [db.lookup(name=rec.name) for rec in expected]
    self.assertTrue(compare_records(expected, retrieved))
def test_add_from_gff(self):
    """Add in sequences from a gff + fasta file."""
    data_dir = os.path.join(os.path.dirname(__file__), 'test_files')
    gff = os.path.join(data_dir, 'GCF_000005845.2_ASM584v2_genomic.gff')
    fasta = os.path.join(data_dir, 'GCF_000005845.2_ASM584v2_genomic.fna')
    runner = CliRunner()
    result = runner.invoke(cli.main, self.common_params + ['-g', gff, '-f', fasta, '-D', 'test'])
    self.assertEqual(result.exit_code, 0)
    server = BioSeqDatabase.open_database(driver=self.dbdriver,
                                          user=self.dbuser,
                                          passwd=self.dbpassword,
                                          host=self.dbhost,
                                          db=self.dbname)
    rows = server.adaptor.execute_and_fetchall("SELECT name FROM taxon_name where name_class = 'scientific name'")
    self.assertEqual(rows, [])
    server.close()
def create_database():
    """Delete any existing BioSQL test database, then (re)create an empty
    BioSQL database."""
    if DBDRIVER in ["sqlite3"]:
        # SQLite: the "database" is just a file, so remove any stale copy.
        if os.path.exists(TESTDB):
            os.remove(TESTDB)
    else:
        _do_db_create()

    # Open a connection and install the schema into the empty database.
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER,
                                          passwd=DBPASSWD, host=DBHOST,
                                          db=TESTDB)
    server.load_database_sql(SQL_FILE)
    server.commit()
    server.close()
def upload_files(self, seqtype, filetype, upload_path, upload_list=None, new_db=False):
    """Upload NCBI/genbank files to a new or existing sqlite database.

    :param seqtype:
    :param filetype:
    :param upload_path:
    :param upload_list:  (Default value = None)
    :param new_db:  (Default value = False)
    :return:
    """
    db_name = Path(self.database_name.stem + '_' + seqtype + self.database_name.suffix)
    db_abs_path = Path(upload_path) / db_name

    # Make sure a BioSQL-SQLite database exists
    # TODO-ROB: Rework this part
    # if db_abs_path.is_file():
    #     raise FileExistsError
    # elif new_db:
    #     self.copy_template_database(destination=db_abs_path)
    # else:
    #     raise FileNotFoundError("Database not found: %s\mPlease create a BioSQL-SQLite database." % self.database_abs_path)

    if not upload_list:
        upload_list = os.listdir(upload_path)

    # Parse the upload list and upload the files to the BioSQL-SQLite database.
    for file in upload_list:
        abs_upload_path = Path(str(upload_path)) / Path(file)

        # Make a connection with the BioSQL database
        try:
            server = BioSeqDatabase.open_database(
                driver=self.driver.lower(), db=str(db_abs_path))
            self.biosqllog.info("Server Connected.")
            pass
        except:
            self.biosqllog.warn(
                "The Server did not Connect. Check the to make sure %s exists." % self.database_abs_path)
            raise FileNotFoundError

        # See if the sub database exists (rna, protein, or genomic)
        try:
            if seqtype not in server.keys():
                server.new_database(seqtype)
                self.biosqllog.info(
                    "New Sub-Database created, %s, for %s." % (seqtype, db_abs_path))
            # Connect to the sub database
            sub_db = server[seqtype]
            count = sub_db.load(SeqIO.parse(abs_upload_path, filetype))
            self.biosqllog.info("%s loaded with %s %s files" % (db_name, count, filetype))
            server.commit()
            self.biosqllog.warn("Server committed.")
            # NOTE(review): t_count is read before it is ever assigned in
            # this method — the first successful load will raise NameError.
            # Confirm whether it should be initialised to 0 above the loop.
            t_count = t_count + count
            self.biosqllog.info(
                "The server has not loaded a total of %s files." % t_count)
            # TODO-ROB: Add something to do with time here.
        except:
            self.biosqllog.critical("Unable to load the database...")
            server.rollback()
            try:
                del server[sub_db]
                self.biosqllog.critical(
                    "%s sub database deleted from %s. All of the info will be lost." % (sub_db, db_abs_path))
                server.commit()
                self.biosqllog.critical("Server committed")
            except:
                raise
            raise
from BioSQL import BioSeqDatabase
import sys
from operator import itemgetter
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

# Prompt for database credentials (Python 2 raw_input).
username = raw_input("Please enter user name: ")
# NOTE(review): the next line is corrupted — the credentials were redacted
# ("******") and the BioSeqDatabase.open_database(driver="psycopg2", ...)
# call was fused into the prompt string.  Restore the original call
# (open the "rnammer" database on dbpg-ifi-utv.uio.no) before running.
password = raw_input("and password: "******"psycopg2", user = username, passwd = password, host = "dbpg-ifi-utv.uio.no", db = "rnammer")


class Genome:
    """Pairs the original and new annotation data for one genome ID."""

    def __init__(self, ID):
        self.name = ID
        # "orginal" spelling kept as-is — other code refers to it.
        self.orginal = []
        self.new = []

    def add_org(self,data):
        # Append one item of original annotation data.
        self.orginal.append(data)

    def add_new(self,data):
        # Append one item of new annotation data.
        self.new.append(data)

    #def clean():
    # compare content of orginal to new
    # NB can only be don after bothe are added
def loadDB(catalog):
    """Load the GenBank files listed in *catalog* into per-taxon BioSQL
    sub-databases (Python 2 code: print statements, raw_input)."""
    from BioSQL import BioSeqDatabase
    import sys
    username = raw_input("Please enter user name: ")
    # NOTE(review): the next line is corrupted — the password prompt was
    # fused with the host assignment during credential redaction
    # ("******").  Restore password/host assignments before running.
    password = raw_input("and password: "******"dbpg-ifi-utv.uio.no"
    db_name = "rnammer"
    server = BioSeqDatabase.open_database(driver="psycopg2", user=username,passwd=password, host=host, db=db_name)
    biodb_name = "empty"
    # genebank problem ? se staving
    db = "nodb"
    gi_rep = 1
    for gbff in catalog:
        #server.remove_database(source)
        print gi_rep
        print gbff
        parser = GenBank.FeatureParser()
        #record = parser.parse(open(gbff))
        #records = SeqIO.parse(open(gbff),'genbank')
        records = GenBank.Iterator(open(gbff), parser)
        for x in records:
            # Skip plasmid records entirely.
            if re.search("plasmid",x.description, re.IGNORECASE):
                continue
            print "Record name:"
            print x.id
            #print dir(x)
            # Proteobacteria are grouped by class (taxonomy[2]);
            # everything else by the second taxonomy level.
            if "Proteobacteria" == x.annotations["taxonomy"][1]:
                print x.annotations["taxonomy"][1]
                print x.annotations["taxonomy"][2]
                biodb_name = x.annotations["taxonomy"][2]
            else :
                print x.annotations["taxonomy"][1]
                biodb_name = x.annotations["taxonomy"][1]
            # Fetch the sub-database, creating it lazily on KeyError.
            while True :
                try :
                    db = server[biodb_name]
                    #print "here"
                    break
                except KeyError :
                    #print ("Cannot find biodatabase with name %r making it" % source)
                    server.new_database(biodb_name)
                    server.commit()
            db.load([x])
            #record.annotations["gi"] = gi_rep
            #print type(records)
            #print record.id
            gi_rep = gi_rep + 1
            #db.load([records])
    server.adaptor.commit()
def gbk_upload(self):
    """Upload a BioSQL database with target GenBank data (.gbk files).

    This method is only usable after creating GenBank records with this
    class. It uploads a BioSQL databases with target GenBank data (.gbk
    files). This creates a compact set of data for each project.

    :return:  Does not return an object.
    """
    t_count = 0
    # Parse the tier dictionary
    for TIER in self.tier_frame_dict.keys():
        db_name = str(TIER) + '.db'
        db_file_path = self.target_gbk_db_path / Path(db_name)
        # Create the db file if it exists
        if os.path.isfile(str(db_file_path)) is False:
            self.genbanklog.warn(
                'Copying Template BioSQL Database... This may take a few minutes...'
            )
            shutil.copy2('Template_BioSQL_DB.db', str(db_file_path))
        # If it already exists then the database is bad, or needs to be update. Delete it.
        else:
            # TODO-ROB: This part is broken until the template db creation and management is added
            os.remove(str(db_file_path))
            self.genbanklog.warn(
                'Copying Template BioSQL Database... This may take a few minutes...'
            )
            shutil.copy2('Template_BioSQL_DB.db', str(db_file_path))

        server = BioSeqDatabase.open_database(driver='sqlite3', db=str(db_file_path))
        gene_path = self.raw_data
        # Parse the raw_data folder to get the name of each gene.
        for GENE in os.listdir(str(gene_path)):
            sub_db_name = GENE
            genbank_path = gene_path / Path(GENE) / Path('GENBANK')
            # Parse the GenBank file names for each gene in order to upload them to a custom BioSQL database
            for FILE in os.listdir(str(genbank_path)):
                # Try to load the database.
                try:
                    if sub_db_name not in server.keys():
                        server.new_database(sub_db_name)
                    db = server[sub_db_name]
                    # NOTE(review): FILE is a bare filename; it is passed to
                    # SeqIO.parse without joining genbank_path — confirm the
                    # working directory makes this path resolve.
                    count = db.load(SeqIO.parse(FILE, 'genbank'))
                    server.commit()
                    self.genbanklog.info('Server Commited %s' % sub_db_name)
                    self.genbanklog.info('%s database loaded with %s.' % (db.dbid, FILE))
                    self.genbanklog.info(
                        "That file contains %s genbank records." % str(count))
                    t_count = t_count + count
                    self.genbanklog.info(
                        'The total number of files loaded so far is %i.'
                        % t_count)
                # If the database cannot be loaded then rollback the server and raise an error.
                except BaseException:
                    server.rollback()
                    # Try to delete the sub database and commit
                    try:
                        del server[sub_db_name]
                        server.commit()
                    # If it cannot be deleted then raise an error.
                    except BaseException:
                        raise
                    raise
import sqlite3
import fnmatch
import time
import Bio
import csv
import os
import pexpect
from Bio import SeqIO
from BioSQL import BioSeqDatabase

# list of db names. I will create a DB for the orthologs of interest soon.
server_db_list = [ ]

# Open the server that we want to look at (hard-coded SQLite file).
server = BioSeqDatabase.open_database(
    driver="sqlite3",
    db="/work5/r2294/bin/NCBI_data/vertebrate_mammalian/DB/VM_RefseqRNA_DB.db")

# Get the current working directory and set it to the home variable
home = os.getcwd()

# make a list of database names
for db_name in server.keys():
    server_db_list.append(db_name)
print(server_db_list)
# Interactive pause so the user can inspect the list before continuing.
input('this is the server_db_list Do you enjoy it? Very much eh? ......')

# To start you have to parse each subdatabase on the "server" in order to search through each one.
for db_name in server.keys():
    db = server[db_name]
    print(db_name)
def fetch_gis(email, db_name, tool, batch_size, log_file, save_file_directory):
    """Fetch queued protein GIs from NCBI and load them into a BioSQL database.

    Reads the pending GI queue from PostgreSQL, fetches the records from
    Entrez in batches, saves each record gzipped to save_file_directory,
    loads them into BioSQL, and finally marks the fetched queue rows.
    """
    # BASE LOGGING CONFIG
    logging.basicConfig(
        filename=log_file,
        level=logging.INFO,
        filemode='a',  # append to log file
        format='%(asctime)s:%(levelname)s:%(name)s:%(message)s')

    # GET bioentries gis that have already been imported.
    # GET gi_queues that have not been imported
    # SELECT thouse gis that are not fetched and not in bioentry
    with psycopg2.connect("dbname=%s" % (db_name)) as conn:
        with conn.cursor() as cur:
            cur.execute("""SELECT identifier FROM bioentry""")
            bioentry_ids = set(map(lambda x: x[0], cur.fetchall()))
            logging.info("found {0} gis in bioentry".format(len(bioentry_ids)))
            cur.execute("""SELECT gi FROM gi_queues WHERE fetched IS false""")
            gi_queues_ids = set(map(lambda x: x[0], cur.fetchall()))
            logging.info("found {0} gis in qi_queues".format(
                len(gi_queues_ids)))
            fetch_gis = list(gi_queues_ids - bioentry_ids)
            logging.info("will fetch {0} gis from ncbi.".format(
                len(fetch_gis)))

    # SELECT the biodatabase to use
    # GET proteins from NCBI
    # IMPORT proteins to bioentries
    # LOG which gis imported and which failed
    #
    # Fetch in blocks of 1000
    # NOTE(review): xrange and the "/" batch-count arithmetic are
    # Python 2 only — this function will not run unchanged on Python 3.
    for i in xrange((len(fetch_gis) + batch_size - 1) / batch_size):
        try:
            server = BioSeqDatabase.open_database(driver="psycopg2", db=db_name)
            if db_name in server:
                db = server[db_name]
            else:
                db = server.new_database(db_name, description="sll biosql test")
                server.commit()
            Entrez.email = email
            Entrez.tool = tool
            logging.info(
                "Fetch gis in batch of #{0}. Now fetching from {1} .".format(
                    batch_size, i * batch_size))
            fh = Entrez.efetch(db="protein", rettype="gp", retmode="text",
                               id=fetch_gis[i * batch_size:(i + 1) * batch_size])
            sleep(0.5)
            seqs = list(SeqIO.parse(fh, "gb"))
            # Save each record gzipped, named after its GI.
            for seq in seqs:
                f = gzip.open(
                    os.path.join(save_file_directory,
                                 seq.annotations["gi"] + ".gb.gz"), 'wb')
                SeqIO.write(seq, f, "genbank")
            logging.info("Fetched {0} sequences from NCBI.".format(len(seqs)))
            count = db.load(seqs)
            logging.info("Inserted {0} sequences into biosql".format(count))
            if count != len(fetch_gis):
                logging.warn("Not equally many gis to fetch as were inserted")
            not_fetched_gis = set()
            # NOTE(review): this inner loop reuses the outer loop variable
            # ``i``, clobbering the batch index — confirm this is intended.
            for i in fetch_gis[i * batch_size:(i + 1) * batch_size]:
                try:
                    entry = db.lookup(gi=i)
                except IndexError as e:
                    logging.warn("Gi: {0} not inserted in database".format(i))
                    not_fetched_gis.add(i)
            server.commit()
        except Exception as e:
            tb = traceback.format_exc()
            logging.error("Error: {0}\nTraceback: {1}".format(e, tb))
        finally:
            fh.close()
            server.close()

    # UPDATE gi_queue entries that have been fetched
    with psycopg2.connect("dbname=%s" % db_name) as conn:
        with conn.cursor() as cur:
            cur.execute(
                """UPDATE gi_queues SET fetched=true,updated_at=now() WHERE gi IN ('{0}')"""
                .format("','".join(set(fetch_gis) - not_fetched_gis)))
            logging.info(
                "Updated status of {0} gi_queues rows out of {1} gis that were added to bioentry"
                .format(cur.rowcount, count))
def _get_db(dbpath=dbpath, db=dbname):
    """Open the sqlite3 BioSQL server at *dbpath* and return its *db* sub-database."""
    return BioSeqDatabase.open_database(driver='sqlite3', db=dbpath)[db]
#!/usr/bin/env python """Test timing of loading records into a BioSQL database.""" from __future__ import print_function import time # set up the connection from Bio import GenBank from BioSQL import BioSeqDatabase server = BioSeqDatabase.open_database(host="192.168.0.192", user="******", passwd="", db="pythonloadtest") # remove the database if it already exists db_name = "testload" try: server[db_name] server.remove_database(db_name) except KeyError: pass db = server.new_database(db_name) input_file = "/home/hack/install/biopython/Tests/GenBank/cor6_6.gb" handle = open(input_file, "r") parser = GenBank.FeatureParser() iterator = GenBank.Iterator(handle, parser) # -- do the timing part start_time = time.time() num_records = db.load(iterator) end_time = time.time()
temp_var['log_file_rank'] = str(temp_var['log_file']) + str( rank) # Each process gets it's own unique log file ser_loc = where.DB loaded_list = [] t_count = 0 # Open a logging file and begin the process of uploading with open(where.LOG + '/Temp/' + temp_var['log_file_rank'], 'w') as log_w: for file in temp_var['small_list']: print('file: ', file) log = [] log.append('file: %s' % file) # Create's or opens an existing server. If the database cannot be created or opened it deletes and try again try: server = BioSeqDatabase.open_database(driver='sqlite3', db=ser_loc + '/' + temp_var['db_name'] + '.' + temp_var['key'] + '.db') print('server created') log.append('server created') except: print('server not created') log.append('server not created') os.remove(ser_loc + ('/%s.%s.db' % (temp_var['db_name'], temp_var['key']))) raise # Deprecated (all files are RNA, but I originally wanted to get the other types as well) s = str(file).lower() if s.find("rna") != -1: sub_db = 'RNA' elif s.find("protein") != -1:
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 19 11:16:37 2016

@author: rgilmore

Load every *.gbk file from the HTR1A raw-GenBank directory into the
'HTR1A' sub-database of the GPCR orthologs BioSQL sqlite database.
"""
import os
from Bio import SeqIO
import csv
from BioSQL import BioSeqDatabase
from fnmatch import fnmatch

server = BioSeqDatabase.open_database(
    driver="sqlite3",
    db="/work5/r2294/bin/NCBI_data/vertebrate_mammalian/DB/GPCR_Orthologs_DB.db"
)
count = 0
home = os.getcwd()
os.chdir('/work5/r2294/bin/NCBI_data/Raw_GBK_Files/HTR1A')
# FIX: loop-invariant sub-database lookup hoisted out of the loop.
db = server['HTR1A']
# FIX: loop variable renamed - `file` shadowed the builtin.
for gbk_file in os.listdir('/work5/r2294/bin/NCBI_data/Raw_GBK_Files/HTR1A'):
    # FIX: `fnmatch(...) == False` anti-idiom replaced with `not`.
    if not fnmatch(gbk_file, '*.gbk'):
        continue
    print(gbk_file)
    try:
        c = db.load(SeqIO.parse(gbk_file, 'genbank'))
    # FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # still deliberately best-effort per file.
    except Exception:
        print('error')
        continue
#!/usr/bin/env python
"""Load GenBank records listed in an accession file into a local BioSQL sqlite database."""
from Bio import GenBank
from Bio import Entrez
from BioSQL import BioSeqDatabase
import sys

# Should read these from settings at some point
dbpath = 'biosql.sqlite3'
dbname = 'local_db'
Entrez.email = '*****@*****.**'

server = BioSeqDatabase.open_database(driver='sqlite3', db=dbpath)
db = server[dbname]
parser = GenBank.FeatureParser()


# FIX: lambda assigned to a name replaced with a def (same name/signature).
def loadgb(_id):
    """Fetch one nucleotide record from Entrez and load it into the database."""
    return db.load(
        GenBank.Iterator(
            Entrez.efetch(db='nucleotide', id=_id, rettype='gb', retmode='text'),
            parser))


ACCESSIONS_FILE = 'accession.lst' if len(sys.argv) < 2 else sys.argv[1]
# FIX: the file handle was never closed; also each line carried a trailing
# newline into Entrez.efetch, and `id` shadowed the builtin.
with open(ACCESSIONS_FILE) as acc_handle:
    for accession in acc_handle:
        accession = accession.strip()
        if not accession:  # skip blank lines
            continue
        print("Loading %s" % accession)
        loadgb(accession)
server.adaptor.commit()
'--seqfeature', help= 'The first column of the input file is the seqfeature id used by the database. Does not apply when using a gff file as input', action='store_true', default=False) parser.add_argument( '--replace', help='replace any existing annotations for the given qualifiers', action='store_true', default=False) args = parser.parse_args() if args.password is None: args.password = getpass("Please enter the password for user " + \ args.user + " on database " + args.database) server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password) db = server[args.dbname] if args.input is not None: mapping = parse_input(args.input) else: mapping = parse_gff(args.gff) add_annotation(db, mapping, args.seqfeature, args.replace) server.commit()
parser.add_argument('-user', help='database user', default="root") parser.add_argument('-host', help='database host', default="localhost") parser.add_argument('-passwd', help='database password', required=True) args = parser.parse_args() from BioSQL import BioSeqDatabase cs = [] with gzip.open(args.refseq, "rt") as h: for x in tqdm(bpio.parse(h, "gb")): cs.append(x) assert (cs) server = BioSeqDatabase.open_database(driver="MySQLdb", user=args.user, passwd=args.passwd, host=args.host, db=args.db) acc = "GCF_" + args.refseq.split("_")[1] db = server.new_database(acc, description="") server.commit() count = db.load(tqdm(cs)) print(count) server.commit() db = server.new_database(acc + "_prots", description="") server.commit() prots = [] for x in tqdm(cs):
from BioSQL import BioSeqDatabase

# Connect to the local MySQL-backed BioSQL server.
# NOTE(review): credentials (including the password) are hard-coded here;
# they should come from a config file or environment variables before this
# snippet is reused anywhere non-throwaway.
server = BioSeqDatabase.open_database(driver="MySQLdb",
                                      user="******",
                                      passwd="FurtherFlowersVenus",
                                      host="localhost",
                                      db="bioseqdb")

# Create an empty sub-database (namespace) and persist it.
db = server.new_database("just_testing", description="Just for testing")
server.commit()
#On Biopython 1.49 or older, server.adaptor.commit()
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 17 15:02:37 2016

@author: rgilmore

Create the 'HTR1A' sub-database in a local sqlite BioSQL database and
load one GenBank file into it.
"""
import sqlite3
from BioSQL import BioSeqDatabase
from Bio import GenBank
import os

server = BioSeqDatabase.open_database(driver="sqlite3", db="biosql.db")
db = server.new_database("HTR1A")

dir_list1 = os.listdir()
print(dir_list1)

parser = GenBank.FeatureParser()
# FIX: the GenBank file handle was opened inline and never closed.
with open("HTR1A_Ailuropoda melanoleuca.gbk") as handle:
    iterator = GenBank.Iterator(handle, parser)
    db.load(iterator)
    db.adaptor.commit()
server.commit()
server.close()
def main(args):
    """Export sequences (or their features) for a taxonomic subtree.

    ``args.taxid`` is interpreted as an NCBI taxon ID when it parses as an
    integer, otherwise as a name pattern matched against ``taxon_name.name``.
    All bioentries belonging to leaf taxa under that node are written in
    ``args.output_format``, either to stdout or - with ``args.split_species``
    - to one file per species.
    """
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database,
                                          user=args.user, host=args.host,
                                          passwd=args.password)
    # Decide how to interpret args.taxid: integer -> NCBI taxon ID,
    # anything else -> taxon name.
    tax_name = False
    try:
        ncbi_tax = int(args.taxid)
    except ValueError:
        tax_name = True
    if not tax_name:
        print("interpreting as an NCBI taxon ID...", file=sys.stderr)
        # Select bioentries for every LEAF taxon in the subtree rooted at the
        # given NCBI taxon ID; in the nested-set taxonomy encoding a leaf has
        # right_value = left_value + 1.
        taxon_id_lookup_sql = "SELECT bioentry_id, taxon_id, biodatabase.name FROM bioentry JOIN "\
            "biodatabase USING(biodatabase_id) WHERE taxon_id IN "\
            "(SELECT DISTINCT include.taxon_id FROM taxon "\
            "INNER JOIN taxon as include ON (include.left_value "\
            "BETWEEN taxon.left_value AND taxon.right_value) "\
            "WHERE taxon.ncbi_taxon_id = %s AND include.right_value = include.left_value + 1)"
        rows = server.adaptor.execute_and_fetchall(taxon_id_lookup_sql,
                                                   (ncbi_tax, ))
    else:
        print("interpreting as a taxon name...", file=sys.stderr)
        # Same leaf-subtree selection, but the root taxa are found by name
        # (SQL LIKE pattern, so wildcards in args.taxid are honored).
        taxon_name_lookup_sql = "SELECT bioentry_id, taxon_id, biodatabase.name FROM bioentry JOIN "\
            "biodatabase USING(biodatabase_id) WHERE taxon_id IN "\
            "(SELECT DISTINCT include.taxon_id FROM taxon "\
            "INNER JOIN taxon as include ON (include.left_value "\
            "BETWEEN taxon.left_value AND taxon.right_value) "\
            "WHERE taxon.taxon_id IN (SELECT taxon_id FROM taxon_name "\
            "WHERE name like %s) AND include.right_value = include.left_value + 1)"
        rows = server.adaptor.execute_and_fetchall(taxon_name_lookup_sql,
                                                   (args.taxid, ))
    # Feature types to extract for the feat-* output formats.
    # NOTE(review): `types` stays unbound when feature_type is None and the
    # output format is neither feat-prot nor feat-nucl; it is only *used* in
    # the feat-* branches below, but a defaulting else-branch would be safer.
    if args.feature_type is not None:
        types = args.feature_type
    elif args.output_format == 'feat-prot':
        types = ['CDS']
    elif args.output_format == 'feat-nucl':
        types = ['CDS', 'rRNA', 'tRNA']
    if len(rows) == 0:
        print("There does not appear to be any sequences associated with\n"
              "the taxonomy provided. If you used a taxonomy name, make sure\n"
              "it is spelled correctly. If you used an NCBI taxonomy ID, make\n"
              "sure that it is correct.", file=sys.stderr)
        sys.exit(1)
    # Map (bioentry_id, biodatabase name) -> taxon_id for all hits.
    dbids = {}
    for row in rows:
        dbids[(row[0], row[2])] = row[1]
    files = {}
    taxid_to_dbids = {}
    if args.split_species:
        # Build one output filename per taxon (scientific name, spaces -> '_',
        # extension chosen from the output format).
        # NOTE(review): taxon_file_mapping is never used afterwards.
        taxon_file_mapping = {}
        for k, v in dbids.items():
            tname = server.adaptor.execute_and_fetch_col0(
                "SELECT name from taxon_name where taxon_id = %s and name_class = %s",
                (v, 'scientific name'))[0]
            tname = tname.replace(' ', '_')
            if args.output_format == 'gb':
                tname += '.gb'
            elif args.output_format == 'feat-prot':
                tname += '.faa'
            else:
                tname += '.fna'
            files[v] = tname
            taxid_to_dbids.setdefault(v, []).append(k)
    if args.split_species:
        # got to save all of the records before printing them out
        outdata = {}
        for taxid, dbid_list in taxid_to_dbids.items():
            for dbid, dbname in dbid_list:
                db = server[dbname]
                seq_rec = db[dbid]
                outdata.setdefault(taxid, []).append(seq_rec)
        for taxid, dbrecs in outdata.items():
            with open(files[taxid], 'w') as fp:
                if 'feat' in args.output_format:
                    for dbrec in dbrecs:
                        extract_feature(dbrec, args.output_format, fp)
                else:
                    SeqIO.write(dbrecs, fp, args.output_format)
    else:
        # Single-stream output: feature extraction goes through SQL helpers,
        # whole records are written to stdout.
        if args.output_format == 'feat-prot':
            extract_feature_sql(server,
                                get_seqfeature_ids_for_bioseqs(
                                    server, [x[0] for x in dbids.keys()]),
                                type=types,
                                translate=True)
        elif args.output_format == 'feat-nucl':
            extract_feature_sql(server,
                                get_seqfeature_ids_for_bioseqs(
                                    server, [x[0] for x in dbids.keys()]),
                                type=types)
        else:
            for (dbid, dbname), taxid in dbids.items():
                db = server[dbname]
                try:
                    dbrec = db[dbid]
                    SeqIO.write(dbrec, sys.stdout, args.output_format)
                except KeyError:
                    # Entry disappeared between the lookup and the fetch;
                    # skip it silently.
                    pass
def main():
    """Annotate every sequence in a FASTA file with BioSeqAnn.

    This is run if the file is directly executed, but not if imported as a
    module; keeping it in a function allows importing the file into
    interactive python while still being able to run it for testing.
    Parses CLI options, optionally opens a BioSQL server, and prints one
    tab-separated (feature, method, sequence) line per annotated feature.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file",
                        required=True,
                        help="input file",
                        type=str)
    parser.add_argument("-l", "--locus",
                        required=True,
                        help="Locus",
                        type=str)
    parser.add_argument("-k", "--kir",
                        help="Option for running with KIR",
                        action='store_true')
    parser.add_argument("-s", "--server",
                        help="Option for running with a server",
                        action='store_true')
    parser.add_argument("-v", "--verbose",
                        help="Option for running in verbose",
                        action='store_true')
    args = parser.parse_args()

    fastafile = args.file
    locus = args.locus
    # FIX: the original assigned `verbose` twice via duplicated
    # if/else boilerplate; store_true flags are already booleans.
    verbose = args.verbose
    kir = args.kir
    serv = args.server

    if verbose:
        logging.basicConfig(
            format='%(asctime)s - %(name)-35s - %(levelname)-5s - %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p',
            level=logging.INFO)

    server = None
    if serv:
        server = BioSeqDatabase.open_database(driver="pymysql",
                                              user="******",
                                              passwd="",
                                              host="localhost",
                                              db="bioseqdb")

    # NOTE(review): verbose=True is hard-coded here (not args.verbose) in the
    # original - preserved as-is; confirm whether that was intentional.
    seqann = BioSeqAnn(verbose=True, kir=kir)
    for seq in SeqIO.parse(fastafile, "fasta"):
        ann = seqann.annotate(seq, locus=locus)
        print('{:*^20} {:^20} {:*^20}'.format("", str(seq.description), ""))
        # Annotations are either DBSeq objects (print directly) or records
        # with a .seq attribute. (The original also summed lengths into an
        # unused local `l`; that dead accumulator has been removed.)
        for f in ann.annotation:
            if isinstance(ann.annotation[f], DBSeq):
                print(f, ann.method, str(ann.annotation[f]), sep="\t")
            else:
                print(f, ann.method, str(ann.annotation[f].seq), sep="\t")
        print("")
    if serv:
        server.close()
def _cleanup(server, hladat, allele_list):
    """Close the BioSQL server and remove the downloaded temp files."""
    server.close()
    os.remove(hladat)
    os.remove(allele_list)


def main():
    """Download IMGT/HLA releases and load them into a BioSQL database.

    This is run if the file is directly executed, but not if imported as a
    module; keeping it in a function allows importing the file into
    interactive python while still being able to run it for testing.
    For each requested release: download hla.dat and the allele list, map
    accessions to HLA allele names, and load one sub-database per locus.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose",
                        help="Option for running in verbose",
                        action='store_true')
    parser.add_argument("-n", "--number",
                        required=False,
                        help="Number of IMGT/DB releases",
                        default=1,
                        type=int)
    parser.add_argument("-r", "--releases",
                        required=False,
                        help="IMGT/DB releases",
                        type=str)
    args = parser.parse_args()
    releases = args.releases
    number = args.number
    # FIX: if/else boolean boilerplate collapsed; store_true is already bool.
    verbose = args.verbose

    if releases:
        dblist = [db for db in releases.split(",")]
    else:
        # Scrape the latest release numbers from the IMGT/HLA release page.
        try:
            versions_url = "https://www.ebi.ac.uk/ipd/imgt/hla/docs/release.html"
            df = pd.read_html(versions_url)[0]
            x = df.columns
            dblist = [l.replace(".", '') for l in df[x[0]].tolist()[0:number]]
        except ValueError as err:
            logging.info("Failed to load DB list: {0}".format(err))
            logging.info("Defaulting to Latest")
            dblist = ["Latest"]

    # Connecting to mysql DB
    server = BioSeqDatabase.open_database(driver="pymysql",
                                          user="******",
                                          passwd="my-secret-pw",
                                          host="localhost",
                                          db="bioseqdb")

    if verbose:
        logging.info("IMGT/HLA DB Versions = " + ",".join(dblist))

    # Looping through DB versions
    for dbv in dblist:
        # Downloading hla.dat file
        hladat = download_dat(dbv)
        if verbose:
            logging.info("Finished downloading hla.dat file for " + str(dbv))

        # Downloading allele list
        allele_list = download_allelelist(dbv)
        if verbose:
            logging.info("Finished downloading allele list for " + str(dbv))

        # Map accession -> allele name, skipping header/comment lines.
        hla_names = {}
        try:
            # File formats change...
            with open(allele_list, 'r') as f:
                for line in f:
                    line = line.rstrip()
                    if re.search("#", line) or re.search("AlleleID", line):
                        continue
                    accession, name = line.split(",")
                    hla_names.update({accession: name})
            if verbose:
                nalleles = len(hla_names.keys())
                logging.info("Finished loading " + str(nalleles) +
                             " alleles for " + str(dbv))
        except ValueError as err:
            logging.error("Allelelist error: {0}".format(err))
            _cleanup(server, hladat, allele_list)
            sys.exit()

        # Strip non-ASCII bytes from the dat file in place.
        cmd = "perl -p -i -e 's/[^\\x00-\\x7F]//g' " + hladat
        os.system(cmd)

        # Loading sequence data from hla.dat file
        try:
            seq_list = list(SeqIO.parse(hladat, "imgt"))
        # FIX: bare except replaced; still broad because SeqIO can raise
        # various parse errors, but the cause is now logged.
        except Exception as err:
            logging.error("ERROR LOADING!!")
            logging.error("Read dat error: {0}".format(err))
            _cleanup(server, hladat, allele_list)
            sys.exit()

        new_seqs = {
            "A": [],
            "B": [],
            "C": [],
            "DRB1": [],
            "DQB1": [],
            "DRB3": [],
            "DRB4": [],
            "DRB5": [],
            "DQA1": [],
            "DPA1": [],
            "DPB1": []
        }
        # Changing the sequence name to
        # the HLA allele name instead of the accession
        for seq in seq_list:
            if seq.name in hla_names:
                loc, allele = hla_names[seq.name].split("*")
                if loc in new_seqs:
                    hla_name = "HLA-" + hla_names[seq.name]
                    if hla_name not in skip_alleles:
                        seq.name = hla_name
                        new_seqs[loc].append(seq)

        # Human-readable release string, e.g. "3260" -> "3.26.0".
        dbsp = list(dbv)
        descr = ".".join([dbsp[0], dbsp[1] + dbsp[2], dbsp[3]])
        if verbose:
            logging.info("Loaded IMGT dat file " + descr)

        # Looping through and loading each locus
        for locus in new_seqs:
            dbname = dbv + "_" + locus
            dbdescription = "IMGT/HLA " + descr + " " + locus
            db = server.new_database(dbname, description=dbdescription)
            try:
                count = db.load(new_seqs[locus])
            # FIX: the original did `logging.error("Faild to load " +
            # sys.exc_info()[0])`, concatenating a str with an exception
            # *class* and raising TypeError inside the handler.
            except Exception as err:
                logging.error("Failed to load {0}: {1}".format(dbname, err))
                _cleanup(server, hladat, allele_list)
                sys.exit()
            if verbose:
                logging.info("Loaded " + str(count) + " for " + dbname)
            # Commiting data to mysql db
            server.commit()

        # Removing hla.dat and allele list files
        os.remove(hladat)
        os.remove(allele_list)
        if verbose:
            logging.info("Finished loading " + descr)
    server.close()
from Bio import SeqIO
from Bio.Seq import Seq
from BioSQL import BioSeqDatabase


# NOTE(review): Python 2 code (`except ... , e` syntax, print statements).
# Relies on MySQLdb, os and sys being imported elsewhere in the file.
def get_mysql_conn():
    # Open a raw MySQLdb connection using credentials from the environment,
    # exiting the process on failure.
    # NOTE(review): `conn` is never returned, so this only verifies that a
    # connection can be made - confirm whether a `return conn` is missing.
    try:
        conn = MySQLdb.connect(host=os.environ["MYSQL_HOST"],
                               user=os.environ["MYSQL_USER"],
                               passwd=os.environ["MYSQL_PW"],
                               db=os.environ["MYSQL_DB"])
    except MySQLdb.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
        sys.exit(1)


# Module-level BioSQL server handle and the "medicago" namespace used by
# loadSeq2DB() below; credentials come from the environment.
biosql_server = BioSeqDatabase.open_database( \
    driver="MySQLdb", user=os.environ["MYSQL_USER"], \
    passwd = os.environ["MYSQL_PW"], \
    host = os.environ["MYSQL_HOST"], db=os.environ["MYSQL_DB"])
DB = biosql_server["medicago"]


def createSubDB(dbname, desc):
    # Create a new sub-database (namespace) and commit it.
    # NOTE(review): the handle `db` is not returned; callers must re-open
    # the namespace via biosql_server[dbname].
    db = biosql_server.new_database(dbname, description=desc)
    biosql_server.commit()


def loadSeq2DB(seqRcdLst):
    # Load a list of SeqRecords into the "medicago" namespace, rolling the
    # transaction back on any failure.
    # NOTE(review): the bare except silently discards the error - at minimum
    # the exception should be logged before the rollback.
    try:
        DB.load(seqRcdLst)
        biosql_server.adaptor.commit()
    except:
        biosql_server.adaptor.rollback()
dat = ".".join([db, "hla", "dat"]) urllib.request.urlretrieve(url, dat) return dat def download_allelelist(db): url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist.' + db + '.txt' alist = ".".join([db, "Allelelist", "txt"]) urllib.request.urlretrieve(url, alist) return alist dblist = ["".join([str(i), str("0")]) for i in range(326, 331)] server = BioSeqDatabase.open_database(driver="pymysql", user="******", passwd="", host="localhost", db="bioseqdb") for dbv in dblist: hladat = download_dat(dbv) allele_list = download_allelelist(dbv) hla_names = {} try: s = "," if dbv == "3260" or dbv == "3270" else " " with open(allele_list, 'r') as f: for line in f: line = line.rstrip() accession, name = line.split(s)
TESTDB = temp_db_filename() # This will abort if driver not installed etc: check_config(DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB) # Some of the unit tests don't create their own database, # so just in case there is no database already: TESTDB = create_database() if False: # This is how I generated test file Tests/BioSQL/cor6_6.db # which is test cross-checked with the latest bindings to # catch any regressions in how we map GenBank entries to # the database. assert not os.path.isfile("BioSQL/cor6_6.db") server = BioSeqDatabase.open_database(driver=DBDRIVER, db="BioSQL/cor6_6.db") DBSCHEMA = "biosqldb-" + DBTYPE + ".sql" SQL_FILE = os.path.join(os.getcwd(), "BioSQL", DBSCHEMA) assert os.path.isfile(SQL_FILE), SQL_FILE server.load_database_sql(SQL_FILE) server.commit() db = server.new_database("OLD") count = db.load(SeqIO.parse("GenBank/cor6_6.gb", "gb")) assert count == 6 server.commit() assert len(db) == 6 server.close() class BackwardsCompatibilityTest(unittest.TestCase): def test_backwards_compatibility(self):
def fetchseq(ids, species, write=False, output_name='', delim='\t', id_type='brute', server=None, source="SQL",
             database="bioseqdb", database_path=None, host='localhost', driver='psycopg2', version='1.0',
             user='******', passwd='', email='', batch_size=50, output_type="fasta", verbose=1, n_threads=1,
             n_subthreads=1, add_length=(0, 0), indent=0):
    """Fetch sequences for a list of 12-field ID records using worker processes.

    ``ids`` may be a generator, a list, or a path-like object whose file is
    read line-by-line. Records are dispatched to ``n_threads`` FetchSeqMP
    worker processes; results are reassembled in the original ID order.
    Returns ``(output_list, missing_items_list)``, or writes the records to
    ``output_name`` and returns None when ``write`` is truthy.

    NOTE(review): `print(..., indent=...)` relies on a project-local print
    wrapper that accepts an `indent` keyword.
    """
    if isgenerator(ids):
        if verbose > 1:
            print('Received generator!', indent=indent)
    elif isinstance(ids, list):
        if verbose > 1:
            print('Received list!', indent=indent)
    else:
        if verbose > 1:
            print('Reading ID File... ', indent=indent)
        # FIX: the file was opened with mode 'w', which TRUNCATES the ID file
        # before reading; open it read-only.
        with ids.open('r') as in_handle:
            id_prelist = [line.strip() for line in in_handle]  # list of each line in the file
        print('Done!', indent=indent)
        ids = [id_item for id_item in filter(None, id_prelist) if id_item]
        if not id_prelist or id_prelist is None:
            if verbose:
                print('id_prelist is empty!', indent=indent)
            return 'None'
    for id_item in ids:
        # FIX: the message said "not 5!" while the check requires 12 fields.
        assert len(id_item) == 12, (
            "Item {0} in id_list has {1} items, not 12!\n"
            "Format should be: "
            "chr, (start,end), id, score, strand, thickStart, thickEnd, rgb, blockcount,"
            " blockspans, blockstarts, query_span"
            "!").format(
                " ".join((" ".join(item) if not isinstance(item, str) else item for item in id_item)),
                len(id_item))
    if verbose > 1:
        print('Readied ids!', indent=indent)

    id_list = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    if 'sql' in source.lower():
        if server is None:
            try:
                if verbose > 1:
                    print('No server received, opening server...', indent=indent)
                # NOTE(review): `database=` is forwarded to the driver;
                # BioSeqDatabase.open_database normally takes `db=` - confirm
                # this works with drivers other than psycopg2.
                server = BioSeqDatabase.open_database(driver=driver,
                                                      user=user,
                                                      passwd=passwd,
                                                      host=host,
                                                      database=database)
                if verbose > 1:
                    print('Done!', indent=indent)
            except Exception as err:
                if verbose > 1:
                    print('Failed to open server!', indent=indent)
                    print(str(type(err)), err, sep=' ', indent=indent)
                raise
        else:
            if verbose > 1:
                print('Received server handle:', indent=indent)
                print(server, indent=indent)
            if verbose > 2:
                print('Please note the sub_databases of server:\n\t',
                      [str(i) for i in server.keys()], indent=indent)
    elif source.lower() in ['fasta', '2bit', 'twobit']:
        print('Search type: ', source, indent=indent)
    else:
        raise SearchEngineNotImplementedError(
            'Search using source {} has not yet been implemented!'.format(source))

    if verbose > 1:
        print('Creating FecSeq Processes...', indent=indent)
    fs_instances = [
        FetchSeqMP(id_queue=id_list,
                   seq_out_queue=results,
                   delim=delim,
                   id_type=id_type,
                   server=server,
                   species=species,
                   source=source,
                   database=database,
                   database_path=database_path,
                   host=host,
                   driver=driver,
                   version=version,
                   user=user,
                   passwd=passwd,
                   email=email,
                   output_type=output_type,
                   batch_size=batch_size,
                   verbose=verbose,
                   n_subthreads=n_subthreads,
                   add_length=add_length,
                   indent=indent + 1) for _ in range(n_threads)
    ]
    if verbose > 1:
        print('Done! Starting processes...', indent=indent)
    for fs in fs_instances:
        fs.start()
    if verbose > 1:
        print('Done!', indent=indent)
        print('Assigning FetchSeq records to queue... ', indent=indent)

    # Remember the requested order so results can be reassembled later.
    id_order = []
    for i, id_rec in enumerate(ids):
        try:
            id_order.append("{0}:{1}-{2}".format(id_rec[0], id_rec[1][0], id_rec[1][1]))
        except IndexError:
            id_order.append("{0}".format(id_rec[0]))
        try:
            id_list.put(FetchSeq(id_rec=id_rec))
        except AssertionError as err:
            print(i, type(err), err, sep=' ')
            break
    # One sentinel per worker so every process terminates.
    for _ in fs_instances:
        id_list.put(None)
    if verbose > 1:
        print('Done!', indent=indent)

    output_dict = dict()
    missing_items_list = list()
    if verbose > 1:
        print('Getting sequences from processes... ', indent=indent)
    n_jobs = len(ids)
    while n_jobs:
        seq, missing = results.get()
        output_dict[seq[0]] = seq[1]
        missing_items_list.append(missing)
        n_jobs -= 1
    if verbose > 1:
        print('Done! Finished fetching sequences!', indent=indent)
        print('Closing processes!', indent=indent)
    for fs in fs_instances:
        if fs.is_alive():
            fs.join()

    output_list = [output_dict[i] for i in id_order if i in output_dict]
    if write:
        SeqIO.write(output_list, output_name, output_type)
        return
    else:
        if missing_items_list == [None]:
            missing_items_list = None
        return output_list, missing_items_list
#!/usr/bin/env python # Copyright 2002 Brad Chapman. All rights reserved. # # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. """Test timing of getting records from a BioSQL database.""" from __future__ import print_function import time # set up the connection from BioSQL import BioSeqDatabase server = BioSeqDatabase.open_database(host="192.168.0.192", user="******", passwd="", db="test_biosql") db = server["embl_rod"] # -- do the fasta-only timing part start_time = time.time() num_records = 0 for junk_id, record in db.items(): num_records += 1 sequence = record.seq.data d = record.description i = record.id n = record.name end_time = time.time() elapsed_time = end_time - start_time
from tqdm import tqdm from SNDG.Comparative.Pangenome import Pangenome, Strain, sqldb from SNDG.WebServices.NCBI import NCBI from Bio import Entrez, SeqIO from BioSQL import BioSeqDatabase server = BioSeqDatabase.open_database(driver="MySQLdb", user="******", passwd="mito", host="localhost", db="bioseqdb") if __name__ == '__main__': from peewee import MySQLDatabase mysql_db = MySQLDatabase('bioseqdb', user="******", password="******") sqldb.initialize(mysql_db) tables = [Pangenome, Strain] # for x in tables: # x.create_table() Entrez.email = "*****@*****.**" query = '"pathogen"[Properties] AND ("Metazoa"[Organism] OR "Viridiplantae"[Organism] OR "Fungi"[Organism] OR "Eukaryota"[Organism] NOT "Metazoa"[Organism] NOT "Fungi"[Organism] NOT "Viridiplantae"[Organism] OR "Bacteria"[Organism] OR txid1224[Orgn] OR "Archaea"[Organism])' genomesList = Entrez.read( Entrez.esearch(db="genome", term=query, idtype="acc", retmax=10000)) genomes = Entrez.read(Entrez.esummary(db="genome",
] ##################################################################### #TODO - Should we re-use the create_database() function currently # defined in test_BioSQL.py here too? This would allow us # to deal with the error of an unknown database... # #print "Creating database" #from setup_BioSQL import create_database #create_database() print "Connecting to database" try : server = BioSeqDatabase.open_database(driver = DBDRIVER, user = DBUSER, passwd = DBPASSWD, host = DBHOST, db = TESTDB) except Exception, e : message = "Connection failed, check settings in Tests/setup_BioSQL.py "\ "if you plan to use BioSQL: %s" % str(e) raise MissingExternalDependencyError(message) print "Removing existing sub-database '%s' (if exists)" % db_name if db_name in server.keys() : #Might exist from a failed test run... #db = server[db_name] server.remove_database(db_name) server.commit() print "(Re)creating empty sub-database '%s'" % db_name db = server.new_database(db_name)
def create_fasta_files(self):
    """ Creates FASTA files from database sequences.

    For each entry in self.dataset, dumps the matching bioentry sequences
    into a series of files named '<name>NNN.fas', starting a new file
    whenever the running residue count would exceed the dataset's max_res.
    NOTE(review): Python 2 code (print statements, file()); assumes
    StringIO, os, _COL_WIDTH are available at module level.
    """
    print "Creating FASTA datasets."
    print " Opening database."
    server = BioSeqDatabase.open_database(**self.dbargs)
    cur = server.adaptor.cursor
    for indexes, name, schema, namespace, max_res in self.dataset:
        print " Creating dataset %s." % name
        if schema:
            # Restrict the session to the dataset's schema.
            # NOTE(review): schema is interpolated into SQL - values come
            # from self.dataset (trusted config), not user input.
            print " Setting schema %s." % schema
            cur.execute("SET search_path TO %s" % schema)
            server.adaptor.commit()
        # Skip the whole FASTA file creation if the first file
        # already exists
        fasta_name = name + '%3.3d.fas' % 0
        if fasta_name in os.listdir(self.index_dir):
            print " File %s present - skipping all." % fasta_name
            continue
        # Execute the SQL query (specific namespace or all sequences)
        if namespace:
            dbid = server[namespace].dbid
            sql = """SELECT e.name || ' (' || e.accession || ') ' || e.description AS header,
                            s.seq AS residues
                     FROM bioentry e, biosequence s
                     WHERE e.bioentry_id = s.bioentry_id
                       AND e.biodatabase_id = %s
                     ORDER BY e.name;"""
            cur.execute(sql, (dbid, ))
        else:
            sql = """SELECT e.name || ' (' || e.accession || ') ' || e.description AS header,
                            s.seq AS residues
                     FROM bioentry e, biosequence s
                     WHERE e.bioentry_id = s.bioentry_id
                     ORDER BY e.name;"""
            cur.execute(sql)
        # Number of residues in the current file set so that new file is opened
        num_residues = max_res + 1
        # Counter and file are dummy: the first record always exceeds
        # max_res, closing this placeholder and opening file 000.
        file_counter = -1
        fp = StringIO()
        while 1:
            res = cur.fetchone()
            if not res:
                break
            # FASTA header truncated to fit the output column width.
            title = res[0][:_COL_WIDTH - 1]
            sequence = res[1]
            # Roll over to the next numbered file when this sequence would
            # push the current file past max_res residues.
            if num_residues + len(sequence) > max_res:
                num_residues = 0
                file_counter += 1
                fp.close()
                fasta_name = name + '%3.3d.fas' % file_counter
                print " Creating file %s." % fasta_name
                fasta_path = os.path.join(self.index_dir, fasta_name)
                # Open file in binary mode:
                # We write in UNIX format with line separator '\n'
                fp = file(fasta_path, 'wb')
            # Now write the sequence, wrapped to _COL_WIDTH characters.
            fp.write('>%s\n' % title)
            i = 0
            while i < len(sequence):
                fp.write('%s\n' % sequence[i:i + _COL_WIDTH])
                i += _COL_WIDTH
            num_residues += len(sequence)
        fp.close()
    server.adaptor.close()