def check_config(dbdriver, dbtype, dbhost, dbuser, dbpasswd, testdb):
    global DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB, DBSCHEMA, SQL_FILE
    DBDRIVER = dbdriver
    DBTYPE = dbtype
    DBHOST = dbhost
    DBUSER = dbuser
    DBPASSWD = dbpasswd
    TESTDB = testdb

    #Check the database driver is installed:
    try:
        __import__(DBDRIVER)
    except ImportError:
        message = "Install %s if you want to use %s with BioSQL " % (DBDRIVER, DBTYPE)
        raise MissingExternalDependencyError(message)

    try:
        if DBDRIVER in ["sqlite3"]:
            server = BioSeqDatabase.open_database(driver = DBDRIVER, db = TESTDB)
        else:
            server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                                  user = DBUSER, passwd = DBPASSWD,
                                                  host = DBHOST)
        server.close()
        del server
    except Exception as e:
        message = "Connection failed, check settings if you plan to use BioSQL: %s" % str(e)
        raise MissingExternalDependencyError(message)
Example #2
def check_config(dbdriver, dbtype, dbhost, dbuser, dbpasswd, testdb):
    """Verify the database settings work for connecting."""
    global DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB, DBSCHEMA
    global SYSTEM, SQL_FILE
    DBDRIVER = dbdriver
    DBTYPE = dbtype
    DBHOST = dbhost
    DBUSER = dbuser
    DBPASSWD = dbpasswd
    TESTDB = testdb

    if not DBDRIVER or not DBTYPE or not DBUSER:
        # No point going any further...
        raise MissingExternalDependencyError("Incomplete BioSQL test settings")

    # Check the database driver is installed:
    if SYSTEM == "Java":
        try:
            if DBDRIVER in ["MySQLdb"]:
                import com.mysql.jdbc.Driver
            elif DBDRIVER in ["psycopg2", "pgdb"]:
                import org.postgresql.Driver
        except ImportError:
            message = "Install the JDBC driver for %s to use BioSQL " % DBTYPE
            raise MissingExternalDependencyError(message)
    else:
        try:
            __import__(DBDRIVER)
        except ImportError:
            if DBDRIVER in ["MySQLdb"]:
                message = "Install MySQLdb or mysqlclient if you want to use %s with BioSQL " % (DBTYPE)
            else:
                message = "Install %s if you want to use %s with BioSQL " % (DBDRIVER, DBTYPE)
            raise MissingExternalDependencyError(message)

    try:
        if DBDRIVER in ["sqlite3"]:
            server = BioSeqDatabase.open_database(driver=DBDRIVER, db=TESTDB)
        else:
            server = BioSeqDatabase.open_database(driver=DBDRIVER, host=DBHOST,
                                                  user=DBUSER, passwd=DBPASSWD)
        server.close()
        del server
    except Exception as e:
        message = "Connection failed, check settings if you plan to use BioSQL: %s" % e
        raise MissingExternalDependencyError(message)

    DBSCHEMA = "biosqldb-" + DBTYPE + ".sql"
    SQL_FILE = os.path.join(os.getcwd(), "BioSQL", DBSCHEMA)

    if not os.path.isfile(SQL_FILE):
        message = "Missing SQL schema file: %s" % SQL_FILE
        raise MissingExternalDependencyError(message)
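A minimal usage sketch, assuming temp_db_filename() and MissingExternalDependencyError come from the surrounding test module (the values shown are placeholders):

try:
    # SQLite needs no host or password; a throwaway file path is enough.
    check_config(dbdriver="sqlite3", dbtype="sqlite", dbhost=None,
                 dbuser="root", dbpasswd="", testdb=temp_db_filename())
except MissingExternalDependencyError as err:
    print("Skipping BioSQL tests: %s" % err)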
Example #3
def _do_db_cleanup():
    """Cleanup everything from TESTDB.

    Relevant for MySQL and PostgreSQL.
    """

    if DBDRIVER in ["psycopg2", "pgdb"]:
        # first open a connection the database
        # notice that postgres doesn't have createdb privileges, so
        # the TESTDB must exist
        server = BioSeqDatabase.open_database(driver=DBDRIVER, host=DBHOST,
                                              user=DBUSER, passwd=DBPASSWD,
                                              db=TESTDB)

        # The pgdb postgres driver does not support autocommit, so here we
        # commit the current transaction so that 'drop database' query will
        # be outside a transaction block
        server.adaptor.cursor.execute("COMMIT")
        # drop anything in the database
        # with Postgres, can get errors about database still being used.
        # Wait briefly to be sure previous tests are done with it.
        time.sleep(1)
        sql = r"DROP OWNED BY " + DBUSER
        server.adaptor.cursor.execute(sql, ())
        server.close()
    else:
        # first open a connection to create the database
        server = BioSeqDatabase.open_database(driver=DBDRIVER, host=DBHOST,
                                              user=DBUSER, passwd=DBPASSWD)
        # Auto-commit
        try:
            server.adaptor.autocommit()
        except AttributeError:
            pass
        # drop the database
        try:
            sql = r"DROP DATABASE " + TESTDB
            server.adaptor.cursor.execute(sql, ())
        except (server.module.OperationalError,
                server.module.Error,
                server.module.DatabaseError) as e:  # the database doesn't exist
            pass
        except (server.module.IntegrityError,
                server.module.ProgrammingError) as e:  # ditto--perhaps
            if str(e).find('database "%s" does not exist' % TESTDB) == -1:
                server.close()
                raise
        # create a new database
        sql = r"CREATE DATABASE " + TESTDB
        server.adaptor.execute(sql, ())
        server.close()
Example #4
    def test_add_from_gff_with_taxonomy(self):
        """Add in sequences from a gff + fasta file given taxonomy."""
        gff = os.path.join(os.path.dirname(__file__), 'test_files', 'GCF_000005845.2_ASM584v2_genomic.gff')
        fasta = os.path.join(os.path.dirname(__file__), 'test_files', 'GCF_000005845.2_ASM584v2_genomic.fna')
        runner = CliRunner()
        result = runner.invoke(cli.main, self.common_params + ['-t', '-T', 511145, '-g', gff, '-f', fasta, '-D', 'test'])
        self.assertEqual(result.exit_code, 0)

        server = BioSeqDatabase.open_database(driver = self.dbdriver, user = self.dbuser,
                             passwd = self.dbpassword, host = self.dbhost, db = self.dbname)

        rows = server.adaptor.execute_and_fetchall("SELECT name FROM taxon_name where name_class = 'scientific name'")
        dbnames = set([x[0] for x in rows])
        names = set(['cellular organisms',
                    'Bacteria',
                    'Proteobacteria',
                    'Gammaproteobacteria',
                    'Enterobacterales',
                    'Enterobacteriaceae',
                    'Escherichia',
                    'Escherichia coli',
                    'Escherichia coli K-12',
                    'Escherichia coli str. K-12 substr. MG1655'])
        self.assertCountEqual(dbnames, names)
        server.close()
Example #5
 def loop(self, filename, format):
     original_records = list(SeqIO.parse(open(filename, "rU"), format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                           user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST, db = TESTDB)
     db_name = "test_loop_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(original_records)
     self.assertEqual(count, len(original_records))
     server.commit()
     #Now read them back...
     biosql_records = [db.lookup(name=rec.name)
                       for rec in original_records]
     #And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
     #Now write to a handle...
     handle = StringIO()
     SeqIO.write(biosql_records, handle, "gb")
     #Now read them back...
     handle.seek(0)
     new_records = list(SeqIO.parse(handle, "gb"))
     #And check they still agree
     self.assertEqual(len(new_records), len(original_records))
     for old, new in zip(original_records, new_records):
         #TODO - remove this hack because we don't yet write these (yet):
         for key in ["comment", "references", "db_source"]:
             if key in old.annotations and key not in new.annotations:
                 del old.annotations[key]
         self.assertTrue(compare_record(old, new))
     #Done
     server.close()
Example #6
def _do_db_create():
    """Do the actual work of database creation. Relevant for MySQL and PostgreSQL
    """
    # first open a connection to create the database
    server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                          user = DBUSER, passwd = DBPASSWD,
                                          host = DBHOST)

    if DBDRIVER == "pgdb":
        # The pgdb postgres driver does not support autocommit, so here we
        # commit the current transaction so that 'drop database' query will
        # be outside a transaction block
        server.adaptor.cursor.execute("COMMIT")
    else:
        # Auto-commit: postgresql cannot drop database in a transaction
        try:
            server.adaptor.autocommit()
        except AttributeError:
            pass

    # drop anything in the database
    try:
        # with Postgres, can get errors about database still being used and
        # not able to be dropped. Wait briefly to be sure previous tests are
        # done with it.
        import time
        time.sleep(1)

        sql = r"DROP DATABASE " + TESTDB
        server.adaptor.cursor.execute(sql, ())
    except (server.module.OperationalError,
            server.module.Error,
            server.module.DatabaseError) as e:  # the database doesn't exist
        pass
Example #7
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)

    db = server[args.database_name]

    try:
        if args.gff is not None and args.fasta is not None:
            load_gff(db, args.gff, args.fasta, args.tax_lookup, args.taxid)
        elif args.genbank is not None:
            load_genbank(db, args.genbank, args.tax_lookup, args.taxid)
    except:
        server.adaptor.rollback()
        raise

    if args.new_taxons:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)

        if args.fasta is not None:
            gen = SeqIO.parse(args.fasta, 'fasta')
        elif args.genbank is not None:
            gen = SeqIO.parse(args.genbank, 'genbank')

        for rec in gen:
            server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s',(taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec.name)))

    server.commit()
Example #8
def create_database():
    """Delete any existing BioSQL test database, then (re)create an empty BioSQL database."""
    if DBDRIVER in ["sqlite3"]:
        global TESTDB
        if os.path.exists(TESTDB):
            try:
                os.remove(TESTDB)
            except:
                time.sleep(1)
                try:
                    os.remove(TESTDB)
                except:
                    # Seen this with PyPy 2.1 (and older) on Windows -
                    # which suggests an open handle still exists?
                    print("Could not remove %r" % TESTDB)
                    pass
        # Now pick a new filename - just in case there is a stale handle
        # (which might be happening under Windows...)
        TESTDB = temp_db_filename()
    else:
        _do_db_create()

    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB)
    try:
        server.load_database_sql(SQL_FILE)
        server.commit()
        server.close()
    except:
        # Failed, but must close the handle...
        server.close()
        raise
Example #9
def load_multi_database(gb_filename_or_handle, gb_filename_or_handle2):
    """Load two GenBank files into a new BioSQL database as different subdatabases.

    This is useful for running tests against a newly created database.
    """

    TESTDB = create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    db_name2 = "biosql-test2"
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB)
    db = server.new_database(db_name)

    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle, "gb")
    count = db.load(iterator)

    db = server.new_database(db_name2)

    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle2, "gb")
    # finally put it in the database
    count2 = db.load(iterator)
    server.commit()

    server.close()
    return count + count2
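A minimal call sketch, assuming two small GenBank files (the paths are placeholders) are available relative to the working directory:

total = load_multi_database("GenBank/cor6_6.gb", "GenBank/noref.gb")
print("Loaded %i records across the two sub-databases" % total)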
Example #10
    def setUp(self):
        global DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB, DBSCHEMA
        global SYSTEM, SQL_FILE

        Entrez.email = "*****@*****.**"
        # create TESTDB
        TESTDB = create_database()

        # load the database
        db_name = "biosql-test"
        self.server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                                   user=DBUSER, passwd=DBPASSWD,
                                                   host=DBHOST, db=TESTDB)

        # remove the database if it already exists
        try:
            self.server[db_name]
            self.server.remove_database(db_name)
        except KeyError:
            pass

        self.db = self.server.new_database(db_name)

        # get the GenBank file we are going to put into it
        self.iterator = SeqIO.parse("GenBank/cor6_6.gb", "gb")
Example #11
def main(gbfile, length=10000):
    driver = "MySQLdb"
    user   = "******"
    passwd = ""
    host   = "localhost"
    dbname = "bioseqdb"
    
    print "Parsing Genbank file sequence file...."
    with open(gbfile) as gb_handle:
        records = list(SeqIO.parse(gb_handle, "genbank"))
    print "Sorting by size and name......."
    longrecords = [record for record in records if len(record) > length]
    longrecords.sort(key=lambda x: x.name) #sort by name
    
    print "Writing to BioSQL database..."
    server = BioSeqDatabase.open_database(driver=driver, user=user,
            passwd=passwd, host=host, db=dbname)
    
    try:
        if biodb_name not in server.keys():
            server.new_database(biodb_name)
        else:
            server.remove_database(biodb_name)
            server.adaptor.commit()
            server.new_database(biodb_name)
        db = server[biodb_name]
        db.load(longrecords)
        server.adaptor.commit()
    except:
        server.adaptor.rollback()
        raise
Example #12
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)

    db = server[args.database_name]



    gen = []
    if args.fasta is not None:
        for rec in SeqIO.parse(args.fasta, 'fasta'):
            gen.append(rec.name)
    elif args.genbank is not None:
        for rec in SeqIO.parse(args.genbank, 'genbank'):
            gen.append(rec.name)
    elif args.input is not None:
        with open(args.input) as fp:
            for line in fp:
                gen.append(line.rstrip())

    if args.remove:
        taxon_id = None
    else:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)

    for rec in gen:
        server.adaptor.execute('update bioentry set taxon_id = %s where bioentry_id = %s',(taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec)))
    server.commit()
Example #13
def create_database():
    """Create an empty BioSQL database."""
    # first open a connection to create the database
    server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                          user = DBUSER, passwd = DBPASSWD,
                                          host = DBHOST)

    # Auto-commit: postgresql cannot drop database in a transaction
    try:
        server.adaptor.autocommit()
    except AttributeError:
        pass

    # drop anything in the database
    try:
        # with Postgres, can get errors about database still being used and
        # not able to be dropped. Wait briefly to be sure previous tests are
        # done with it.
        import time
        time.sleep(1)

        sql = r"DROP DATABASE " + TESTDB
        server.adaptor.cursor.execute(sql, ())
    except server.module.OperationalError: # the database doesn't exist
        pass
    except (server.module.IntegrityError,
            server.module.ProgrammingError) as e:  # ditto--perhaps
        if str(e).find('database "%s" does not exist' % TESTDB) == -1:
            raise
Example #14
 def trans(self, filename, format):
     original_records = list(SeqIO.parse(open(filename, "rU"), format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                           user = DBUSER, passwd = DBPASSWD,
                                           host = DBHOST, db = TESTDB)
     db_name = "test_trans1_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(original_records)
     self.assertEqual(count, len(original_records))
     server.commit()
     #Now read them back...
     biosql_records = [db.lookup(name=rec.name)
                       for rec in original_records]
     #And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
     #Now write to a second name space...
     db_name = "test_trans2_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(biosql_records)
     self.assertEqual(count, len(original_records))
     #Now read them back again,
     biosql_records2 = [db.lookup(name=rec.name)
                        for rec in original_records]
     #And check they also agree
     self.assertTrue(compare_records(original_records, biosql_records2))
     #Done
     server.close()
Example #15
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password)
    sfids = []
    with open(args.input) as fp:
        for line in fp:
            sfids.append(line.rstrip())

    print_feature_qv_csv(server, sfids)
Example #16
    def setUp(self):
        gb_file = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
        load_database(gb_file)

        self.server = BioSeqDatabase.open_database(
            driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB
        )
        self.db = self.server["biosql-test"]
Example #17
def get_database():
    """Perform a connection with the database.
    
    XXX The info here shouldn't be hard coded and should be specified
    on the commandline.
    """
    server = BioSeqDatabase.open_database(host="192.168.0.192", user="******", passwd="", db="biosql_new")
    return server["embl_rod"]
Example #18
 def setUp(self):
     #drop any old database and create a new one:
     create_database()
     #connect to new database:
     self.server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                                user = DBUSER, passwd = DBPASSWD,
                                                host = DBHOST, db = TESTDB)
     #Create new namespace within new empty database:
     self.db = self.server.new_database("biosql-test")
Example #19
 def setUp(self):
     """Connect to the database."""
     db_name = "biosql-test-seqio"
     server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB)
     self.server = server
     if db_name not in server:
         self.db = server.new_database(db_name)
         server.commit()
     self.db = self.server[db_name]
Example #20
    def setUp(self):
        """Load a database."""
        load_database("GenBank/cor6_6.gb")

        self.server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                                   user=DBUSER, passwd=DBPASSWD,
                                                   host=DBHOST, db=TESTDB)
        self.db = self.server["biosql-test"]
        self.item = self.db.lookup(accession="X62281")
Example #21
 def setUp(self):
     # drop any old database and create a new one:
     testdb, dbdriver, dbuser, dbpassword, dbhost = connection_parameters(create=True)
     # connect to new database:
     self.server = BioSeqDatabase.open_database(driver=dbdriver,
                                                user=dbuser, passwd=dbpassword,
                                                host=dbhost, db=testdb)
     self._create_taxonomy()
     self.taxon_tree = TaxonTree(self.server.adaptor)
     self.testdb = testdb
Example #22
    def setUp(self):
        """Connect to and load up the database.
        """
        load_database("GenBank/cor6_6.gb")

        self.server = BioSeqDatabase.open_database(
            driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB
        )

        self.db = self.server["biosql-test"]
Example #23
 def test_change_taxonomy(self):
     """Check that taxonomy can be properly changed."""
     runner = CliRunner()
     infile = os.path.join(os.path.dirname(__file__), 'test_files', 'modify_header.txt')
     result = runner.invoke(cli.main, self.common_params + ['-i', infile, '-T', '112040', '--key', 'accession'])
     self.assertEqual(result.exit_code, 0)
     print(result.output)
     server = BioSeqDatabase.open_database(driver = self.dbdriver, user = self.dbuser,
                          passwd = self.dbpassword, host = self.dbhost, db = self.dbname)
     rows = server.adaptor.execute_and_fetchall("select ncbi_taxon_id from taxon join bioentry using(taxon_id) where bioentry.accession = 'NC_000913'")
     taxid = rows[0][0]
     self.assertEqual(taxid, 112040)
Example #24
    def setUp(self):
        """Load a database.
        """
        gb_file = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
        gb_handle = open(gb_file, "r")
        load_database(gb_handle)
        gb_handle.close()

        self.server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                                   user = DBUSER, passwd = DBPASSWD,
                                                   host = DBHOST, db = TESTDB)
        self.db = self.server["biosql-test"]
        self.item = self.db.lookup(accession = "X62281")
Example #25
    def setUp(self):
        """Connect to and load up the database.
        """
        gb_file = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
        gb_handle = open(gb_file, "r")
        load_database(gb_handle)
        gb_handle.close()

        self.server = BioSeqDatabase.open_database(
            driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB
        )

        self.db = self.server["biosql-test"]
Example #26
def main(args):

    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database, user=args.user, host=args.host, passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)

    db = server[args.database_name]
    try:
        load_img(db, args.directory, args.tax_lookup, args.taxid)
        server.adaptor.commit()
    except:
        server.adaptor.rollback()
        raise
Example #27
    def test_add_from_genbank(self):
        """Add in sequences from a Genbank file."""
        infile = os.path.join(os.path.dirname(__file__), 'test_files', 'GCF_000005845.2_ASM584v2_genomic.gbff')
        runner = CliRunner()
        result = runner.invoke(cli.main, self.common_params + ['-G', infile, '-D', 'test'])
        self.assertEqual(result.exit_code, 0)

        server = BioSeqDatabase.open_database(driver = self.dbdriver, user = self.dbuser,
                             passwd = self.dbpassword, host = self.dbhost, db = self.dbname)

        rows = server.adaptor.execute_and_fetchall("SELECT name FROM taxon_name where name_class = 'scientific name'")
        self.assertEqual(rows, [('Escherichia coli str. K-12 substr. MG1655',)])
        server.close()
Example #28
 def test_backwards_compatibility(self):
     """Check can re-use an old BioSQL SQLite3 database."""
     original_records = list(SeqIO.parse("GenBank/cor6_6.gb", "gb"))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                           db="BioSQL/cor6_6.db")
     db = server["OLD"]
     self.assertEqual(len(db), len(original_records))
     #Now read them back...
     biosql_records = [db.lookup(name=rec.name) \
                       for rec in original_records]
     #And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
Example #29
    def test_add_from_gff(self):
        """Add in sequences from a gff + fasta file."""
        gff = os.path.join(os.path.dirname(__file__), 'test_files', 'GCF_000005845.2_ASM584v2_genomic.gff')
        fasta = os.path.join(os.path.dirname(__file__), 'test_files', 'GCF_000005845.2_ASM584v2_genomic.fna')
        runner = CliRunner()
        result = runner.invoke(cli.main, self.common_params + ['-g', gff, '-f', fasta, '-D', 'test'])
        self.assertEqual(result.exit_code, 0)

        server = BioSeqDatabase.open_database(driver = self.dbdriver, user = self.dbuser,
                             passwd = self.dbpassword, host = self.dbhost, db = self.dbname)

        rows = server.adaptor.execute_and_fetchall("SELECT name FROM taxon_name where name_class = 'scientific name'")
        self.assertEqual(rows, [])
        server.close()
Example #30
def create_database():
    """Delete any existing BioSQL test database, then (re)create an empty BioSQL database."""
    if DBDRIVER in ["sqlite3"]:
        if os.path.exists(TESTDB):
            os.remove(TESTDB)
    else:
        _do_db_create()

    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                          user = DBUSER, passwd = DBPASSWD,
                                          host = DBHOST, db = TESTDB)
    server.load_database_sql(SQL_FILE)
    server.commit()
    server.close()
Example #31
    def upload_files(self,
                     seqtype,
                     filetype,
                     upload_path,
                     upload_list=None,
                     new_db=False):
        """Upload NCBI/genbank files to a new or existing sqlite database.

        :param seqtype:
        :param filetype:
        :param upload_path:
        :param upload_list:  (Default value = None)
        :param new_db:  (Default value = False)
        :return:
        """

        db_name = Path(self.database_name.stem + '_' + seqtype +
                       self.database_name.suffix)
        db_abs_path = Path(upload_path) / db_name

        # Make sure a BioSQL-SQLite database exists
        # TODO-ROB:  Rework this part
        # if db_abs_path.is_file():
        #     raise FileExistsError
        # elif new_db:
        #     self.copy_template_database(destination=db_abs_path)
        # else:
        #     raise FileNotFoundError("Database not found: %s\nPlease create a BioSQL-SQLite database." % self.database_abs_path)

        if not upload_list:
            upload_list = os.listdir(upload_path)
        # Parse the upload list and upload the files to the BioSQL-SQLite database.
        t_count = 0  # running total of records loaded across all files
        for file in upload_list:
            abs_upload_path = Path(str(upload_path)) / Path(file)

            # Make a connection with the BioSQL database
            try:
                server = BioSeqDatabase.open_database(
                    driver=self.driver.lower(), db=str(db_abs_path))
                self.biosqllog.info("Server Connected.")
                pass
            except:
                self.biosqllog.warn(
                    "The Server did not Connect.  Check the to make sure %s exists."
                    % self.database_abs_path)
                raise FileNotFoundError

            # See if the sub database exists (rna, protein, or genomic)
            try:
                if seqtype not in server.keys():
                    server.new_database(seqtype)
                    self.biosqllog.info(
                        "New Sub-Database created, %s, for %s." %
                        (seqtype, db_abs_path))
                # Connect to the sub database
                sub_db = server[seqtype]

                count = sub_db.load(SeqIO.parse(abs_upload_path, filetype))
                self.biosqllog.info("%s loaded with %s %s files" %
                                    (db_name, count, filetype))
                server.commit()
                self.biosqllog.warn("Server committed.")
                t_count = t_count + count
                self.biosqllog.info(
                    "The server has not loaded a total of %s files." % t_count)
                # TODO-ROB:  Add something to do with time here.
            except:
                self.biosqllog.critical("Unable to load the database...")
                server.rollback()
                try:
                    del server[sub_db]
                    self.biosqllog.critical(
                        "%s sub database deleted from %s.  All of the info will be lost."
                        % (sub_db, db_abs_path))
                    server.commit()
                    self.biosqllog.critical("Server committed")
                except:
                    raise
                raise
Example #32
from BioSQL import BioSeqDatabase
import sys
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

username = raw_input("Please enter user name: ")
password = raw_input("and password: "******"psycopg2", user = username, passwd = password, host = "dbpg-ifi-utv.uio.no", db = "rnammer")


class Genome:

    def __init__(self, ID):
        self.name = ID
        
        self.orginal    = []
        self.new        = []

    def add_org(self,data):
        self.orginal.append(data)
    
    def add_new(self,data):
        self.new.append(data)

    #def clean():
        # compare content of orginal to new
        # NB: can only be done after both are added
Example #33
def loadDB(catalog):
    from BioSQL import BioSeqDatabase
    import sys
    
    username = raw_input("Please enter user name: ")
    password = raw_input("and password: "******"dbpg-ifi-utv.uio.no"
    db_name = "rnammer"

    server = BioSeqDatabase.open_database(driver="psycopg2", user=username,passwd=password, 
            host=host, db=db_name)
    
    biodb_name = "empty"     # genebank problem ? se staving

    db  = "nodb"

    gi_rep = 1
    
    for gbff in catalog:
                               #server.remove_database(source)
        print gi_rep
        print gbff


        parser = GenBank.FeatureParser()
        #record = parser.parse(open(gbff))
        #records = SeqIO.parse(open(gbff),'genbank')
        records = GenBank.Iterator(open(gbff), parser)
        
        for x in records:
            if re.search("plasmid",x.description, re.IGNORECASE):
                continue
            print "Record name:"
            print x.id
            #print dir(x)

            if "Proteobacteria" == x.annotations["taxonomy"][1]:
                print x.annotations["taxonomy"][1]
                print x.annotations["taxonomy"][2]
                biodb_name = x.annotations["taxonomy"][2]
            else :
                print x.annotations["taxonomy"][1]
                biodb_name = x.annotations["taxonomy"][1]
            while True : 
                try :
                    db = server[biodb_name] 
                    #print "here"
                    break
                except KeyError :
                    #print ("Cannot find biodatabase with name %r making it" % source)
                    server.new_database(biodb_name)
                    server.commit()
            db.load([x])
        #record.annotations["gi"] = gi_rep 
        #print type(records)

        #print record.id
        gi_rep = gi_rep + 1

        #db.load([records])

    server.adaptor.commit()
Example #34
    def gbk_upload(self):
        """Upload a BioSQL database with target GenBank data (.gbk files).

        This method is only usable after creating GenBank records with this
        class.  It uploads a BioSQL databases with target GenBank data (.gbk
        files).  This creates a compact set of data for each project.

        :return:  Does not return an object.
        """

        t_count = 0
        # Parse the tier dictionary
        for TIER in self.tier_frame_dict.keys():
            db_name = str(TIER) + '.db'
            db_file_path = self.target_gbk_db_path / Path(db_name)
            # Create the db file if it exists
            if os.path.isfile(str(db_file_path)) is False:
                self.genbanklog.warn(
                    'Copying Template BioSQL Database...  This may take a few minutes...'
                )
                shutil.copy2('Template_BioSQL_DB.db', str(db_file_path))

            # If it already exists then the database is bad, or needs to be update.  Delete it.
            else:
                # TODO-ROB: This part is broken until the template db creation and management is added
                os.remove(str(db_file_path))
                self.genbanklog.warn(
                    'Copying Template BioSQL Database...  This may take a few minutes...'
                )
                shutil.copy2('Template_BioSQL_DB.db', str(db_file_path))

            server = BioSeqDatabase.open_database(driver='sqlite3',
                                                  db=str(db_file_path))
            gene_path = self.raw_data
            # Parse the raw_data folder to get the name of each gene.
            for GENE in os.listdir(str(gene_path)):
                sub_db_name = GENE
                genbank_path = gene_path / Path(GENE) / Path('GENBANK')
                # Parse the GenBank file names for each gene in order to upload them to a custom BioSQL database
                for FILE in os.listdir(str(genbank_path)):
                    # Try to load the database.
                    try:
                        if sub_db_name not in server.keys():
                            server.new_database(sub_db_name)
                        db = server[sub_db_name]
                        count = db.load(SeqIO.parse(FILE, 'genbank'))
                        server.commit()
                        self.genbanklog.info('Server Commited %s' %
                                             sub_db_name)
                        self.genbanklog.info('%s database loaded with %s.' %
                                             (db.dbid, FILE))
                        self.genbanklog.info(
                            "That file contains %s genbank records." %
                            str(count))
                        t_count = t_count + count
                        self.genbanklog.info(
                            'The total number of files loaded so far is %i.' %
                            t_count)
                    # If the database cannot be loaded then rollback the server and raise an error.
                    except BaseException:
                        server.rollback()
                        # Try to delete the sub database and commit
                        try:
                            del server[sub_db_name]
                            server.commit()
                        # If it cannot be deleted then raise an error.
                        except BaseException:
                            raise
                        raise
Example #35
import sqlite3
import fnmatch
import time
import Bio
import csv
import os
import pexpect
from Bio import SeqIO
from BioSQL import BioSeqDatabase

server_db_list = [
]  #list of db names.  I will create a DB for the orthologs of interest soon.

#Open the server that we want to look at
server = BioSeqDatabase.open_database(
    driver="sqlite3",
    db="/work5/r2294/bin/NCBI_data/vertebrate_mammalian/DB/VM_RefseqRNA_DB.db")
#Get the current working directory and set it to the home variable
home = os.getcwd()

#make a list of database names
for db_name in server.keys():
    server_db_list.append(db_name)
print(server_db_list)
input('this is the server_db_list Do you enjoy it?  Very much eh? ......')

#To start you have to parse each subdatabase on the "server" in order to search through each one.
for db_name in server.keys():
    db = server[db_name]
    print(db_name)
Example #36
def fetch_gis(email, db_name, tool, batch_size, log_file, save_file_directory):

    # BASE LOGGING CONFIG
    logging.basicConfig(
        filename=log_file,
        level=logging.INFO,
        filemode='a',  # append to log file
        format='%(asctime)s:%(levelname)s:%(name)s:%(message)s')

    # GET bioentries gis that have already been imported.
    # GET gi_queues that have not been imported
    # SELECT thouse gis that are not fetched and not in bioentry
    with psycopg2.connect("dbname=%s" % (db_name)) as conn:
        with conn.cursor() as cur:
            cur.execute("""SELECT identifier FROM bioentry""")
            bioentry_ids = set(map(lambda x: x[0], cur.fetchall()))
            logging.info("found {0} gis in bioentry".format(len(bioentry_ids)))
            cur.execute("""SELECT gi FROM gi_queues WHERE fetched IS false""")
            gi_queues_ids = set(map(lambda x: x[0], cur.fetchall()))
            logging.info("found {0} gis in qi_queues".format(
                len(gi_queues_ids)))
            fetch_gis = list(gi_queues_ids - bioentry_ids)
            logging.info("will fetch {0} gis from ncbi.".format(
                len(fetch_gis)))

    # SELECT the biodatabase to use
    # GET proteins from NCBI
    # IMPORT proteins to bioentries
    # LOG which gis imported and which failed
    #
    # Fetch in blocks of 1000
    for i in xrange((len(fetch_gis) + batch_size - 1) / batch_size):
        try:
            server = BioSeqDatabase.open_database(driver="psycopg2",
                                                  db=db_name)
            if db_name in server:
                db = server[db_name]
            else:
                db = server.new_database(db_name,
                                         description="sll biosql test")
                server.commit()

            Entrez.email = email
            Entrez.tool = tool

            logging.info(
                "Fetch gis in batch of #{0}. Now fetching from {1} .".format(
                    batch_size, i * batch_size))
            fh = Entrez.efetch(db="protein",
                               rettype="gp",
                               retmode="text",
                               id=fetch_gis[i * batch_size:(i + 1) *
                                            batch_size])
            sleep(0.5)
            seqs = list(SeqIO.parse(fh, "gb"))

            for seq in seqs:
                f = gzip.open(
                    os.path.join(save_file_directory,
                                 seq.annotations["gi"] + ".gb.gz"), 'wb')
                SeqIO.write(seq, f, "genbank")
            logging.info("Fetched {0} sequences from NCBI.".format(len(seqs)))
            count = db.load(seqs)
            logging.info("Inserted {0} sequences into biosql".format(count))

            if count != len(fetch_gis):
                logging.warn("Not equally many gis to fetch as were inserted")
            not_fetched_gis = set()
            for i in fetch_gis[i * batch_size:(i + 1) * batch_size]:
                try:
                    entry = db.lookup(gi=i)
                except IndexError as e:
                    logging.warn("Gi: {0} not inserted in database".format(i))
                    not_fetched_gis.add(i)
            server.commit()
        except Exception as e:
            tb = traceback.format_exc()
            logging.error("Error: {0}\nTraceback: {1}".format(e, tb))
        finally:
            fh.close()
            server.close()

            # UPDATE gi_queue entries that have been fetched
            with psycopg2.connect("dbname=%s" % db_name) as conn:
                with conn.cursor() as cur:
                    cur.execute(
                        """UPDATE gi_queues SET fetched=true,updated_at=now() WHERE gi IN ('{0}')"""
                        .format("','".join(set(fetch_gis) - not_fetched_gis)))
                    logging.info(
                        "Updated status of {0} gi_queues rows out of {1} gis that were added to bioentry"
                        .format(cur.rowcount, count))
Example #37
def _get_db(dbpath=dbpath, db=dbname):
    server = BioSeqDatabase.open_database(driver='sqlite3', db=dbpath)
    return server[db]
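A minimal usage sketch; dbpath and dbname are assumed to be module-level defaults as above, and the accession below is a hypothetical placeholder:

db = _get_db()
record = db.lookup(accession="NC_000913")  # returns a DBSeqRecord
print(record.id, len(record))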
Example #38
#!/usr/bin/env python
"""Test timing of loading records into a BioSQL database."""
from __future__ import print_function

import time
# set up the connection
from Bio import GenBank
from BioSQL import BioSeqDatabase


server = BioSeqDatabase.open_database(host="192.168.0.192", user="******",
                                      passwd="", db="pythonloadtest")

# remove the database if it already exists
db_name = "testload"
try:
    server[db_name]
    server.remove_database(db_name)
except KeyError:
    pass
db = server.new_database(db_name)

input_file = "/home/hack/install/biopython/Tests/GenBank/cor6_6.gb"
handle = open(input_file, "r")
parser = GenBank.FeatureParser()
iterator = GenBank.Iterator(handle, parser)

# -- do the timing part
start_time = time.time()
num_records = db.load(iterator)
end_time = time.time()
Example #39
temp_var['log_file_rank'] = str(temp_var['log_file']) + str(
    rank)  # Each process gets its own unique log file
ser_loc = where.DB
loaded_list = []
t_count = 0

# Open a logging file and begin the process of uploading
with open(where.LOG + '/Temp/' + temp_var['log_file_rank'], 'w') as log_w:
    for file in temp_var['small_list']:
        print('file: ', file)
        log = []
        log.append('file: %s' % file)
        # Creates or opens an existing server.  If the database cannot be created or opened, the file is removed and the error re-raised
        try:
            server = BioSeqDatabase.open_database(driver='sqlite3',
                                                  db=ser_loc + '/' +
                                                  temp_var['db_name'] + '.' +
                                                  temp_var['key'] + '.db')
            print('server created')
            log.append('server created')
        except:
            print('server not created')
            log.append('server not created')
            os.remove(ser_loc + ('/%s.%s.db' %
                                 (temp_var['db_name'], temp_var['key'])))
            raise

        # Deprecated (all files are RNA, but I originally wanted to get the other types as well)
        s = str(file).lower()
        if s.find("rna") != -1:
            sub_db = 'RNA'
        elif s.find("protein") != -1:
Example #40
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 19 11:16:37 2016

@author: rgilmore
"""

import os
from Bio import SeqIO
import csv
from BioSQL import BioSeqDatabase
from fnmatch import fnmatch

server = BioSeqDatabase.open_database(
    driver="sqlite3",
    db="/work5/r2294/bin/NCBI_data/vertebrate_mammalian/DB/GPCR_Orthologs_DB.db"
)
count = 0
home = os.getcwd()
os.chdir('/work5/r2294/bin/NCBI_data/Raw_GBK_Files/HTR1A')
for file in (os.listdir('/work5/r2294/bin/NCBI_data/Raw_GBK_Files/HTR1A')):
    if not fnmatch(file, '*.gbk'):
        continue

    print(file)
    db = server['HTR1A']
    try:
        c = db.load(SeqIO.parse(file, 'genbank'))
    except:
        print('error')
        continue
Example #41
#!/usr/bin/env python

from Bio import GenBank
from Bio import Entrez
from BioSQL import BioSeqDatabase
import sys

# Should read these from settings at some point
dbpath = 'biosql.sqlite3'
dbname = 'local_db'
Entrez.email = '*****@*****.**'

server = BioSeqDatabase.open_database(driver='sqlite3', db=dbpath)
db = server[dbname]

parser = GenBank.FeatureParser()
loadgb = lambda _id: db.load(
    GenBank.Iterator(
        Entrez.efetch(db='nucleotide', id=_id, rettype='gb', retmode='text'),
        parser))

ACCESSIONS_FILE = 'accession.lst' if len(sys.argv) < 2 else sys.argv[1]
for id in open(ACCESSIONS_FILE):
    print "Loading %s" % id
    loadgb(id)
server.adaptor.commit()
Example #42
        '--seqfeature',
        help=
        'The first column of the input file is the seqfeature id used by the database. Does not apply when using a gff file as input',
        action='store_true',
        default=False)
    parser.add_argument(
        '--replace',
        help='replace any existing annotations for the given qualifiers',
        action='store_true',
        default=False)
    args = parser.parse_args()
    if args.password is None:
        args.password = getpass("Please enter the password for user " + \
                args.user + " on database " + args.database)

    server = BioSeqDatabase.open_database(driver=args.driver,
                                          db=args.database,
                                          user=args.user,
                                          host=args.host,
                                          passwd=args.password)

    db = server[args.dbname]

    if args.input is not None:
        mapping = parse_input(args.input)
    else:
        mapping = parse_gff(args.gff)

    add_annotation(db, mapping, args.seqfeature, args.replace)
    server.commit()
Example #43
parser.add_argument('-user', help='database user', default="root")
parser.add_argument('-host', help='database host', default="localhost")
parser.add_argument('-passwd', help='database password', required=True)
args = parser.parse_args()

from BioSQL import BioSeqDatabase

cs = []
with gzip.open(args.refseq, "rt") as h:
    for x in tqdm(bpio.parse(h, "gb")):
        cs.append(x)
assert (cs)

server = BioSeqDatabase.open_database(driver="MySQLdb",
                                      user=args.user,
                                      passwd=args.passwd,
                                      host=args.host,
                                      db=args.db)

acc = "GCF_" + args.refseq.split("_")[1]
db = server.new_database(acc, description="")
server.commit()

count = db.load(tqdm(cs))
print(count)
server.commit()

db = server.new_database(acc + "_prots", description="")
server.commit()
prots = []
for x in tqdm(cs):
Example #44
from BioSQL import BioSeqDatabase
server = BioSeqDatabase.open_database(driver="MySQLdb",
                                      user="******",
                                      passwd="FurtherFlowersVenus",
                                      host="localhost",
                                      db="bioseqdb")
db = server.new_database("just_testing", description="Just for testing")
server.commit()  #On Biopython 1.49 or older, server.adaptor.commit()
Example #45
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 17 15:02:37 2016

@author: rgilmore
"""

import sqlite3
from BioSQL import BioSeqDatabase
from Bio import GenBank
import os

server = BioSeqDatabase.open_database(driver="sqlite3", db="biosql.db")

db = server.new_database("HTR1A")

dir_list1 = os.listdir()
print(dir_list1)

#for files in  dir_list1:
parser = GenBank.FeatureParser()
iterator = GenBank.Iterator(open("HTR1A_Ailuropoda melanoleuca.gbk"), parser)
db.load(iterator)
db.adaptor.commit()
#input("%s loaded into HTR1A database.  Proceed?")

server.commit()
server.close()
Example #46
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver,
                                          db=args.database,
                                          user=args.user,
                                          host=args.host,
                                          passwd=args.password)

    tax_name = False
    try:
        ncbi_tax = int(args.taxid)
    except ValueError:
        tax_name = True

    if not tax_name:
        print("interpreting as an NCBI taxon ID...", file=sys.stderr)
        taxon_id_lookup_sql = "SELECT bioentry_id, taxon_id, biodatabase.name FROM bioentry JOIN "\
                "biodatabase USING(biodatabase_id) WHERE taxon_id IN "\
                "(SELECT DISTINCT include.taxon_id FROM taxon "\
                "INNER JOIN taxon as include ON (include.left_value "\
                "BETWEEN taxon.left_value AND taxon.right_value) "\
                "WHERE taxon.ncbi_taxon_id  = %s AND include.right_value = include.left_value + 1)"

        rows = server.adaptor.execute_and_fetchall(taxon_id_lookup_sql,
                                                   (ncbi_tax, ))
    else:
        print("interpreting as a taxon name...", file=sys.stderr)
        taxon_name_lookup_sql = "SELECT bioentry_id, taxon_id, biodatabase.name FROM bioentry JOIN "\
                "biodatabase USING(biodatabase_id) WHERE taxon_id IN "\
                "(SELECT DISTINCT include.taxon_id FROM taxon "\
                "INNER JOIN taxon as include ON (include.left_value "\
                "BETWEEN taxon.left_value AND taxon.right_value) "\
                "WHERE taxon.taxon_id IN (SELECT taxon_id FROM taxon_name "\
                "WHERE name like %s) AND include.right_value = include.left_value + 1)"
        rows = server.adaptor.execute_and_fetchall(taxon_name_lookup_sql,
                                                   (args.taxid, ))

    if args.feature_type is not None:
        types = args.feature_type
    elif args.output_format == 'feat-prot':
        types = ['CDS']
    elif args.output_format == 'feat-nucl':
        types = ['CDS', 'rRNA', 'tRNA']

    if len(rows) == 0:
        print(
            "There does not appear to be any sequences associated with\n"
            "the taxonomy provided. If you used a taxonomy name, make sure\n"
            "it is spelled correctly. If you used an NCBI taxonomy ID, make\n"
            "sure that it is correct.",
            file=sys.stderr)
        sys.exit(1)

    dbids = {}
    for row in rows:
        dbids[(row[0], row[2])] = row[1]
    files = {}
    taxid_to_dbids = {}
    if args.split_species:
        taxon_file_mapping = {}
        for k, v in dbids.items():
            tname = server.adaptor.execute_and_fetch_col0(
                "SELECT name from taxon_name where taxon_id = %s and name_class = %s",
                (v, 'scientific name'))[0]
            tname = tname.replace(' ', '_')
            if args.output_format == 'gb':
                tname += '.gb'
            elif args.output_format == 'feat-prot':
                tname += '.faa'
            else:
                tname += '.fna'
            files[v] = tname
            taxid_to_dbids.setdefault(v, []).append(k)

    if args.split_species:
        # got to save all of the records before printing them out
        outdata = {}
        for taxid, dbid_list in taxid_to_dbids.items():
            for dbid, dbname in dbid_list:
                db = server[dbname]
                seq_rec = db[dbid]
                outdata.setdefault(taxid, []).append(seq_rec)

        for taxid, dbrecs in outdata.items():
            with open(files[taxid], 'w') as fp:
                if 'feat' in args.output_format:
                    for dbrec in dbrecs:
                        extract_feature(dbrec, args.output_format, fp)
                else:
                    SeqIO.write(dbrecs, fp, args.output_format)

    else:
        if args.output_format == 'feat-prot':
            extract_feature_sql(server,
                                get_seqfeature_ids_for_bioseqs(
                                    server, [x[0] for x in dbids.keys()]),
                                type=types,
                                translate=True)
        elif args.output_format == 'feat-nucl':
            extract_feature_sql(server,
                                get_seqfeature_ids_for_bioseqs(
                                    server, [x[0] for x in dbids.keys()]),
                                type=types)
        else:
            for (dbid, dbname), taxid in dbids.items():
                db = server[dbname]
                try:
                    dbrec = db[dbid]
                    SeqIO.write(dbrec, sys.stdout, args.output_format)
                except KeyError:
                    pass
Example #47
def main():
    """This is run if file is directly executed, but not if imported as
    module. Having this in a separate function  allows importing the file
    into interactive python, and still able to execute the
    function for testing"""
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file",
                        required=True,
                        help="input file",
                        type=str)

    parser.add_argument("-l", "--locus",
                        required=True,
                        help="Locus",
                        type=str)

    parser.add_argument("-k", "--kir",
                        help="Option for running with KIR",
                        action='store_true')

    parser.add_argument("-s", "--server",
                        help="Option for running with a server",
                        action='store_true')

    parser.add_argument("-v", "--verbose",
                        help="Option for running in verbose",
                        action='store_true')

    args = parser.parse_args()
    fastafile = args.file
    locus = args.locus

    verbose = False
    if args.verbose:
        verbose = True

    kir = False
    if args.kir:
        kir = True

    serv = False
    if args.server:
        serv = True

    if verbose:
        logging.basicConfig(format='%(asctime)s - %(name)-35s - %(levelname)-5s - %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p',
                            level=logging.INFO)

    server = None
    if serv:
        server = BioSeqDatabase.open_database(driver="pymysql", user="******",
                                              passwd="", host="localhost",
                                              db="bioseqdb")

    seqann = BioSeqAnn(verbose=True, kir=kir)
    for seq in SeqIO.parse(fastafile, "fasta"):
        ann = seqann.annotate(seq, locus=locus)
        print('{:*^20} {:^20} {:*^20}'.format("", str(seq.description), ""))
        l = 0
        for f in ann.annotation:
            if isinstance(ann.annotation[f], DBSeq):
                print(f, ann.method, str(ann.annotation[f]), sep="\t")
                l += len(ann.annotation[f])
            else:
                print(f, ann.method, str(ann.annotation[f].seq), sep="\t")
                l += len(ann.annotation[f].seq)
        print("")

    if serv:
        server.close()
Example #48
def main():
    """This is run if file is directly executed, but not if imported as
    module. Having this in a separate function  allows importing the file
    into interactive python, and still able to execute the
    function for testing"""
    parser = argparse.ArgumentParser()
    parser.add_argument("-v",
                        "--verbose",
                        help="Option for running in verbose",
                        action='store_true')

    parser.add_argument("-n",
                        "--number",
                        required=False,
                        help="Number of IMGT/DB releases",
                        default=1,
                        type=int)

    parser.add_argument("-r",
                        "--releases",
                        required=False,
                        help="IMGT/DB releases",
                        type=str)

    args = parser.parse_args()
    releases = args.releases
    number = args.number

    if args.verbose:
        verbose = True
    else:
        verbose = False

    if releases:
        dblist = [db for db in releases.split(",")]
    else:
        try:
            versions_url = "https://www.ebi.ac.uk/ipd/imgt/hla/docs/release.html"
            df = pd.read_html(versions_url)[0]
            x = df.columns
            dblist = [l.replace(".", '') for l in df[x[0]].tolist()[0:number]]
        except ValueError as err:
            db_error = "Failed to load DB list: {0}".format(err)
            logging.info(db_error)
            logging.info("Defaulting to Latest")
            dblist = ["Latest"]

    # Connecting to mysql DB
    server = BioSeqDatabase.open_database(driver="pymysql",
                                          user="******",
                                          passwd="my-secret-pw",
                                          host="localhost",
                                          db="bioseqdb")

    if verbose:
        dbversions_str = ",".join(dblist)
        logging.info("IMGT/HLA DB Versions = " + dbversions_str)

    # Looping through DB versions
    for dbv in dblist:

        # Downloading hla.dat file
        hladat = download_dat(dbv)

        if verbose:
            logging.info("Finished downloading hla.dat file for " + str(dbv))

        # Downloading allele list
        allele_list = download_allelelist(dbv)

        if verbose:
            logging.info("Finished downloading allele list for " + str(dbv))

        hla_names = {}
        try:
            # File formats change...
            with open(allele_list, 'r') as f:
                for line in f:
                    line = line.rstrip()
                    if re.search("#", line) or re.search("AlleleID", line):
                        continue
                    accession, name = line.split(",")
                    hla_names.update({accession: name})
            if verbose:
                nalleles = len(hla_names.keys())
                logging.info("Finished loading " + str(nalleles) +
                             " alleles for " + str(dbv))
        except ValueError as err:
            list_error = "Allelelist error: {0}".format(err)
            logging.error(list_error)
            server.close()
            os.remove(hladat)
            os.remove(allele_list)
            sys.exit()

        cmd = "perl -p -i -e 's/[^\\x00-\\x7F]//g' " + hladat
        os.system(cmd)

        # Loading sequence data from hla.dat file
        try:
            seq_list = list(SeqIO.parse(hladat, "imgt"))
        except Exception as err:
            read_error = "Failed to read {0} as an IMGT dat file: {1}".format(hladat, err)
            logging.error(read_error)
            server.close()
            os.remove(hladat)
            os.remove(allele_list)
            sys.exit()

        new_seqs = {
            "A": [],
            "B": [],
            "C": [],
            "DRB1": [],
            "DQB1": [],
            "DRB3": [],
            "DRB4": [],
            "DRB5": [],
            "DQA1": [],
            "DPA1": [],
            "DPB1": []
        }

        # Changing the sequence name to
        # the HLA allele name instead of the accession
        for seq in seq_list:
            if seq.name in hla_names:
                loc, allele = hla_names[seq.name].split("*")
                if loc in new_seqs:
                    hla_name = "HLA-" + hla_names[seq.name]
                    if hla_name not in skip_alleles:
                        seq.name = hla_name
                        new_seqs[loc].append(seq)

        dbsp = list(dbv)
        descr = ".".join([dbsp[0], dbsp[1] + dbsp[2], dbsp[3]])

        if verbose:
            logging.info("Loaded IMGT dat file " + descr)

        # Looping through and loading each locus
        for locus in new_seqs:
            dbname = dbv + "_" + locus
            dbdescription = "IMGT/HLA " + descr + " " + locus
            db = server.new_database(dbname, description=dbdescription)
            try:
                count = db.load(new_seqs[locus])
            except Exception as err:
                logging.error("Failed to load {0}: {1}".format(dbname, err))
                server.close()
                os.remove(hladat)
                os.remove(allele_list)
                sys.exit()

            if verbose:
                logging.info("Loaded " + str(count) + " for " + dbname)

            # Committing data to the MySQL DB
            server.commit()

        # Removing hla.dat and allele list files
        os.remove(hladat)
        os.remove(allele_list)

        if verbose:
            logging.info("Finished loading " + descr)

    server.close()
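
With the loader finished, every locus sits in its own "<release>_<locus>" sub-database. A minimal retrieval sketch, assuming an open server handle like the one created in main() and that release 3310, locus A, was among the versions loaded (the release and allele name are hypothetical):

db = server["3310_A"]                           # hypothetical release/locus pair
record = db.lookup(name="HLA-A*01:01:01:01")    # hypothetical allele name
print(record.name, len(record.seq), sep="\t")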
Example #49
import os
import sys

import MySQLdb

from Bio import SeqIO
from Bio.Seq import Seq
from BioSQL import BioSeqDatabase


def get_mysql_conn():
    """Open the BioSQL server and the 'medicago' sub-database."""
    global biosql_server, DB
    try:
        # Quick connectivity check with the raw driver before opening BioSQL
        conn = MySQLdb.connect(host=os.environ["MYSQL_HOST"],
                               user=os.environ["MYSQL_USER"],
                               passwd=os.environ["MYSQL_PW"],
                               db=os.environ["MYSQL_DB"])
        conn.close()
    except MySQLdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
    biosql_server = BioSeqDatabase.open_database(
        driver="MySQLdb", user=os.environ["MYSQL_USER"],
        passwd=os.environ["MYSQL_PW"],
        host=os.environ["MYSQL_HOST"], db=os.environ["MYSQL_DB"])
    DB = biosql_server["medicago"]


def createSubDB(dbname, desc):
    db = biosql_server.new_database(dbname, description=desc)
    biosql_server.commit()


def loadSeq2DB(seqRcdLst):
    try:
        DB.load(seqRcdLst)
        biosql_server.adaptor.commit()
    except Exception:
        biosql_server.adaptor.rollback()
        raise
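
A hypothetical driver for the helpers above, assuming the MYSQL_* environment variables are set and that "contigs.fasta" exists (both made up for illustration):

if __name__ == "__main__":
    get_mysql_conn()                                  # sets biosql_server and DB
    records = list(SeqIO.parse("contigs.fasta", "fasta"))
    loadSeq2DB(records)                               # load into the "medicago" sub-database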
Example #50
    dat = ".".join([db, "hla", "dat"])
    urllib.request.urlretrieve(url, dat)
    return dat


def download_allelelist(db):
    url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist.' + db + '.txt'
    alist = ".".join([db, "Allelelist", "txt"])
    urllib.request.urlretrieve(url, alist)
    return alist


dblist = ["".join([str(i), str("0")]) for i in range(326, 331)]
server = BioSeqDatabase.open_database(driver="pymysql",
                                      user="******",
                                      passwd="",
                                      host="localhost",
                                      db="bioseqdb")

for dbv in dblist:

    hladat = download_dat(dbv)
    allele_list = download_allelelist(dbv)

    hla_names = {}
    try:
        s = "," if dbv == "3260" or dbv == "3270" else " "
        with open(allele_list, 'r') as f:
            for line in f:
                line = line.rstrip()
                accession, name = line.split(s)
Example #51
TESTDB = temp_db_filename()

# This will abort if driver not installed etc:
check_config(DBDRIVER, DBTYPE, DBHOST, DBUSER, DBPASSWD, TESTDB)

# Some of the unit tests don't create their own database,
# so just in case there is no database already:
TESTDB = create_database()

if False:
    # This is how I generated test file Tests/BioSQL/cor6_6.db
    # which is test cross-checked with the latest bindings to
    # catch any regressions in how we map GenBank entries to
    # the database.
    assert not os.path.isfile("BioSQL/cor6_6.db")
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          db="BioSQL/cor6_6.db")
    DBSCHEMA = "biosqldb-" + DBTYPE + ".sql"
    SQL_FILE = os.path.join(os.getcwd(), "BioSQL", DBSCHEMA)
    assert os.path.isfile(SQL_FILE), SQL_FILE
    server.load_database_sql(SQL_FILE)
    server.commit()
    db = server.new_database("OLD")
    count = db.load(SeqIO.parse("GenBank/cor6_6.gb", "gb"))
    assert count == 6
    server.commit()
    assert len(db) == 6
    server.close()


class BackwardsCompatibilityTest(unittest.TestCase):
    def test_backwards_compatibility(self):
Example #52
def fetchseq(ids,
             species,
             write=False,
             output_name='',
             delim='\t',
             id_type='brute',
             server=None,
             source="SQL",
             database="bioseqdb",
             database_path=None,
             host='localhost',
             driver='psycopg2',
             version='1.0',
             user='******',
             passwd='',
             email='',
             batch_size=50,
             output_type="fasta",
             verbose=1,
             n_threads=1,
             n_subthreads=1,
             add_length=(0, 0),
             indent=0):
    if isgenerator(ids):
        if verbose > 1:
            print('Received generator!', indent=indent)
    elif isinstance(ids, list):
        if verbose > 1:
            print('Received list!', indent=indent)
    else:
        if verbose > 1:
            print('Reading ID File... ', indent=indent)
        with ids.open('r') as in_handle:
            id_prelist = [line.strip() for line in in_handle
                          ]  # list of each line in the file
            print('Done!', indent=indent)
        ids = [id_item for id_item in filter(None, id_prelist) if id_item]
        if not id_prelist or id_prelist is None:
            if verbose:
                print('id_prelist is empty!', indent=indent)
            return 'None'
    for id_item in ids:
        assert len(id_item) == 12, (
            "Item {0} in id_list has {1} items, not 5!\n"
            "Format should be: "
            "chr, (start,end), id, score, strand, thickStart, thickEnd, rgb, blockcount,"
            " blockspans, blockstarts, query_span"
            "!").format(
                " ".join((" ".join(item) if not isinstance(item, str) else item
                          for item in id_item)), len(id_item))
    if verbose > 1:
        print('Readied ids!', indent=indent)

    id_list = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    if 'sql' in source.lower():
        if server is None:
            try:
                if verbose > 1:
                    print('No server received, opening server...',
                          indent=indent)
                server = BioSeqDatabase.open_database(driver=driver,
                                                      user=user,
                                                      passwd=passwd,
                                                      host=host,
                                                      database=database)
                if verbose > 1:
                    print('Done!', indent=indent)
            except Exception as err:
                if verbose > 1:
                    print('Failed to open server!', indent=indent)
                    print(str(type(err)), err, sep=' ', indent=indent)
                raise
        else:
            if verbose > 1:
                print('Received server handle:', indent=indent)
                print(server, indent=indent)
            if verbose > 2:
                print('Please note the sub_databases of server:\n\t',
                      [str(i) for i in server.keys()],
                      indent=indent)
    elif source.lower() in ['fasta', '2bit', 'twobit']:
        print('Search type: ', source, indent=indent)
    else:
        raise SearchEngineNotImplementedError(
            'Search using source {} has not yet been implemented!'.format(
                source))
    if verbose > 1:
        print('Creating FetchSeq Processes...', indent=indent)
    fs_instances = [
        FetchSeqMP(id_queue=id_list,
                   seq_out_queue=results,
                   delim=delim,
                   id_type=id_type,
                   server=server,
                   species=species,
                   source=source,
                   database=database,
                   database_path=database_path,
                   host=host,
                   driver=driver,
                   version=version,
                   user=user,
                   passwd=passwd,
                   email=email,
                   output_type=output_type,
                   batch_size=batch_size,
                   verbose=verbose,
                   n_subthreads=n_subthreads,
                   add_length=add_length,
                   indent=indent + 1) for _ in range(n_threads)
    ]
    if verbose > 1:
        print('Done! Starting processes...', indent=indent)
    for fs in fs_instances:
        fs.start()
    if verbose > 1:
        print('Done!', indent=indent)
        print('Assigning FetchSeq records to queue... ', indent=indent)
    id_order = []
    for i, id_rec in enumerate(ids):
        try:
            id_order.append("{0}:{1}-{2}".format(id_rec[0], id_rec[1][0],
                                                 id_rec[1][1]))
        except IndexError:
            id_order.append("{0}".format(id_rec[0]))
        try:
            id_list.put(FetchSeq(id_rec=id_rec))
        except AssertionError as err:
            print(i, type(err), err, sep=' ')
            break
    for _ in fs_instances:
        id_list.put(None)
    if verbose > 1:
        print('Done!', indent=indent)
    output_dict = dict()
    missing_items_list = list()
    if verbose > 1:
        print('Getting sequences from processes... ', indent=indent)
    n_jobs = len(ids)
    while n_jobs:
        seq, missing = results.get()
        output_dict[seq[0]] = seq[1]
        missing_items_list.append(missing)
        n_jobs -= 1
    if verbose > 1:
        print('Done! Finished fetching sequences!', indent=indent)
        print('Closing processes!', indent=indent)
    for fs in fs_instances:
        if fs.is_alive():
            fs.join()
    output_list = [output_dict[i] for i in id_order if i in output_dict]
    if write:
        SeqIO.write(output_list, output_name, output_type)
        return
    else:
        if missing_items_list == [None]:
            missing_items_list = None
        return output_list, missing_items_list
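
The print(..., indent=indent) calls throughout fetchseq() rely on an indent-aware wrapper rather than the built-in, which does not accept that keyword. The original helper is not shown here; a minimal stand-in might look like this (purely an assumption about its behaviour):

import builtins


def print(*args, indent=0, **kwargs):
    """Guess at the indent-aware print wrapper fetchseq() expects."""
    # Prefix the output with one tab per indent level, then defer to the builtin
    builtins.print("\t" * indent, end="")
    builtins.print(*args, **kwargs)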
Example #53
#!/usr/bin/env python
# Copyright 2002 Brad Chapman.  All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Test timing of getting records from a BioSQL database."""
from __future__ import print_function

import time

# set up the connection
from BioSQL import BioSeqDatabase

server = BioSeqDatabase.open_database(host="192.168.0.192",
                                      user="******",
                                      passwd="",
                                      db="test_biosql")
db = server["embl_rod"]

# -- do the fasta-only timing part
start_time = time.time()
num_records = 0
for junk_id, record in db.items():
    num_records += 1
    sequence = record.seq.data
    d = record.description
    i = record.id
    n = record.name

end_time = time.time()
elapsed_time = end_time - start_time
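
The snippet stops after computing elapsed_time; a natural follow-up (not in the original) is to report the result:

print("Fetched %i records in %0.2f seconds (%0.4f s per record)"
      % (num_records, elapsed_time, elapsed_time / max(num_records, 1)))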
Example #54
from tqdm import tqdm

from SNDG.Comparative.Pangenome import Pangenome, Strain, sqldb
from SNDG.WebServices.NCBI import NCBI
from Bio import Entrez, SeqIO

from BioSQL import BioSeqDatabase

server = BioSeqDatabase.open_database(driver="MySQLdb",
                                      user="******",
                                      passwd="mito",
                                      host="localhost",
                                      db="bioseqdb")

if __name__ == '__main__':

    from peewee import MySQLDatabase

    mysql_db = MySQLDatabase('bioseqdb', user="******", password="******")
    sqldb.initialize(mysql_db)
    tables = [Pangenome, Strain]

    # for x in tables:
    #     x.create_table()

    Entrez.email = "*****@*****.**"
    query = '"pathogen"[Properties] AND ("Metazoa"[Organism] OR "Viridiplantae"[Organism] OR "Fungi"[Organism] OR "Eukaryota"[Organism] NOT "Metazoa"[Organism] NOT "Fungi"[Organism] NOT "Viridiplantae"[Organism] OR "Bacteria"[Organism] OR txid1224[Orgn] OR "Archaea"[Organism])'
    genomesList = Entrez.read(
        Entrez.esearch(db="genome", term=query, idtype="acc", retmax=10000))

    genomes = Entrez.read(Entrez.esummary(db="genome",
Example #55
    ]

#####################################################################

#TODO - Should we re-use the create_database() function currently
#       defined in test_BioSQL.py here too?  This would allow us
#       to deal with the error of an unknown database...
#
#print "Creating database"
#from setup_BioSQL import create_database
#create_database()

print "Connecting to database"
try :
    server = BioSeqDatabase.open_database(driver = DBDRIVER,
                                      user = DBUSER, passwd = DBPASSWD,
                                      host = DBHOST, db = TESTDB)
except Exception, e :
    message = "Connection failed, check settings in Tests/setup_BioSQL.py "\
              "if you plan to use BioSQL: %s" % str(e)
    raise MissingExternalDependencyError(message)

print "Removing existing sub-database '%s' (if exists)" % db_name
if db_name in server.keys() :
    #Might exist from a failed test run...
    #db = server[db_name]
    server.remove_database(db_name)
    server.commit()

print "(Re)creating empty sub-database '%s'" % db_name
db = server.new_database(db_name)
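
A sketch of the loading step that would typically come next, mirroring the db.load() / server.commit() pattern used elsewhere in these examples; the GenBank input file is only illustrative:

from Bio import SeqIO

print("Loading records into sub-database '%s'" % db_name)
count = db.load(SeqIO.parse("GenBank/cor6_6.gb", "gb"))  # illustrative input file
server.commit()
print("Loaded %i records" % count)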
Example #56
    def create_fasta_files(self):
        """
        Creates FASTA files from database sequences.
        """

        print "Creating FASTA datasets."
        print "  Opening database."
        server = BioSeqDatabase.open_database(**self.dbargs)
        cur = server.adaptor.cursor

        for indexes, name, schema, namespace, max_res in self.dataset:

            print "  Creating dataset %s." % name

            if schema:
                print "    Setting schema %s." % schema
                cur.execute("SET search_path TO %s" % schema)
                server.adaptor.commit()

            # Skip the whole FASTA file creation if the first file
            # already exists
            fasta_name = name + '%3.3d.fas' % 0
            if fasta_name in os.listdir(self.index_dir):
                print "    File %s present - skipping all." % fasta_name
                continue

            # Execute the SQL query (specific namespace or all sequences)
            if namespace:
                dbid = server[namespace].dbid
                sql = """SELECT e.name || ' (' || e.accession || ') '
                         || e.description AS header, s.seq AS residues
                         FROM bioentry e, biosequence s
                         WHERE e.bioentry_id = s.bioentry_id AND
                         e.biodatabase_id = %s ORDER BY e.name;"""
                cur.execute(sql, (dbid, ))
            else:
                sql = """SELECT e.name || ' (' || e.accession || ') '
                         || e.description AS header, s.seq AS residues
                         FROM bioentry e, biosequence s
                         WHERE e.bioentry_id = s.bioentry_id
                         ORDER BY e.name;"""
                cur.execute(sql)

            # Start the residue count above the limit so the first record
            # immediately triggers opening a new file
            num_residues = max_res + 1
            # Dummy counter and file handle, replaced on the first iteration
            file_counter = -1
            fp = StringIO()

            while 1:
                res = cur.fetchone()
                if not res: break
                title = res[0][:_COL_WIDTH - 1]
                sequence = res[1]

                if num_residues + len(sequence) > max_res:
                    num_residues = 0
                    file_counter += 1
                    fp.close()
                    fasta_name = name + '%3.3d.fas' % file_counter
                    print "    Creating file %s." % fasta_name
                    fasta_path = os.path.join(self.index_dir, fasta_name)
                    # Open file in binary mode:
                    # We write in UNIX format with line separator '\n'
                    fp = file(fasta_path, 'wb')

                # Now write the sequence
                fp.write('>%s\n' % title)
                i = 0
                while i < len(sequence):
                    fp.write('%s\n' % sequence[i:i + _COL_WIDTH])
                    i += _COL_WIDTH

                num_residues += len(sequence)
            fp.close()

        server.adaptor.close()
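
A small sanity check one might run afterwards (not part of the original class): count the records written to one of the generated FASTA chunks. The directory, dataset name and chunk index are illustrative only:

import os

from Bio import SeqIO


def count_fasta_records(index_dir, name, chunk=0):
    # Count the sequences in a single generated chunk, e.g. "prot000.fas"
    fasta_path = os.path.join(index_dir, name + '%3.3d.fas' % chunk)
    return sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))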