Esempio n. 1
0
    def lineage_name(self, taxid, reverse=False):
        """Get the lineage names for a taxonomic id

        Starting at ``taxid``, follow ``parent_taxid`` from node to node until
        the node named 'root' is reached, collecting the ``tax_name`` of every
        node visited. The 'root' node itself is not part of the result.

        Arguments:
            taxid (:obj:`int`): a taxid
            reverse (:obj:`bool`): When True, the collected lineage is
                reversed before being returned. Default False

        Returns:
            list: lineage names, starting with the name of ``taxid`` (or
                ending with it when ``reverse`` is True), or None if
                ``taxid`` or one of its ancestors is not found

        """
        try:
            # A single row carries both the name and the parent id, so one
            # lookup per node is enough.
            node = Taxa.get(Taxa.ncbi_taxid == taxid)
            lineage_list = []
            while node.tax_name != 'root':
                lineage_list.append(node.tax_name)
                node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
        except Taxa.DoesNotExist:
            return None
        if reverse is True:
            lineage_list.reverse()
        return lineage_list
Esempio n. 2
0
    def lineage_id(self, acc_number_list):
        """Get the taxonomic lineage (as taxids) for accession ids

        For every accession found in the database, walk up the taxonomy from
        its taxon until the node named 'root' is reached and collect the
        ``ncbi_taxid`` of each node visited ('root' excluded).

        Args:
            acc_number_list (:obj:`list`): a list of accession numbers

        Yields:
            tuple: (accession id, list of taxids from the accession's taxon
                upwards)

        """
        self.check_list_ids(acc_number_list)
        with self.db.atomic():
            matches = Accession.select().where(
                Accession.accession << acc_number_list)
            for record in matches:
                node = record.taxid
                taxids = []
                # Climb one parent at a time until the root node is hit
                while node.tax_name != 'root':
                    taxids.append(node.ncbi_taxid)
                    node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
                yield (record.accession, taxids)
Esempio n. 3
0
    def lineage_id(self, acc_number_list):
        """Yield the taxid lineage of each accession number

        Each accession from ``acc_number_list`` that exists in the database
        is paired with the list of ``ncbi_taxid`` values met while climbing
        from its taxon up to (but not including) the node named 'root'.

        Args:
            acc_number_list (:obj:`list`): a list of accession numbers

        Yields:
            tuple: (accession id, lineage list of taxids)

        """
        self.check_list_ids(acc_number_list)
        with self.db.atomic():
            found = Accession.select().where(
                Accession.accession << acc_number_list)
            for entry in found:
                taxon = entry.taxid
                lineage = []
                while True:
                    if taxon.tax_name == 'root':
                        break
                    lineage.append(taxon.ncbi_taxid)
                    # Move one level up in the taxonomy
                    taxon = Taxa.get(Taxa.ncbi_taxid == taxon.parent_taxid)
                yield (entry.accession, lineage)
Esempio n. 4
0
    def cache_taxids():
        """Build a lookup dictionary of every taxid in the taxa table

        Returns:
            data (:obj:`dict`): maps each ``ncbi_taxid`` (converted to
                :obj:`str`) to True

        """
        rows = Taxa.select(Taxa.ncbi_taxid).dicts()
        return {str(row['ncbi_taxid']): True for row in rows}
Esempio n. 5
0
    def cache_taxids():
        """Collect all taxids of the taxa table as dictionary keys

        Returns:
            data (:obj:`dict`): one entry per row of the taxa table, keyed by
                the string form of ``ncbi_taxid`` and valued True

        """
        taxid_keys = (
            str(record['ncbi_taxid'])
            for record in Taxa.select(Taxa.ncbi_taxid).dicts()
        )
        return dict.fromkeys(taxid_keys, True)
Esempio n. 6
0
    def lineage_id(self, taxid, ranks=False, reverse=False):
        """Get lineage for a taxonomic id

        Starting at ``taxid``, follow ``parent_taxid`` from node to node until
        the node named 'root' is reached, collecting the ``ncbi_taxid`` of
        every node visited. The 'root' node itself is not part of the result.

        Args:
            taxid (:obj:`int`): a taxid
            ranks (:obj:`bool`): Whether to return a dict mapping each
                node's ``lineage_level`` to its taxid instead of a list.
                Nodes sharing the same ``lineage_level`` overwrite each
                other in that dict. Default False
            reverse (:obj:`bool`): When True, the list is reversed before
                being returned. Only applied when ``ranks`` is False.
                Default False
        Returns:
            list or dict: lineage taxids starting with ``taxid`` (list), or
                rank -> taxid mapping (dict) when ``ranks`` is set, or None
                if ``taxid`` or one of its ancestors is not found

        """
        try:
            # One row holds name, id, parent and rank: fetch it once per
            # node rather than once per attribute.
            node = Taxa.get(Taxa.ncbi_taxid == taxid)
            lineages = {} if ranks else []
            while node.tax_name != 'root':
                if ranks:
                    lineages[node.lineage_level] = node.ncbi_taxid
                else:
                    lineages.append(node.ncbi_taxid)
                node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
        except Taxa.DoesNotExist:
            return None
        if reverse is True and ranks is False:
            lineages.reverse()
        return lineages
Esempio n. 7
0
    def lineage_id(self, taxid, ranks=False, reverse=False):
        """Get lineage for a taxonomic id

        Starting at ``taxid``, follow ``parent_taxid`` from node to node until
        the node named 'root' is reached, collecting one entry per node
        visited. The 'root' node itself is not part of the result.

        Args:
            taxid (:obj:`int`): a taxid
            ranks (:obj:`bool`): Whether to return ``(lineage_level, taxid)``
                tuples instead of bare taxids. Default False
            reverse (:obj:`bool`): When True, the list is reversed before
                being returned. Only applied when ``ranks`` is False.
                Default False
        Returns:
            list: lineage entries starting with ``taxid`` (taxids, or
                ``(rank, taxid)`` tuples when ``ranks`` is set), or None if
                ``taxid`` or one of its ancestors is not found

        """
        try:
            # One row holds name, id, parent and rank: fetch it once per
            # node rather than once per attribute.
            node = Taxa.get(Taxa.ncbi_taxid == taxid)
            lineages = []
            while node.tax_name != 'root':
                if ranks:
                    lineages.append((node.lineage_level, node.ncbi_taxid))
                else:
                    lineages.append(node.ncbi_taxid)
                node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
        except Taxa.DoesNotExist:
            return None
        # NOTE(review): reversal is skipped when ranks is requested, although
        # the tuple list could be reversed too -- confirm this is intended.
        if reverse is True and ranks is False:
            lineages.reverse()
        return lineages
Esempio n. 8
0
 def test_accessionparser_accession2taxid(self):
     """Check the accession parser yields the expected number of entries"""
     # The parsing method needs a database connection: set up the test
     # database with both tables before parsing.
     taxadb = TaxaDB(dbtype='sqlite', dbname=self.testdb)
     for model in (Taxa, Accession):
         taxadb.db.create_table(model, safe=True)
     # Load names.dmp and nodes.dmp into the Taxa table, chunk by chunk
     dump_parser = TaxaDumpParser(nodes_file=self.nodes,
                                  names_file=self.names,
                                  verbose=True)
     taxa_rows = dump_parser.taxdump()
     with taxadb.db.atomic():
         for start in range(0, len(taxa_rows), self.chunk):
             batch = taxa_rows[start:start + self.chunk]
             Taxa.insert_many(batch).execute()
     acc_parser = Accession2TaxidParser(acc_file=self.acc,
                                        chunk=self.chunk,
                                        verbose=True)
     # Each item produced by the parser is a batch of entries
     entry_count = sum(len(batch) for batch in acc_parser.accession2taxid())
     self.assertEqual(entry_count, 55211)
Esempio n. 9
0
 def test_accessionparser_accession2taxid(self):
     """Check the accession parser yields the expected number of entries"""
     # The parsing method needs a database connection: set up the test
     # database with both tables before parsing.
     taxadb = TaxaDB(dbtype='sqlite', dbname=self.testdb)
     for model in (Taxa, Accession):
         taxadb.db.create_tables([model])
     # Load names.dmp and nodes.dmp into the Taxa table, chunk by chunk
     dump_parser = TaxaDumpParser(nodes_file=self.nodes,
                                  names_file=self.names,
                                  verbose=True)
     taxa_rows = dump_parser.taxdump()
     with taxadb.db.atomic():
         for start in range(0, len(taxa_rows), self.chunk):
             batch = taxa_rows[start:start + self.chunk]
             Taxa.insert_many(batch).execute()
     acc_parser = Accession2TaxidParser(acc_file=self.acc,
                                        chunk=self.chunk,
                                        verbose=True)
     # Each item produced by the parser is a batch of entries
     entry_count = 0
     for batch in acc_parser.accession2taxid():
         entry_count = entry_count + len(batch)
     self.assertEqual(entry_count, 55211)
Esempio n. 10
0
    def sci_name(self, taxid):
        """Get the scientific name of a taxonomy id

        Look up the taxa row whose ``ncbi_taxid`` equals ``taxid`` and return
        its ``tax_name``.

        Args:
            taxid (:obj:`int`): a taxid
        Returns:
            str: name, scientific name or None if taxid not found

        """
        try:
            taxon = Taxa.get(Taxa.ncbi_taxid == taxid)
        except Taxa.DoesNotExist:
            return None
        return taxon.tax_name
Esempio n. 11
0
    def sci_name(self, taxid):
        """Return the scientific name stored for a taxid

        Args:
            taxid (:obj:`int`): a taxid
        Returns:
            str: name, the ``tax_name`` of the matching taxa row, or None if
                taxid not found

        """
        try:
            match = Taxa.get(Taxa.ncbi_taxid == taxid)
        except Taxa.DoesNotExist:
            # Unknown taxid: signal it with None rather than an exception
            match = None
        return match.tax_name if match is not None else None
Esempio n. 12
0
    def taxid(self, sci_name):
        """Get taxid from scientific name

        Look up the taxa row whose ``tax_name`` equals ``sci_name`` and
        return its ``ncbi_taxid``.

        Args:
            sci_name (:obj:`str`): a scientific name
        Returns:
            int: ncbi_taxid, taxid matching scientific name or None if
                no taxon has that name
        """
        try:
            taxon = Taxa.get(Taxa.tax_name == sci_name)
        except Taxa.DoesNotExist:
            return None
        return taxon.ncbi_taxid
Esempio n. 13
0
def create_db(args):
    """Main function for the 'taxadb create' sub-command.

    This function creates a taxonomy database with 2 tables: Taxa and
    Accession. The Taxa table is filled from `nodes.dmp` / `names.dmp`, the
    Accession table from the accession2taxid files selected by the division.
    The database connection is closed when the function exits, whether it
    succeeds or raises.

    Args:

        args.input (:obj:`str`): input directory. It is the directory created
            by `taxadb download`
        args.dbname (:obj:`str`): name of the database to be created
        args.dbtype (:obj:`str`): type of database to be used.
        args.division (:obj:`str`): division to create the db for.
        args.chunk (:obj:`int`): number of rows sent per insert statement
        args.verbose (:obj:`bool`): passed on to the parsers
        args.fast (:obj:`bool`): Disables checks for faster db creation. Use
                                 with caution!

    Raises:
        Exception: if the index on the Accession table cannot be created

    """
    database = DatabaseFactory(**args.__dict__).get_database()
    div = args.division  # am lazy at typing
    db.initialize(database)

    nucl_est = 'nucl_est.accession2taxid.gz'
    nucl_gb = 'nucl_gb.accession2taxid.gz'
    nucl_gss = 'nucl_gss.accession2taxid.gz'
    nucl_wgs = 'nucl_wgs.accession2taxid.gz'
    prot = 'prot.accession2taxid.gz'
    acc_dl_list = []

    db.connect()
    try:
        parser = TaxaDumpParser(
            nodes_file=os.path.join(args.input, 'nodes.dmp'),
            names_file=os.path.join(args.input, 'names.dmp'),
            verbose=args.verbose)

        parser.verbose("Connected to database ...")
        # safe=True is expected to leave an existing Taxa table untouched;
        # note that the taxonomy rows below are inserted either way.
        if not Taxa.table_exists():
            parser.verbose("Creating table %s" % str(Taxa.get_table_name()))
        db.create_table(Taxa, safe=True)

        parser.verbose("Parsing files")
        taxa_info_list = parser.taxdump()

        parser.verbose("Inserting taxa data")
        with db.atomic():
            for i in range(0, len(taxa_info_list), args.chunk):
                Taxa.insert_many(taxa_info_list[i:i + args.chunk]).execute()
        print('Taxa: completed')

        parser.verbose("Checking table accession ...")
        # At first load, table accession does not exist yet, we create it
        db.create_table(Accession, safe=True)

        # Select the accession2taxid files matching the requested division
        if div in ['full', 'nucl', 'est']:
            acc_dl_list.append(nucl_est)
        if div in ['full', 'nucl', 'gb']:
            acc_dl_list.append(nucl_gb)
        if div in ['full', 'nucl', 'gss']:
            acc_dl_list.append(nucl_gss)
        if div in ['full', 'nucl', 'wgs']:
            acc_dl_list.append(nucl_wgs)
        if div in ['full', 'prot']:
            acc_dl_list.append(prot)
        parser = Accession2TaxidParser(verbose=args.verbose, fast=args.fast)
        with db.atomic():
            for acc_file in acc_dl_list:
                inserted_rows = 0
                parser.verbose("Parsing %s" % str(acc_file))
                for data_dict in parser.accession2taxid(
                        acc2taxid=os.path.join(args.input, acc_file),
                        chunk=args.chunk):
                    # NOTE(review): only the first args.chunk rows of a batch
                    # are inserted while the whole batch is counted --
                    # presumably batches never exceed args.chunk; confirm.
                    Accession.insert_many(data_dict[0:args.chunk]).execute()
                    inserted_rows += len(data_dict)
                print('%s: %s added to database (%d rows inserted)' %
                      (Accession.get_table_name(), acc_file, inserted_rows))
            if not Accession.has_index(name='accession_accession'):
                print('Creating index for %s' % Accession.get_table_name())
                try:
                    db.create_index(Accession, ['accession'], unique=True)
                except PeeweeException as err:
                    raise Exception("Could not create Accession index: %s" %
                                    str(err))
        print('Accession: completed')
    finally:
        # Release the connection even when parsing or inserting fails
        db.close()
Esempio n. 14
0
def create_db(args):
    """Main function for the 'taxadb create' sub-command.

    This function creates a taxonomy database with 2 tables: Taxa and
    Accession. The Taxa table is filled from `nodes.dmp` / `names.dmp`, the
    Accession table from the accession2taxid files selected by the division.
    The database connection is closed when the function exits, whether it
    succeeds, raises, or exits the process.

    Args:

        args.input (:obj:`str`): input directory. It is the directory created
            by `taxadb download`
        args.dbname (:obj:`str`): name of the database to be created
        args.dbtype (:obj:`str`): type of database to be used.
        args.division (:obj:`str`): division to create the db for.
        args.chunk (:obj:`int`): number of rows sent per insert statement
        args.verbose (:obj:`bool`): passed on to the parsers
        args.fast (:obj:`bool`): Disables checks for faster db creation. Use
                                 with caution!

    Raises:
        SystemExit: with status 1 if inserting taxonomy data raises an
            OperationalError
        Exception: if the index on the Accession table cannot be created

    """
    logger = logging.getLogger(__name__)
    database = DatabaseFactory(**args.__dict__).get_database()
    div = args.division  # am lazy at typing
    db.initialize(database)

    nucl_gb = 'nucl_gb.accession2taxid.gz'
    nucl_wgs = 'nucl_wgs.accession2taxid.gz'
    prot = 'prot.accession2taxid.gz'
    acc_dl_list = []

    db.connect()
    try:
        parser = TaxaDumpParser(
            nodes_file=os.path.join(args.input, 'nodes.dmp'),
            names_file=os.path.join(args.input, 'names.dmp'),
            verbose=args.verbose)

        logger.debug('Connected to database')
        # The Taxa table is only created when missing; note that the
        # taxonomy rows below are inserted either way.
        if not Taxa.table_exists():
            logger.info('Creating table %s' % str(Taxa.get_table_name()))
            db.create_tables([Taxa])

        logger.info("Parsing files")
        taxa_info_list = parser.taxdump()

        logger.info("Inserting taxonomy data")
        total_size = len(taxa_info_list)
        try:
            with db.atomic():
                for i in tqdm(range(0, total_size, args.chunk),
                              unit=' chunks',
                              desc='INFO:taxadb.app',
                              total=''):
                    Taxa.insert_many(
                        taxa_info_list[i:i + args.chunk]).execute()
        except OperationalError as e:
            print("\n")  # needed because the above counter has none
            logger.error("sqlite3 error: %s" % e)
            logger.error("Maybe retry with a lower chunk size.")
            sys.exit(1)
        logger.info('Table Taxa completed')

        # At first load, table accession does not exist yet, we create it
        db.create_tables([Accession])

        # Select the accession2taxid files matching the requested division
        if div in ['full', 'nucl', 'gb']:
            acc_dl_list.append(nucl_gb)
        if div in ['full', 'nucl', 'wgs']:
            acc_dl_list.append(nucl_wgs)
        if div in ['full', 'prot']:
            acc_dl_list.append(prot)
        parser = Accession2TaxidParser(verbose=args.verbose, fast=args.fast)
        with db.atomic():
            for acc_file in acc_dl_list:
                inserted_rows = 0
                logger.info("Parsing %s" % str(acc_file))
                for data_dict in tqdm(parser.accession2taxid(
                        acc2taxid=os.path.join(args.input, acc_file),
                        chunk=args.chunk),
                                      unit=' chunks',
                                      desc='INFO:taxadb.app',
                                      total=''):
                    # NOTE(review): only the first args.chunk rows of a batch
                    # are inserted while the whole batch is counted --
                    # presumably batches never exceed args.chunk; confirm.
                    Accession.insert_many(data_dict[0:args.chunk]).execute()
                    inserted_rows += len(data_dict)
                logger.info('%s: %s added to database (%d rows inserted)' %
                            (Accession.get_table_name(), acc_file,
                             inserted_rows))
            if not Accession.has_index(name='accession_accession'):
                logger.info('Creating index for %s' %
                            Accession.get_table_name())
                try:
                    # NOTE(review): `db.index` and `db.Accession` are looked
                    # up on the database object -- confirm they exist; an
                    # AttributeError raised here would not be caught by the
                    # PeeweeException handler below.
                    idx = db.index(db.Accession, name='accession',
                                   unique=True)
                    db.add_index(idx)
                except PeeweeException as err:
                    raise Exception("Could not create Accession index: %s" %
                                    str(err))
        logger.info('Table Accession completed')
    finally:
        # Release the connection on every exit path, including sys.exit()
        db.close()
Esempio n. 15
0
def create_db(args):
    """Main function for the 'taxadb create' sub-command.

    This function creates a taxonomy database with 2 tables: Taxa and
    Accession. The Taxa table is filled from `nodes.dmp` / `names.dmp`, the
    Accession table from the accession2taxid files selected by the division.
    The database connection is closed when the function exits, whether it
    succeeds, raises, or exits the process.

    Args:

        args.input (:obj:`str`): input directory. It is the directory created
            by `taxadb download`
        args.dbname (:obj:`str`): name of the database to be created
        args.dbtype (:obj:`str`): type of database to be used.
        args.division (:obj:`str`): division to create the db for.
        args.chunk (:obj:`int`): number of rows sent per insert statement
        args.verbose (:obj:`bool`): passed on to the parsers
        args.fast (:obj:`bool`): Disables checks for faster db creation. Use
                                 with caution!

    Raises:
        SystemExit: with status 1 if inserting taxonomy data raises an
            OperationalError
        Exception: if the index on the Accession table cannot be created

    """
    logger = logging.getLogger(__name__)
    database = DatabaseFactory(**args.__dict__).get_database()
    div = args.division  # am lazy at typing
    db.initialize(database)

    nucl_gb = 'nucl_gb.accession2taxid.gz'
    nucl_wgs = 'nucl_wgs.accession2taxid.gz'
    prot = 'prot.accession2taxid.gz'
    acc_dl_list = []

    db.connect()
    try:
        parser = TaxaDumpParser(
            nodes_file=os.path.join(args.input, 'nodes.dmp'),
            names_file=os.path.join(args.input, 'names.dmp'),
            verbose=args.verbose)

        logger.debug('Connected to database')
        # The Taxa table is only created when missing; note that the
        # taxonomy rows below are inserted either way.
        if not Taxa.table_exists():
            logger.info('Creating table %s' % str(Taxa.get_table_name()))
            db.create_tables([Taxa])

        logger.info("Parsing files")
        taxa_info_list = parser.taxdump()

        logger.info("Inserting taxonomy data")
        total_size = len(taxa_info_list)
        try:
            with db.atomic():
                for i in tqdm(range(0, total_size, args.chunk),
                              unit=' chunks', desc='INFO:taxadb.app',
                              total=''):
                    Taxa.insert_many(
                        taxa_info_list[i:i+args.chunk]).execute()
        except OperationalError as e:
            print("\n")  # needed because the above counter has none
            logger.error("sqlite3 error: %s" % e)
            logger.error("Maybe retry with a lower chunk size.")
            sys.exit(1)
        logger.info('Table Taxa completed')

        # At first load, table accession does not exist yet, we create it
        db.create_tables([Accession])

        # Select the accession2taxid files matching the requested division
        if div in ['full', 'nucl', 'gb']:
            acc_dl_list.append(nucl_gb)
        if div in ['full', 'nucl', 'wgs']:
            acc_dl_list.append(nucl_wgs)
        if div in ['full', 'prot']:
            acc_dl_list.append(prot)
        parser = Accession2TaxidParser(verbose=args.verbose, fast=args.fast)
        with db.atomic():
            for acc_file in acc_dl_list:
                inserted_rows = 0
                logger.info("Parsing %s" % str(acc_file))
                for data_dict in tqdm(
                    parser.accession2taxid(
                        acc2taxid=os.path.join(args.input, acc_file),
                        chunk=args.chunk), unit=' chunks',
                        desc='INFO:taxadb.app',
                        total=''):
                    # NOTE(review): only the first args.chunk rows of a batch
                    # are inserted while the whole batch is counted --
                    # presumably batches never exceed args.chunk; confirm.
                    Accession.insert_many(data_dict[0:args.chunk]).execute()
                    inserted_rows += len(data_dict)
                logger.info('%s: %s added to database (%d rows inserted)'
                            % (Accession.get_table_name(),
                                acc_file, inserted_rows))
            if not Accession.has_index(name='accession_accession'):
                logger.info('Creating index for %s'
                            % Accession.get_table_name())
                try:
                    # NOTE(review): `db.index` and `db.Accession` are looked
                    # up on the database object -- confirm they exist; an
                    # AttributeError raised here would not be caught by the
                    # PeeweeException handler below.
                    idx = db.index(db.Accession, name='accession',
                                   unique=True)
                    db.add_index(idx)
                except PeeweeException as err:
                    raise Exception("Could not create Accession index: %s"
                                    % str(err))
        logger.info('Table Accession completed')
    finally:
        # Release the connection on every exit path, including sys.exit()
        db.close()