def lineage_name(self, taxid, reverse=False):
    """Get a lineage name for a taxonomic id

    Given a taxid, return its associated lineage as a list of taxon names.
    The walk starts at `taxid` and follows `parent_taxid` links until the
    taxon named 'root' is reached; 'root' itself is not part of the result.

    Arguments:
        taxid (:obj:`int`): a taxid
        reverse (:obj:`bool`): Inverted lineage, from top to bottom taxonomy
            hierarchy. Only applied when it is exactly `True`.
            Default False

    Returns:
        list: lineage_name, associated lineage name with taxid or None if
            taxid (or one of its ancestors) is not found
    """
    try:
        # Fetch the starting row once and reuse it for every field, instead
        # of running one identical query per attribute.
        node = Taxa.get(Taxa.ncbi_taxid == taxid)
        lineage_list = []
        while node.tax_name != 'root':
            lineage_list.append(node.tax_name)
            node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
        if reverse is True:
            lineage_list.reverse()
        return lineage_list
    except Taxa.DoesNotExist:
        return None
def lineage_id(self, acc_number_list):
    """Get taxonomic lineage (as taxids) for accession ids

    For every accession of `acc_number_list` found in the Accession table,
    yield the accession together with the taxids met while walking from its
    taxon up through `parent_taxid` links. The walk stops at the taxon named
    'root', which is not included.

    Args:
        acc_number_list (:obj:`list`): a list of accession numbers

    Yields:
        tuple: (accession id, lineage list)
    """
    self.check_list_ids(acc_number_list)
    with self.db.atomic():
        matches = Accession.select().where(
            Accession.accession << acc_number_list)
        for record in matches:
            node = record.taxid
            taxids = []
            # Climb the hierarchy one parent at a time
            while node.tax_name != 'root':
                taxids.append(node.ncbi_taxid)
                node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
            yield (record.accession, taxids)
def cache_taxids():
    """Load the taxids of the taxa table into a dictionary

    Returns:
        data (:obj:`dict`): one entry per row of the taxa table, keyed by
            the taxid converted to `str`, every value being `True`
    """
    rows = Taxa.select(Taxa.ncbi_taxid).dicts()
    return {str(row['ncbi_taxid']): True for row in rows}
def lineage_id(self, taxid, ranks=False, reverse=False):
    """Get lineage for a taxonomic id

    Given a taxid, return its associated lineage (in the form of a list of
    taxids, each parents of each others). The walk follows `parent_taxid`
    links and stops at the taxon named 'root', which is not included.

    Args:
        taxid (:obj:`int`): a taxid
        ranks (:obj:`bool`): Whether to return a dict with the tax ranks or
            not. When set, the result maps `lineage_level` to taxid, so
            ancestors sharing the same level value overwrite each other
            (the one closest to the root is kept). Default False
        reverse (:obj:`bool`): Inverted lineage, from top to bottom taxonomy
            hierarchy. Only applied when `reverse` is exactly `True` and
            `ranks` is exactly `False`. Default False

    Returns:
        list: lineage_list, associated lineage id with taxid (a dict when
            `ranks` is set) or None if taxid (or one of its ancestors) is
            not found
    """
    try:
        lineages = {} if ranks else []
        # Fetch the starting row once and reuse it for every field, instead
        # of running one identical query per attribute.
        node = Taxa.get(Taxa.ncbi_taxid == taxid)
        while node.tax_name != 'root':
            if ranks:
                lineages[node.lineage_level] = node.ncbi_taxid
            else:
                lineages.append(node.ncbi_taxid)
            node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
        if reverse is True and ranks is False:
            lineages.reverse()
        return lineages
    except Taxa.DoesNotExist:
        return None
def lineage_id(self, taxid, ranks=False, reverse=False):
    """Get lineage for a taxonomic id

    Given a taxid, return its associated lineage (in the form of a list of
    taxids, each parents of each others). The walk follows `parent_taxid`
    links and stops at the taxon named 'root', which is not included.

    Args:
        taxid (:obj:`int`): a taxid
        ranks (:obj:`bool`): Whether to return the tax ranks or not. When
            set, each item of the result is a `(lineage_level, taxid)`
            tuple instead of a bare taxid. Default False
        reverse (:obj:`bool`): Inverted lineage, from top to bottom taxonomy
            hierarchy. Only applied when `reverse` is exactly `True` and
            `ranks` is exactly `False`. Default False

    Returns:
        list: lineage_list, associated lineage id with taxid or None if
            taxid (or one of its ancestors) is not found
    """
    try:
        lineages = []
        # Fetch the starting row once and reuse it for every field, instead
        # of running one identical query per attribute.
        node = Taxa.get(Taxa.ncbi_taxid == taxid)
        while node.tax_name != 'root':
            if ranks:
                lineages.append((node.lineage_level, node.ncbi_taxid))
            else:
                lineages.append(node.ncbi_taxid)
            node = Taxa.get(Taxa.ncbi_taxid == node.parent_taxid)
        if reverse is True and ranks is False:
            lineages.reverse()
        return lineages
    except Taxa.DoesNotExist:
        return None
def test_accessionparser_accession2taxid(self):
    """Check the parser yields the expected number of accession entries"""
    # The parsing method needs a database connection, so set up a database
    # holding the Taxa and Accession tables first.
    taxadb = TaxaDB(dbtype='sqlite', dbname=self.testdb)
    taxadb.db.create_table(Taxa, safe=True)
    taxadb.db.create_table(Accession, safe=True)
    # Load names.dmp and nodes.dmp into Taxa, one chunk per insert
    dump_parser = TaxaDumpParser(nodes_file=self.nodes,
                                 names_file=self.names,
                                 verbose=True)
    taxa_rows = dump_parser.taxdump()
    with taxadb.db.atomic():
        for start in range(0, len(taxa_rows), self.chunk):
            Taxa.insert_many(taxa_rows[start:start + self.chunk]).execute()
    acc_parser = Accession2TaxidParser(acc_file=self.acc, chunk=self.chunk,
                                       verbose=True)
    # Every yielded batch contributes its length to the total
    total_entries = sum(len(batch) for batch in acc_parser.accession2taxid())
    self.assertEqual(total_entries, 55211)
def test_accessionparser_accession2taxid(self):
    """Check the parser yields the expected number of accession entries"""
    # The parsing method needs a database connection, so set up a database
    # holding the Taxa and Accession tables first.
    taxadb = TaxaDB(dbtype='sqlite', dbname=self.testdb)
    taxadb.db.create_tables([Taxa])
    taxadb.db.create_tables([Accession])
    # Load names.dmp and nodes.dmp into Taxa, one chunk per insert
    dump_parser = TaxaDumpParser(nodes_file=self.nodes,
                                 names_file=self.names,
                                 verbose=True)
    taxa_rows = dump_parser.taxdump()
    with taxadb.db.atomic():
        for start in range(0, len(taxa_rows), self.chunk):
            Taxa.insert_many(taxa_rows[start:start + self.chunk]).execute()
    acc_parser = Accession2TaxidParser(acc_file=self.acc, chunk=self.chunk,
                                       verbose=True)
    # Every yielded batch contributes its length to the total
    total_entries = sum(len(batch) for batch in acc_parser.accession2taxid())
    self.assertEqual(total_entries, 55211)
def sci_name(self, taxid):
    """Get taxonomic scientific name for taxonomy id

    Look up the Taxa row whose `ncbi_taxid` equals `taxid` and return its
    `tax_name`.

    Args:
        taxid (:obj:`int`): a taxid

    Returns:
        str: name, scientific name or None if taxid not found
    """
    try:
        record = Taxa.get(Taxa.ncbi_taxid == taxid)
        return record.tax_name
    except Taxa.DoesNotExist:
        return None
def taxid(self, sci_name):
    """Get taxid from scientific name

    Given a scientific name, return the `ncbi_taxid` of the Taxa row whose
    `tax_name` equals it.

    Args:
        sci_name (:obj:`str`): a scientific name

    Returns:
        int: ncbi_taxid, taxid matching scientific name or None if no taxon
            with that name is found
    """
    try:
        return Taxa.get(Taxa.tax_name == sci_name).ncbi_taxid
    except Taxa.DoesNotExist:
        return None
def create_db(args):
    """Main function for the 'taxadb create' sub-command.

    This function creates a taxonomy database with 2 tables: Taxa and
    Accession. The Taxa table is created and filled from `nodes.dmp` and
    `names.dmp` only when it does not exist yet; the Accession table is
    then filled from the accession2taxid files selected by the division.

    Args:
        args.input (:obj:`str`): input directory. It is the directory
            created by `taxadb download`
        args.dbname (:obj:`str`): name of the database to be created
        args.dbtype (:obj:`str`): type of database to be used.
        args.division (:obj:`str`): division to create the db for.
        args.fast (:obj:`bool`): Disables checks for faster db creation. Use
            with caution!
        args.chunk (:obj:`int`): number of rows passed to each
            `insert_many` call
        args.verbose (:obj:`bool`): forwarded to both parsers

    Raises:
        Exception: if creating the index on Accession raises a
            `PeeweeException`
    """
    # Every attribute of `args` is forwarded to the factory as a keyword
    database = DatabaseFactory(**args.__dict__).get_database()
    div = args.division  # short alias, used in the division tests below
    db.initialize(database)
    nucl_est = 'nucl_est.accession2taxid.gz'
    nucl_gb = 'nucl_gb.accession2taxid.gz'
    nucl_gss = 'nucl_gss.accession2taxid.gz'
    nucl_wgs = 'nucl_wgs.accession2taxid.gz'
    prot = 'prot.accession2taxid.gz'
    acc_dl_list = []
    db.connect()
    parser = TaxaDumpParser(nodes_file=os.path.join(args.input, 'nodes.dmp'),
                            names_file=os.path.join(args.input, 'names.dmp'),
                            verbose=args.verbose)
    parser.verbose("Connected to database ...")
    # If taxa table already exists, do not recreate and fill it
    # (safe=True additionally asks create_table not to fail on an existing
    # table)
    if not Taxa.table_exists():
        parser.verbose("Creating table %s" % str(Taxa.get_table_name()))
        db.create_table(Taxa, safe=True)
        parser.verbose("Parsing files")
        taxa_info_list = parser.taxdump()
        parser.verbose("Inserting taxa data")
        # Insert the parsed taxa in slices of args.chunk rows
        with db.atomic():
            for i in range(0, len(taxa_info_list), args.chunk):
                Taxa.insert_many(taxa_info_list[i:i + args.chunk]).execute()
        print('Taxa: completed')
    parser.verbose("Checking table accession ...")
    # At first load, table accession does not exist yet, we create it
    db.create_table(Accession, safe=True)
    # Select the accession2taxid files matching the requested division:
    # 'full' takes every file, 'nucl' every nucleotide file, any other
    # value only its own file.
    if div in ['full', 'nucl', 'est']:
        acc_dl_list.append(nucl_est)
    if div in ['full', 'nucl', 'gb']:
        acc_dl_list.append(nucl_gb)
    if div in ['full', 'nucl', 'gss']:
        acc_dl_list.append(nucl_gss)
    if div in ['full', 'nucl', 'wgs']:
        acc_dl_list.append(nucl_wgs)
    if div in ['full', 'prot']:
        acc_dl_list.append(prot)
    # `parser` is rebound: from here on it is the accession parser
    parser = Accession2TaxidParser(verbose=args.verbose, fast=args.fast)
    with db.atomic():
        for acc_file in acc_dl_list:
            inserted_rows = 0
            parser.verbose("Parsing %s" % str(acc_file))
            for data_dict in parser.accession2taxid(
                    acc2taxid=os.path.join(args.input, acc_file),
                    chunk=args.chunk):
                # Only the first args.chunk items of a batch are inserted,
                # while the counter adds the full batch length.
                # NOTE(review): presumably the parser never yields more
                # than `chunk` items per batch - confirm.
                Accession.insert_many(data_dict[0:args.chunk]).execute()
                inserted_rows += len(data_dict)
            print('%s: %s added to database (%d rows inserted)'
                  % (Accession.get_table_name(), acc_file, inserted_rows))
        # Create the unique index on `accession` unless it is already there
        if not Accession.has_index(name='accession_accession'):
            print('Creating index for %s' % Accession.get_table_name())
            try:
                db.create_index(Accession, ['accession'], unique=True)
            except PeeweeException as err:
                raise Exception("Could not create Accession index: %s"
                                % str(err))
    print('Accession: completed')
    db.close()
def create_db(args):
    """Main function for the 'taxadb create' sub-command.

    This function creates a taxonomy database with 2 tables: Taxa and
    Accession. The Taxa table is created and filled from `nodes.dmp` and
    `names.dmp` only when it does not exist yet; the Accession table is
    then filled from the accession2taxid files selected by the division.

    Args:
        args.input (:obj:`str`): input directory. It is the directory
            created by `taxadb download`
        args.dbname (:obj:`str`): name of the database to be created
        args.dbtype (:obj:`str`): type of database to be used.
        args.division (:obj:`str`): division to create the db for.
        args.fast (:obj:`bool`): Disables checks for faster db creation. Use
            with caution!
        args.chunk (:obj:`int`): number of rows passed to each
            `insert_many` call
        args.verbose (:obj:`bool`): forwarded to both parsers

    Raises:
        SystemExit: with status 1 if inserting the taxa rows raises an
            `OperationalError`
        Exception: if creating the index on Accession raises a
            `PeeweeException`
    """
    logger = logging.getLogger(__name__)
    # Every attribute of `args` is forwarded to the factory as a keyword
    database = DatabaseFactory(**args.__dict__).get_database()
    div = args.division  # short alias, used in the division tests below
    db.initialize(database)
    nucl_gb = 'nucl_gb.accession2taxid.gz'
    nucl_wgs = 'nucl_wgs.accession2taxid.gz'
    prot = 'prot.accession2taxid.gz'
    acc_dl_list = []
    db.connect()
    parser = TaxaDumpParser(nodes_file=os.path.join(args.input, 'nodes.dmp'),
                            names_file=os.path.join(args.input, 'names.dmp'),
                            verbose=args.verbose)
    logger.debug('Connected to database')
    # If taxa table already exists, do not recreate and fill it
    if not Taxa.table_exists():
        logger.info('Creating table %s' % str(Taxa.get_table_name()))
        db.create_tables([Taxa])
        logger.info("Parsing files")
        taxa_info_list = parser.taxdump()
        logger.info("Inserting taxonomy data")
        total_size = len(taxa_info_list)
        try:
            # Insert the parsed taxa in slices of args.chunk rows.
            # NOTE(review): total='' is handed to tqdm; presumably meant to
            # hide the total in the progress display - confirm.
            with db.atomic():
                for i in tqdm(range(0, total_size, args.chunk),
                              unit=' chunks', desc='INFO:taxadb.app',
                              total=''):
                    Taxa.insert_many(taxa_info_list[i:i + args.chunk]).execute()
        except OperationalError as e:
            print("\n")  # needed because the above counter has none
            logger.error("sqlite3 error: %s" % e)
            logger.error("Maybe retry with a lower chunk size.")
            # The process exits here; db.close() below is not reached
            sys.exit(1)
        logger.info('Table Taxa completed')
    # At first load, table accession does not exist yet, we create it
    db.create_tables([Accession])
    # Select the accession2taxid files matching the requested division:
    # 'full' takes every file, 'nucl' every nucleotide file, any other
    # value only its own file.
    if div in ['full', 'nucl', 'gb']:
        acc_dl_list.append(nucl_gb)
    if div in ['full', 'nucl', 'wgs']:
        acc_dl_list.append(nucl_wgs)
    if div in ['full', 'prot']:
        acc_dl_list.append(prot)
    # `parser` is rebound: from here on it is the accession parser
    parser = Accession2TaxidParser(verbose=args.verbose, fast=args.fast)
    with db.atomic():
        for acc_file in acc_dl_list:
            inserted_rows = 0
            logger.info("Parsing %s" % str(acc_file))
            for data_dict in tqdm(parser.accession2taxid(
                    acc2taxid=os.path.join(args.input, acc_file),
                    chunk=args.chunk),
                    unit=' chunks', desc='INFO:taxadb.app', total=''):
                # Only the first args.chunk items of a batch are inserted,
                # while the counter adds the full batch length.
                # NOTE(review): presumably the parser never yields more
                # than `chunk` items per batch - confirm.
                Accession.insert_many(data_dict[0:args.chunk]).execute()
                inserted_rows += len(data_dict)
            logger.info('%s: %s added to database (%d rows inserted)'
                        % (Accession.get_table_name(), acc_file,
                           inserted_rows))
        # Create the unique index on `accession` unless it is already there
        if not Accession.has_index(name='accession_accession'):
            logger.info('Creating index for %s' % Accession.get_table_name())
            try:
                # Previous form of the call, kept for reference:
                # db.add_index(Accession, ['accession'], unique=True)
                # NOTE(review): `index`, `Accession` and `add_index` are all
                # looked up on the `db` object; confirm they resolve there,
                # since a failing lookup would not be a PeeweeException and
                # would bypass the handler below.
                idx = db.index(db.Accession, name='accession', unique=True)
                db.add_index(idx)
            except PeeweeException as err:
                raise Exception("Could not create Accession index: %s"
                                % str(err))
    logger.info('Table Accession completed')
    db.close()
def create_db(args):
    """Main function for the 'taxadb create' sub-command.

    This function creates a taxonomy database with 2 tables: Taxa and
    Accession. The Taxa table is created and filled from `nodes.dmp` and
    `names.dmp` only when it does not exist yet; the Accession table is
    then filled from the accession2taxid files selected by the division.

    Args:
        args.input (:obj:`str`): input directory. It is the directory
            created by `taxadb download`
        args.dbname (:obj:`str`): name of the database to be created
        args.dbtype (:obj:`str`): type of database to be used.
        args.division (:obj:`str`): division to create the db for.
        args.fast (:obj:`bool`): Disables checks for faster db creation. Use
            with caution!
        args.chunk (:obj:`int`): number of rows passed to each
            `insert_many` call
        args.verbose (:obj:`bool`): forwarded to both parsers

    Raises:
        SystemExit: with status 1 if inserting the taxa rows raises an
            `OperationalError`
        Exception: if creating the index on Accession raises a
            `PeeweeException`
    """
    logger = logging.getLogger(__name__)
    # Every attribute of `args` is forwarded to the factory as a keyword
    database = DatabaseFactory(**args.__dict__).get_database()
    div = args.division  # short alias, used in the division tests below
    db.initialize(database)
    nucl_gb = 'nucl_gb.accession2taxid.gz'
    nucl_wgs = 'nucl_wgs.accession2taxid.gz'
    prot = 'prot.accession2taxid.gz'
    acc_dl_list = []
    db.connect()
    parser = TaxaDumpParser(nodes_file=os.path.join(args.input, 'nodes.dmp'),
                            names_file=os.path.join(args.input, 'names.dmp'),
                            verbose=args.verbose)
    logger.debug('Connected to database')
    # If taxa table already exists, do not recreate and fill it
    if not Taxa.table_exists():
        logger.info('Creating table %s' % str(Taxa.get_table_name()))
        db.create_tables([Taxa])
        logger.info("Parsing files")
        taxa_info_list = parser.taxdump()
        logger.info("Inserting taxonomy data")
        total_size = len(taxa_info_list)
        try:
            # Insert the parsed taxa in slices of args.chunk rows.
            # NOTE(review): total='' is handed to tqdm; presumably meant to
            # hide the total in the progress display - confirm.
            with db.atomic():
                for i in tqdm(range(0, total_size, args.chunk),
                              unit=' chunks', desc='INFO:taxadb.app',
                              total=''):
                    Taxa.insert_many(taxa_info_list[i:i+args.chunk]).execute()
        except OperationalError as e:
            print("\n")  # needed because the above counter has none
            logger.error("sqlite3 error: %s" % e)
            logger.error("Maybe retry with a lower chunk size.")
            # The process exits here; db.close() below is not reached
            sys.exit(1)
        logger.info('Table Taxa completed')
    # At first load, table accession does not exist yet, we create it
    db.create_tables([Accession])
    # Select the accession2taxid files matching the requested division:
    # 'full' takes every file, 'nucl' every nucleotide file, any other
    # value only its own file.
    if div in ['full', 'nucl', 'gb']:
        acc_dl_list.append(nucl_gb)
    if div in ['full', 'nucl', 'wgs']:
        acc_dl_list.append(nucl_wgs)
    if div in ['full', 'prot']:
        acc_dl_list.append(prot)
    # `parser` is rebound: from here on it is the accession parser
    parser = Accession2TaxidParser(verbose=args.verbose, fast=args.fast)
    with db.atomic():
        for acc_file in acc_dl_list:
            inserted_rows = 0
            logger.info("Parsing %s" % str(acc_file))
            for data_dict in tqdm(
                    parser.accession2taxid(
                        acc2taxid=os.path.join(args.input, acc_file),
                        chunk=args.chunk),
                    unit=' chunks', desc='INFO:taxadb.app', total=''):
                # Only the first args.chunk items of a batch are inserted,
                # while the counter adds the full batch length.
                # NOTE(review): presumably the parser never yields more
                # than `chunk` items per batch - confirm.
                Accession.insert_many(data_dict[0:args.chunk]).execute()
                inserted_rows += len(data_dict)
            logger.info('%s: %s added to database (%d rows inserted)'
                        % (Accession.get_table_name(), acc_file,
                           inserted_rows))
        # Create the unique index on `accession` unless it is already there
        if not Accession.has_index(name='accession_accession'):
            logger.info('Creating index for %s' % Accession.get_table_name())
            try:
                # Previous form of the call, kept for reference:
                # db.add_index(Accession, ['accession'], unique=True)
                # NOTE(review): `index`, `Accession` and `add_index` are all
                # looked up on the `db` object; confirm they resolve there,
                # since a failing lookup would not be a PeeweeException and
                # would bypass the handler below.
                idx = db.index(db.Accession, name='accession', unique=True)
                db.add_index(idx)
            except PeeweeException as err:
                raise Exception("Could not create Accession index: %s"
                                % str(err))
    logger.info('Table Accession completed')
    db.close()