def test_rank_and_parent():
    """rank() and parent_id() return expected values and pass None through."""
    engine = create_engine('sqlite:///../testfiles/taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    # a None tax_id yields None rather than raising
    assert tax.rank(None) is None
    for tax_id, expected_rank in [('1239', 'phylum'), ('1280', 'species')]:
        assert tax.rank(tax_id) == expected_rank
    assert tax.parent_id(None) is None
    assert tax.parent_id('1239') == '2'
def test_species_below():
    """species_below() returns a species-level descendant, or None.

    Fixes in this revision: the initial ``_node()`` smoke-check discarded its
    results, and the throwaway ``t`` was immediately shadowed by the loop
    variable — the check now asserts the returned rank and the loop uses its
    own name.
    """
    engine = create_engine('sqlite:///../testfiles/taxonomy.db', echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    # _node() on a species_below() result must succeed and report 'species'
    species_id = tax.species_below('1239')
    parent_id, rank = tax._node(species_id)
    assert rank == 'species'
    for tax_id in [None, '1239', '186801', '1117']:
        s = tax.species_below(tax_id)
        # None propagates; otherwise the result must descend from the query id
        assert tax_id is None or s is None or tax.is_ancestor_of(s, tax_id)
        assert s is None or tax.rank(s) == 'species'
def test_nary_subtree():
    """nary_subtree() passes None through and returns the known id list."""
    engine = create_engine('sqlite:///../testfiles/small_taxonomy.db',
                           echo=False)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS)
    assert tax.nary_subtree(None) is None
    expected = ['1280', '372074', '1579', '1580',
                '37734', '420335', '166485', '166486']
    assert tax.nary_subtree('1239') == expected
def action(args):
    """Dump the lineage table for args.tax_ids to args.outfile as CSV rows."""
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, schema=args.schema)
    # writerows consumes the lineage-table iterable directly
    csv.writer(args.outfile).writerows(tax._get_lineage_table(args.tax_ids))
    engine.dispose()
def test_new_nodes05(self):
    """Adding staph_species_group2.yml yields 5 children of 'stapha_sg'."""
    args = ['add_nodes', self.dbname, data_path('staph_species_group2.yml')]
    self.assertZeroExitStatus(main(args))
    tax = Taxonomy(sqlalchemy.create_engine('sqlite:///' + self.dbname))
    with tax.engine.connect() as con:
        result = con.execute(
            'select * from nodes where parent_id = ?', ('stapha_sg',))
        keys = list(result.keys())
        # materialize each row as a column-name -> value dict
        nodes = [dict(zip(keys, row)) for row in result.fetchall()]
        self.assertEqual(len(nodes), 5)
        # every created node should be attributed to source_id 2
        self.assertEqual([n['source_id'] for n in nodes], [2] * len(nodes))
def action(args):
    """Build a guppy-compatible taxtable CSV for a set of tax_ids.

    tax_ids are taken from (in priority order) --tax-ids, a tax_id file, or
    the tax_id column of a seq_info CSV; exits with an error if none given.
    """
    log.info('reading tax_ids')
    if args.tax_ids:
        tax_ids = set(args.tax_ids)
    elif args.tax_id_file:
        # whitespace-delimited ids
        tax_ids = set(args.tax_id_file.read().split())
    elif args.seq_info:
        tax_ids = {row['tax_id'] for row in csv.DictReader(args.seq_info)}
    else:
        sys.exit('Error: no tax_ids were specified')
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, schema=args.schema)
    rows = tax._get_lineage_table(tax_ids)
    log.info('grouping lineages')
    all_ranks = set()
    taxtable = {}
    # rows are grouped by leading tax_id; `seen=taxtable` lets
    # as_taxtable_rows skip lineage entries already accumulated
    for tax_id, grp in groupby(rows, lambda row: row[0]):
        ranks, tax_rows = as_taxtable_rows(grp, seen=taxtable)
        taxtable.update(dict(tax_rows))
        all_ranks |= set(ranks)
    # guppy requires that tax_id == parent_id for the root node;
    # identify the root node by calculating an arbitrary lineage
    # (tax_id here is the last value left over from the loop above).
    root_id = tax.lineage(tax_id)['root']
    taxtable[root_id]['parent_id'] = root_id
    # rank columns ordered root-first (tax.ranks is reversed for the key fn)
    sorted_ranks = sorted(all_ranks, key=order_ranks(tax.ranks[::-1]))
    # guppy requires this column order
    fieldnames = ['tax_id', 'parent_id', 'rank', 'tax_name'] + sorted_ranks
    output = list(taxtable.values())
    log.info('sorting lineages')
    # sort rows lexicographically by their lineage, treating missing rank
    # values as empty strings so None never breaks the tuple comparison
    output = sorted(
        output,
        key=lambda row: tuple(row.get(rank) or '' for rank in sorted_ranks)
    )
    log.info('writing taxtable')
    writer = csv.DictWriter(
        args.outfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(output)
def action(args):
    """Resolve taxon names to species-level tax_ids and write an id list.

    Names come from a file (one per line, '#' comments allowed) and/or a
    comma-delimited string; each resolved species (or every species below a
    higher-rank match) is written as 'tax_id # tax_name' to the output file.
    """
    dbfile = args.dbfile
    taxnames_file = args.taxnames_file
    taxnames = args.taxnames
    outfile = args.outfile
    engine = create_engine('sqlite:///%s' % dbfile, echo=False)
    tax = Taxonomy(engine, ncbi.RANKS)
    names = []
    if taxnames_file:
        # strip trailing '#' comments; skip blank and full-comment lines
        names += [line.split('#', 1)[0].strip() for line in taxnames_file
                  if line.strip() and not line.startswith('#')]
    if taxnames:
        names += [x.strip() for x in taxnames.split(',')]
    taxa = {}
    for name in set(names):
        tax_id, tax_name, is_primary, rank, note = '', '', '', '', ''
        try:
            tax_id, tax_name, is_primary = tax.primary_from_name(name)
        except ValueError:
            note = 'not found'
        else:
            parent, rank = tax._node(tax_id)
            note = '' if is_primary else 'not primary'
        if note:
            # fixed-width warning line for names that failed or were aliases
            log.warning(
                '%(name)20s | %(tax_id)7s %(tax_name)20s %(note)s' % locals())
        if rank == 'species':
            taxa[tax_id] = dict(tax_id=tax_id, tax_name=tax_name, rank=rank)
        else:
            # NOTE(review): on the 'not found' path tax_id is still '' here,
            # so get_children is called with [''] — confirm this is intended.
            keys, rows = get_children(engine, [tax_id])
            taxa.update(dict((row['tax_id'], row) for row in rows))
    for d in sorted(taxa.values(), key=lambda x: x['tax_name']):
        outfile.write('%(tax_id)s # %(tax_name)s\n' % d)
def action(args):
    """Export a taxtable CSV covering the requested tax_ids and lineages.

    tax_ids may come from --taxids (a file path or a delimited string),
    a seq_info CSV, and/or --taxnames; with none given, every tax_id in
    the database is exported. Returns 0 on success, or an error string.
    """
    engine = create_engine('sqlite:///%s' % args.database_file,
                           echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)
    if any([args.taxids, args.taxnames, args.seq_info]):
        taxids = set()
        if args.taxids:
            if os.access(args.taxids, os.F_OK):
                # a readable file: ids separated by whitespace/commas/semicolons
                for line in getlines(args.taxids):
                    taxids.update(set(re.split(r'[\s,;]+', line)))
            else:
                # otherwise treat the argument itself as a delimited id list
                taxids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.taxids)])
        if args.seq_info:
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                taxids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))
        # NOTE(review): validity is checked before --taxnames ids are added,
        # so name-derived ids are not validated here — confirm intentional.
        if not (are_valid(taxids, tax)):
            return "Some taxids were invalid. Exiting."
        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    taxids.add(tax_id)
    else:
        # no subset requested: export everything
        taxids = set(tax.tax_ids())
    # Extract all the taxids to be exported in the CSV file, including
    # every ancestor appearing in each lineage.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])
    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)
    engine.dispose()
    return 0
def action(a):
    """Extract per-record metadata and FASTA from a GenBank file.

    Writes one CSV row (and one FASTA record) per input GenBank record,
    updating each record's tax_id against the taxonomy database.
    """
    # Start database
    e = sqlalchemy.create_engine('sqlite:///{0}'.format(a.database))
    # NOTE(review): `ncbi.ranks` (lowercase) differs from `ncbi.RANKS` used
    # elsewhere in this project — confirm the attribute exists.
    taxonomy = Taxonomy(e, ncbi.ranks)
    is_classified = species_is_classified_fn(taxonomy)
    with a.infile as fp, \
            a.output as out_fp, \
            a.fasta_out as fasta_fp:
        records = SeqIO.parse(fp, 'genbank')
        # Counter wraps the iterator to count records as they stream through
        records = util.Counter(records, prefix='Record ')
        # lazy pipeline: extract tax_id, attach organism, then update tax_id
        taxa = ((record, ncbi_extract_genbank.tax_of_genbank(record))
                for record in records)
        taxa = ((record, tax_id, record.annotations['organism'])
                for record, tax_id in taxa)
        taxa = ((record, update_taxid(tax_id, taxonomy, organism))
                for record, tax_id, organism in taxa)
        writer = csv.writer(out_fp, lineterminator='\n',
                            quoting=csv.QUOTE_NONNUMERIC)
        if a.header:
            header = ('version', 'seqname', 'tax_id', 'accession',
                      'description', 'length', 'ambig_count', 'is_type',
                      'rdp_lineage', 'taxid_classified')
            writer.writerow(header)
        for record, tax_id in taxa:
            accession, version = \
                ncbi_extract_genbank.accession_version_of_genbank(record)
            # RDP lineage: semicolon-joined taxonomy annotation, quotes removed
            rdp_lineage = ';'.join(record.annotations.get('taxonomy', []))
            rdp_lineage = rdp_lineage.replace('"', '')
            row = (version, record.name, tax_id, accession,
                   record.description, len(record),
                   count_ambiguous(str(record.seq)),
                   str(ncbi_extract_genbank.is_type(record)).upper(),
                   rdp_lineage, is_classified(tax_id))
            writer.writerow(row)
            # write the (id-transformed) sequence alongside its CSV row
            SeqIO.write([transform_id(record)], fasta_fp, 'fasta')
    logging.info("Total records: %d", records.count)
def action(args):
    """Add the nodes described by args.new_nodes to the taxonomy database."""
    engine = create_engine('sqlite:///%s' % args.database_file,
                           echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)
    log.warning('adding new nodes')
    for node in get_new_nodes(args.new_nodes):
        if args.source_name:
            node['source_name'] = args.source_name
        try:
            tax.add_node(**node)
        except IntegrityError:
            # duplicate tax_id: log and continue rather than fail
            log.info('node with tax_id %(tax_id)s already exists' % node)
        else:
            log.info('added new node with tax_id %(tax_id)s' % node)
    engine.dispose()
def action(args):
    """Report merged tax_ids for ids not present in the nodes table.

    For each input tax_id that no longer resolves via _node(), writes a CSV
    row [old_id, new_id] (new_id is None if no merge record exists).
    Returns 0.
    """
    engine = create_engine('sqlite:///%s' % args.database_file,
                           echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)
    taxids = set()
    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            # a readable file of delimited ids
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            # otherwise the argument itself is a delimited id list
            taxids.update([x.strip()
                           for x in re.split(r'[\s,;]+', args.taxids)])
    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(
                frozenset(i['tax_id'] for i in reader if i['tax_id']))
    writer = csv.writer(args.out_file)
    for t in taxids:
        try:
            # known ids produce no output; only failures are reported
            tax._node(t)
        except ValueError:
            # Check for a merged (renamed) tax_id
            m = tax._get_merged(t)
            if m and m != t:
                writer.writerow([t, m])
            else:
                writer.writerow([t, None])
    engine.dispose()
    return 0
def setUp(self):
    # Fresh engine + Taxonomy (NCBI ranks) for each test; `echo` and
    # `self.dbname` are provided by the enclosing test module/class.
    self.engine = create_engine('sqlite:///%s' % self.dbname, echo=echo)
    self.tax = Taxonomy(self.engine, taxtastic.ncbi.RANKS)
def test_is_ancestor_of():
    """Exercise is_ancestor_of(), including None arguments."""
    tax = Taxonomy(
        create_engine('sqlite:///../testfiles/taxonomy.db', echo=False),
        taxtastic.ncbi.RANKS)
    assert tax.is_ancestor_of('1280', '1239')
    # None in either position yields False, not an exception
    assert tax.is_ancestor_of(None, '1239') is False
    assert tax.is_ancestor_of('1239', None) is False
def test_sibling_of():
    """sibling_of() returns a sibling id, or None for None/no sibling."""
    tax = Taxonomy(
        create_engine('sqlite:///../testfiles/taxonomy.db', echo=False),
        taxtastic.ncbi.RANKS)
    assert tax.sibling_of(None) is None
    assert tax.sibling_of('91061') == '186801'
    # a node with no sibling yields None
    assert tax.sibling_of('1696') is None
def test__node():
    """_node() returns (parent_id, rank) and passes None through."""
    tax = Taxonomy(
        create_engine('sqlite:///../testfiles/small_taxonomy.db', echo=False),
        taxtastic.ncbi.RANKS)
    assert tax._node(None) is None
    assert tax._node('91061') == (u'1239', u'class')
def action(args):
    """Update the tax_id column of a seq_info-style CSV against the taxonomy.

    Steps: replace merged (old -> new) tax_ids; optionally resolve remaining
    unknown ids by matching a name column against names.tax_name; report or
    reject rows whose tax_id is still unknown; optionally append
    'taxid_classified' and/or a lineage rank column; write the result.

    Fixes in this revision: the "no name column" error message had malformed
    quoting ('"No "...'), deprecated log.warn is now log.warning, and bare
    string literals used as comments (no-op statements) are real comments.
    """
    rows = pandas.read_csv(args.infile, dtype='str')
    columns = rows.columns.tolist()  # preserve column order
    if args.taxid_column not in columns:
        raise ValueError("No column " + args.taxid_column)
    if args.name_column:
        if args.name_column not in columns:
            msg = 'No "' + args.name_column + '" column'
            raise ValueError(msg)
    con = 'sqlite:///{0}'.format(args.database_file)
    e = sqlalchemy.create_engine(con)
    tax = Taxonomy(e, ncbi.RANKS)
    merged = pandas.read_sql_table('merged', con, index_col='old_tax_id')
    log.info('updating tax_ids')
    rows = rows.join(merged, on=args.taxid_column)
    # overwrite tax_ids where there is a new_tax_id
    inew_tax_ids = ~rows['new_tax_id'].isnull()
    rows.loc[inew_tax_ids, args.taxid_column] = \
        rows[inew_tax_ids]['new_tax_id']
    rows = rows.drop('new_tax_id', axis=1)
    log.info('loading names table')
    names = pandas.read_sql_table(
        'names', con, columns=['tax_id', 'tax_name', 'is_primary'])
    if args.name_column:
        # use the args.name_column to do a string comparison with the
        # names.tax_name column to find a suitable tax_id
        unknowns = rows[~rows[args.taxid_column].isin(names['tax_id'])]
        if not unknowns.empty:
            # Take any tax_id associated with a string match to tax_name,
            # prioritizing is_primary=True
            unknowns = unknowns.drop(args.taxid_column, axis=1)
            names = names.sort_values('is_primary', ascending=False)
            names = names.drop_duplicates(subset='tax_name', keep='first')
            names = names.set_index('tax_name')
            found = unknowns.join(names, on=args.name_column, how='inner')
            rows.loc[found.index, args.taxid_column] = found['tax_id']
    if not args.ignore_unknowns:
        unknowns = rows[~rows[args.taxid_column].isin(names['tax_id'])]
        if args.unknowns:
            # output rows with unknown tax_ids to a separate file
            unknowns.to_csv(args.unknowns, index=False, columns=columns,
                            quoting=csv.QUOTE_NONNUMERIC)
        elif not unknowns.empty:
            raise ValueError('Unknown or missing tax_ids present')
        # drop the unknown rows from the main output either way
        rows = rows[~rows.index.isin(unknowns.index)]
    if args.taxid_classified:
        if 'taxid_classified' in columns:
            # recompute: drop any pre-existing column
            rows = rows.drop('taxid_classified', axis=1)
        else:
            columns.append('taxid_classified')

        def is_classified(row):
            # per-row: mark whether the tax_id is species-classified
            row['taxid_classified'] = species_is_classified(
                row[args.taxid_column], tax)
            return row

        msg = 'validating tax_ids:'
        rows = utils.apply_df_status(is_classified, rows, msg)
    if args.append_lineage:
        # Append a column from the taxonomy to seq_info
        if args.append_lineage in columns:
            rows = rows.drop(args.append_lineage, axis=1)
        else:
            columns.append(args.append_lineage)

        def add_rank_column(row):
            # per-row: look up the lineage value for the requested rank;
            # ids without a lineage get None
            try:
                lineage = tax.lineage(row[args.taxid_column])
            except ValueError as e:
                log.warning(e)
                lineage = {}
            row[args.append_lineage] = lineage.get(args.append_lineage, None)
            return row

        msg = 'appending {} column'.format(args.append_lineage)
        rows = utils.apply_df_status(add_rank_column, rows, msg)
    # output seq_info with new tax_ids
    rows.to_csv(args.out_file, index=False, columns=columns,
                quoting=csv.QUOTE_NONNUMERIC)
def action(args):
    """Rewrite a CSV's tax_id column, replacing merged ids with current ones.

    Rows with known ids pass through unchanged; merged ids are replaced;
    unknown ids are handled per args.unknown_action ('ignore' passes them
    through, 'drop' omits them, 'error' aborts) and are optionally copied
    to args.unknowns.
    """
    reader = csv.DictReader(args.infile)
    fieldnames = reader.fieldnames
    taxid_column = args.taxid_column
    drop = args.unknown_action == 'drop'
    error = args.unknown_action == 'error'
    ignore = args.unknown_action == 'ignore'
    if taxid_column not in fieldnames:
        raise ValueError("No column " + args.taxid_column)
    # TODO: remove unless --use-names is implemented
    # if args.use_names and args.name_column not in fieldnames:
    #     raise ValueError("No column " + args.name_column)
    writer = csv.DictWriter(args.outfile, fieldnames=fieldnames,
                            quoting=csv.QUOTE_ALL)
    writer.writeheader()
    if args.unknowns:
        # optional side channel for rows whose tax_id is not in the database
        unknowns = csv.DictWriter(args.unknowns, fieldnames=fieldnames,
                                  quoting=csv.QUOTE_ALL)
        unknowns.writeheader()
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, taxtastic.ncbi.RANKS, schema=args.schema)
    with tax.engine.connect() as con:
        # load the full merged-id mapping and the set of known ids up front
        log.info('reading table merged')
        result = con.execute(
            'select old_tax_id, new_tax_id from {merged}'.format(
                merged=tax.merged))
        mergedict = dict(result.fetchall())
        log.info('reading tax_ids from table {nodes}'.format(nodes=tax.nodes))
        result = con.execute(
            'select tax_id from {nodes}'.format(nodes=tax.nodes))
        all_tax_ids = {x[0] for x in result.fetchall()}
    log.info('reading input file')
    for row in reader:
        tax_id = row[taxid_column]
        if tax_id in all_tax_ids:
            pass  # write row without modification
        elif tax_id in mergedict:
            row[taxid_column] = mergedict[tax_id]
        else:
            # tax_id is unknown
            if args.unknowns:
                unknowns.writerow(row)
            if ignore:
                pass  # unknown row still written to the main output
            elif drop:
                continue
            elif error:
                sys.exit('Error: tax_id {} is unknown'.format(tax_id))
        writer.writerow(row)
def setUp(self):
    # Connect to the module-level test database (`dbname`, `echo` come from
    # the enclosing module). Unlike the other setUp in this project, no rank
    # list is passed — presumably Taxonomy supplies a default; verify.
    self.engine = create_engine('sqlite:///%s' % dbname, echo=echo)
    self.tax = Taxonomy(self.engine)
def action(args):
    """Build (or load) a taxtable and write a possibly-subset CSV.

    Supports subsetting by tax_ids/taxnames/seq_info, restricting to clade
    lineages, dropping unranked nodes (--ranked), and removing invalid
    nodes (--valid).
    """
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    ranks_df = pandas.read_sql_table('ranks', engine, schema=args.schema)
    # most operations in this script require ordering from 'root' down
    ranks_df = ranks_df.sort_values(by='height', ascending=False)
    ranks = ranks_df['rank'].tolist()
    nodes = None
    subset_ids = set()
    # check tax_ids subsets first before building taxtable
    if any([args.tax_ids, args.taxnames, args.seq_info]):
        tax = Taxonomy(engine, schema=args.schema)
        if args.tax_ids:
            if os.access(args.tax_ids, os.F_OK):
                # a readable file of delimited tax_ids
                for line in getlines(args.tax_ids):
                    subset_ids.update(set(re.split(r'[\s,;]+', line)))
            else:
                # otherwise the argument itself is a delimited id list
                subset_ids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.tax_ids)])
        if args.seq_info:
            log.info('reading tax_ids ' + args.seq_info.name)
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                subset_ids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))
        # this will raise an error if any tax_ids do not exist in database
        all_known(subset_ids, tax)
        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    subset_ids.add(tax_id)
        if not subset_ids:
            log.error('no tax_ids to subset taxtable, exiting')
            return
    log.info('loading nodes table from database')
    nodes = pandas.read_sql_table('nodes', engine, schema=args.schema,
                                  index_col='tax_id')
    if args.taxtable:
        log.info('using existing taxtable ' + args.taxtable)
        taxtable = pandas.read_csv(args.taxtable, dtype=str)
        taxtable = taxtable.set_index('tax_id')
        # augment the loaded table with db columns needed below
        taxtable = taxtable.join(nodes[['parent_id', 'is_valid']])
    else:
        log.info('building taxtable')
        names = pandas.read_sql_table(
            'names', engine, schema=args.schema,
            columns=['tax_id', 'tax_name', 'is_primary'])
        names = names[names['is_primary']].set_index('tax_id')
        len_nodes = len(nodes)
        nodes = nodes.join(names['tax_name'])
        # the join must not drop or duplicate nodes
        assert len_nodes == len(nodes)
        taxtable = build_taxtable(nodes, ranks)
    # subset taxtable clade lineages
    if args.clade_ids:
        dtypes = taxtable.dtypes
        clades = []
        for i in args.clade_ids.split(','):
            ancestor = taxtable.loc[i]
            # select all rows where rank column == args.from_id
            clade = taxtable[taxtable[ancestor['rank']] == i]
            # build taxtable up to root from args.from_id
            while ancestor.name != '1':  # root
                parent = taxtable.loc[ancestor['parent_id']]
                clade = pandas.concat([pandas.DataFrame(parent).T, clade])
                ancestor = parent
            # reset lost index name after concatenating transposed series
            clades.append(clade)
        taxtable = pandas.concat(clades)
        taxtable = taxtable[~taxtable.index.duplicated()]
        # set index.name and dtypes back after concating transposed series
        taxtable.index.name = 'tax_id'
        # NOTE(review): Series.iteritems was removed in pandas 2.0
        # (use .items()) — this requires an older pandas; confirm pin.
        for d, t in dtypes.iteritems():
            taxtable[d] = taxtable[d].astype(t)
    # subset taxtable by set of tax_ids (plus every ancestor appearing in
    # the kept rows' rank columns)
    if subset_ids:
        keepers = taxtable.loc[subset_ids]
        for col in keepers.columns:
            if col in ranks:
                subset_ids.update(keepers[col].dropna().values)
        taxtable = taxtable.loc[subset_ids]
    # drop no rank nodes
    if args.ranked:
        ranks = ranks_df[~ranks_df['no_rank']]['rank'].tolist()
        taxtable = taxtable[taxtable['rank'].isin(ranks)]
    if args.valid:
        invalid = taxtable[~taxtable['is_valid']]
        # remove all invalids from the rank columns
        for r, g in invalid.groupby(by='rank'):
            taxtable.loc[taxtable[r].isin(g.index), r] = None
        # remove invalid rows
        taxtable = taxtable[taxtable['is_valid']]
        # clean up empty rank columns
        taxtable = taxtable.dropna(axis=1, how='all')
    # sort final column output
    taxtable = taxtable[['rank', 'tax_name'] +
                        [r for r in ranks if r in taxtable.columns]]
    # sort rows by rank order
    # NOTE(review): astype('category', categories=...) was removed in
    # pandas 0.23+ (use CategoricalDtype) — confirm the pinned version.
    taxtable['rank'] = taxtable['rank'].astype('category', categories=ranks)
    taxtable = taxtable.sort_values('rank')
    # write and close db
    taxtable.to_csv(args.out)
    engine.dispose()
def action(args):
    """Dump all nodes and names attributed to a source as YAML documents.

    Nodes are emitted parent-before-child (ordered by first appearance in
    the lineage table), each carrying its names; names whose node is not in
    the dump are emitted afterwards as separate 'name'-type documents.
    """
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)
    with engine.connect() as con:
        # TODO: need to order nodes so that parents are always created first
        cmd = """
        select nodes.*, source.name as source_name
        from {nodes} join {source} on nodes.source_id = source.id
        where source.name = {x}
        """.format(x=tax.placeholder, nodes=tax.nodes, source=tax.source)
        result = con.execute(cmd, (args.source_name, ))
        keys = list(result.keys())
        nodes = [clean_dict(keys, vals) for vals in result.fetchall()]
        # get the complete lineage for each node, and provide an
        # ordering for all nodes so that children may be placed after
        # parents.
        tax_ids = list(map(itemgetter('tax_id'), nodes))
        lineages = tax._get_lineage_table(tax_ids)
        ordering = {}
        for i, lineage in enumerate(lineages):
            tax_id = lineage[1]
            if tax_id not in ordering:
                # first appearance in the lineage table fixes the position
                ordering[tax_id] = i
        nodes = sorted(nodes, key=lambda n: ordering[n['tax_id']])
        cmd = """
        select names.*, source.name as source_name
        from {names} join {source} on names.source_id = source.id
        where source.name = {x}
        """.format(x=tax.placeholder, names=tax.names, source=tax.source)
        result = con.execute(cmd, (args.source_name, ))
        keys = list(result.keys())
        names = [clean_dict(keys, vals) for vals in result.fetchall()]
    # group names by tax_id so each node can carry its own names
    namedict = {
        key: list(grp) for key, grp in groupby(names, itemgetter('tax_id'))
    }
    for node in nodes:
        node['type'] = 'node'
        tax_id = node['tax_id']
        if tax_id in namedict:
            # pop so only names without a dumped node remain afterwards
            node['names'] = namedict.pop(tax_id)
    yaml.safe_dump_all(nodes, args.outfile, default_flow_style=False,
                       explicit_start=True, indent=2)
    # prepare remaining names: one 'name'-type document per tax_id, with the
    # redundant tax_id stripped from each name entry
    remaining_names = []
    for tax_id, names in list(namedict.items()):
        for name in names:
            del name['tax_id']
        remaining_names.append({
            'tax_id': tax_id,
            'type': 'name',
            'names': names
        })
    yaml.safe_dump_all(remaining_names, args.outfile,
                       default_flow_style=False, explicit_start=True,
                       indent=2)