def action(args):
    """Export a taxtable CSV covering the full lineages of the requested
    tax_ids.

    tax_ids are gathered from ``args.taxids`` (a file path or an inline
    delimited list), ``args.taxnames`` (a file of names resolved via the
    taxonomy), and ``args.seq_info`` (a CSV with a ``tax_id`` column).

    Returns 0 on success, 1 if any supplied tax_id is unknown.
    """
    engine = create_engine(
        'sqlite:///%s' % args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.ranks)

    taxids = set()
    if args.taxids:
        # args.taxids is either a readable file of ids or an inline list
        if os.access(args.taxids, os.F_OK):
            for line in getlines(args.taxids):
                taxids.update(set(re.split(r'[\s,;]+', line)))
        else:
            taxids.update(
                [x.strip() for x in re.split(r'[\s,;]+', args.taxids)])

    if args.taxnames:
        # resolve taxonomic names to their primary tax_ids
        for taxname in getlines(args.taxnames):
            for name in re.split(r'\s*[,;]\s*', taxname):
                tax_id, primary_name, is_primary = tax.primary_from_name(
                    name.strip())
                taxids.add(tax_id)

    if args.seq_info:
        with args.seq_info:
            reader = csv.DictReader(args.seq_info)
            taxids.update(
                frozenset(i['tax_id'] for i in reader if i['tax_id']))

    # Before digging into lineages, make sure all the taxids exist in
    # the taxonomy database.
    valid_taxids = True
    for t in taxids:
        try:
            tax._node(t)
        except KeyError:
            # Check for merged: a known id may have been replaced upstream.
            m = tax._get_merged(t)
            if m and m != t:
                msg = ("Taxid {0} has been replaced by {1}. "
                       "Please update your records").format(t, m)
                # FIX: was Python 2 `print >> sys.stderr, msg`
                print(msg, file=sys.stderr)
            else:
                print("Taxid %s not found in taxonomy." % t, file=sys.stderr)
            valid_taxids = False

    if not valid_taxids:
        print("Some taxids were invalid. Exiting.", file=sys.stderr)
        return 1  # exits with code 1

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update([y for (x, y) in tax._get_lineage(t)])

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)

    engine.dispose()
    return 0
def action(args):
    """Report tax_ids that are absent from the taxonomy.

    For each input id that does not resolve to a node, write a CSV row of
    ``(tax_id, merged_replacement_or_None)`` to ``args.out``. Inputs come
    from ``args.taxids`` (file path or inline delimited list) and from the
    ``tax_id`` column of ``args.seq_info``. Returns 0.
    """
    engine = create_engine(args.url, echo=args.verbosity > 2)
    tax = Taxonomy(engine, schema=args.schema)

    tax_ids = set()
    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            # a readable file: each line may hold several delimited ids
            for row in getlines(args.taxids):
                tax_ids.update(re.split(r'[\s,;]+', row))
        else:
            # an inline delimited list given directly on the command line
            tax_ids.update(
                token.strip() for token in re.split(r'[\s,;]+', args.taxids))

    if args.seq_info:
        with args.seq_info:
            for record in csv.DictReader(args.seq_info):
                if record['tax_id']:
                    tax_ids.add(record['tax_id'])

    out = csv.writer(args.out)
    for tid in tax_ids:
        try:
            tax._node(tid)
        except ValueError:
            # unknown node: report the merged replacement when one exists
            replacement = tax._get_merged(tid)
            if replacement and replacement != tid:
                out.writerow([tid, replacement])
            else:
                out.writerow([tid, None])

    engine.dispose()
    return 0
def action(args):
    """Write a taxtable CSV for the lineages of the requested tax_ids,
    or for every tax_id in the database when no selection is given.

    Returns 0 on success, or an error message string when any requested
    tax_id is invalid.
    """
    engine = create_engine(
        'sqlite:///%s' % args.database_file, echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    if args.taxids or args.taxnames or args.seq_info:
        taxids = set()

        if args.taxids:
            if os.access(args.taxids, os.F_OK):
                # a file of ids, possibly several per line
                for line in getlines(args.taxids):
                    taxids.update(re.split(r'[\s,;]+', line))
            else:
                # an inline delimited list
                taxids.update(
                    part.strip()
                    for part in re.split(r'[\s,;]+', args.taxids))

        if args.seq_info:
            with args.seq_info:
                rows = csv.DictReader(args.seq_info)
                taxids.update(row['tax_id'] for row in rows if row['tax_id'])

        if not are_valid(taxids, tax):
            return "Some taxids were invalid. Exiting."

        if args.taxnames:
            # names are checked implicitly by primary_from_name
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    taxids.add(tax_id)
    else:
        # no subset requested: export the whole database
        taxids = set(tax.tax_ids())

    # Extract all the taxids to be exported in the CSV file.
    taxids_to_export = set()
    for t in taxids:
        taxids_to_export.update(tid for _, tid in tax._get_lineage(t))

    tax.write_table(taxids_to_export, csvfile=args.out_file, full=args.full)
    engine.dispose()
    return 0
def action(args):
    """Export a taxtable CSV: the lineages of the selected tax_ids, or of
    all tax_ids in the database when no selection options were given.

    Returns 0 on success, or an error-message string for invalid tax_ids.
    """
    engine = create_engine('sqlite:///%s' % args.database_file,
                           echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    selection_given = any([args.taxids, args.taxnames, args.seq_info])
    if not selection_given:
        # no filters: include every tax_id in the database
        taxids = set(tax.tax_ids())
    else:
        taxids = set()

        if args.taxids:
            if os.access(args.taxids, os.F_OK):
                # file of ids; lines may contain several delimited ids
                for line in getlines(args.taxids):
                    taxids.update(set(re.split(r'[\s,;]+', line)))
            else:
                # inline delimited list
                taxids.update(
                    [s.strip() for s in re.split(r'[\s,;]+', args.taxids)])

        if args.seq_info:
            with args.seq_info:
                info = csv.DictReader(args.seq_info)
                taxids.update(
                    frozenset(r['tax_id'] for r in info if r['tax_id']))

        if not are_valid(taxids, tax):
            return "Some taxids were invalid. Exiting."

        if args.taxnames:
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    taxids.add(tax_id)

    # Collect every tax_id appearing in each selected id's lineage.
    lineage_ids = set()
    for t in taxids:
        lineage_ids.update([tid for (_, tid) in tax._get_lineage(t)])

    tax.write_table(lineage_ids, csvfile=args.out_file, full=args.full)
    engine.dispose()
    return 0
def action(args):
    """For each input tax_id missing from the taxonomy, emit a CSV row
    ``(tax_id, merged_replacement_or_None)`` to ``args.out_file``.

    Inputs come from ``args.taxids`` (file path or inline delimited list)
    and the ``tax_id`` column of ``args.seq_info``. Returns 0.
    """
    engine = create_engine('sqlite:///%s' % args.database_file,
                           echo=args.verbosity > 2)
    tax = Taxonomy(engine, ncbi.RANKS)

    ids = set()
    if args.taxids:
        if os.access(args.taxids, os.F_OK):
            # readable file of delimited ids
            for line in getlines(args.taxids):
                ids.update(re.split(r'[\s,;]+', line))
        else:
            # inline delimited list
            ids.update(x.strip() for x in re.split(r'[\s,;]+', args.taxids))

    if args.seq_info:
        with args.seq_info:
            for row in csv.DictReader(args.seq_info):
                if row['tax_id']:
                    ids.add(row['tax_id'])

    writer = csv.writer(args.out_file)
    for tid in ids:
        try:
            tax._node(tid)
        except ValueError:
            # Unknown node: report the merged replacement id if one
            # exists, otherwise None.
            merged = tax._get_merged(tid)
            if merged and merged != tid:
                writer.writerow([tid, merged])
            else:
                writer.writerow([tid, None])

    engine.dispose()
    return 0
def action(args):
    """Build (or load) a taxtable, optionally subset it by clade and/or a
    set of tax_ids, drop unranked or invalid nodes, and write it as CSV
    to ``args.out``.

    Fixes applied for modern pandas (behavior unchanged):
      * ``Series.iteritems()`` (removed in pandas 2.0) -> ``items()``
      * ``astype('category', categories=...)`` (removed in pandas 0.25)
        -> ``astype(CategoricalDtype(...))``
      * ``.loc[<set>]`` (unsupported in modern pandas) -> ``.loc[list(...)]``
    """
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)

    ranks_df = pandas.read_sql_table('ranks', engine, schema=args.schema)
    # most operations in this script require ordering from 'root' down
    ranks_df = ranks_df.sort_values(by='height', ascending=False)
    ranks = ranks_df['rank'].tolist()

    nodes = None
    subset_ids = set()

    # check tax_ids subsets first before building taxtable
    if any([args.tax_ids, args.taxnames, args.seq_info]):
        tax = Taxonomy(engine, schema=args.schema)
        if args.tax_ids:
            # args.tax_ids is either a readable file or an inline list
            if os.access(args.tax_ids, os.F_OK):
                for line in getlines(args.tax_ids):
                    subset_ids.update(set(re.split(r'[\s,;]+', line)))
            else:
                subset_ids.update(
                    [x.strip() for x in re.split(r'[\s,;]+', args.tax_ids)])
        if args.seq_info:
            log.info('reading tax_ids ' + args.seq_info.name)
            with args.seq_info:
                reader = csv.DictReader(args.seq_info)
                subset_ids.update(
                    frozenset(i['tax_id'] for i in reader if i['tax_id']))

        # this will raise an error if any tax_ids do not exist in database
        all_known(subset_ids, tax)

        if args.taxnames:
            # resolve names to primary tax_ids via the taxonomy
            for taxname in getlines(args.taxnames):
                for name in re.split(r'\s*[,;]\s*', taxname):
                    tax_id, primary_name, is_primary = tax.primary_from_name(
                        name.strip())
                    subset_ids.add(tax_id)

        if not subset_ids:
            log.error('no tax_ids to subset taxtable, exiting')
            return

    log.info('loading nodes table from database')
    nodes = pandas.read_sql_table(
        'nodes', engine, schema=args.schema, index_col='tax_id')

    if args.taxtable:
        log.info('using existing taxtable ' + args.taxtable)
        taxtable = pandas.read_csv(args.taxtable, dtype=str)
        taxtable = taxtable.set_index('tax_id')
        # re-attach columns needed for clade walking / validity filtering
        taxtable = taxtable.join(nodes[['parent_id', 'is_valid']])
    else:
        log.info('building taxtable')
        names = pandas.read_sql_table(
            'names', engine, schema=args.schema,
            columns=['tax_id', 'tax_name', 'is_primary'])
        names = names[names['is_primary']].set_index('tax_id')
        len_nodes = len(nodes)
        nodes = nodes.join(names['tax_name'])
        # joining primary names must not drop or duplicate any node
        assert len_nodes == len(nodes)
        taxtable = build_taxtable(nodes, ranks)

    # subset taxtable clade lineages
    if args.clade_ids:
        dtypes = taxtable.dtypes
        clades = []
        for i in args.clade_ids.split(','):
            ancestor = taxtable.loc[i]
            # select all rows where rank column == args.from_id
            clade = taxtable[taxtable[ancestor['rank']] == i]
            # build taxtable up to root from args.from_id
            while ancestor.name != '1':  # root
                parent = taxtable.loc[ancestor['parent_id']]
                clade = pandas.concat([pandas.DataFrame(parent).T, clade])
                ancestor = parent
            # reset lost index name after concatenating transposed series
            clades.append(clade)
        taxtable = pandas.concat(clades)
        taxtable = taxtable[~taxtable.index.duplicated()]
        # set index.name and dtypes back after concating transposed series
        taxtable.index.name = 'tax_id'
        # FIX: iteritems() was removed in pandas 2.0; items() is equivalent
        for d, t in dtypes.items():
            taxtable[d] = taxtable[d].astype(t)

    # subset taxtable by set of tax_ids
    if subset_ids:
        # FIX: .loc with a set raises in modern pandas; pass a list
        keepers = taxtable.loc[list(subset_ids)]
        for col in keepers.columns:
            if col in ranks:
                # keep every ancestor id referenced in the rank columns
                subset_ids.update(keepers[col].dropna().values)
        taxtable = taxtable.loc[list(subset_ids)]

    # drop no rank nodes
    if args.ranked:
        ranks = ranks_df[~ranks_df['no_rank']]['rank'].tolist()
        taxtable = taxtable[taxtable['rank'].isin(ranks)]

    if args.valid:
        invalid = taxtable[~taxtable['is_valid']]
        # remove all invalids from the rank columns
        for r, g in invalid.groupby(by='rank'):
            taxtable.loc[taxtable[r].isin(g.index), r] = None
        # remove invalid rows
        taxtable = taxtable[taxtable['is_valid']]
        # clean up empty rank columns
        taxtable = taxtable.dropna(axis=1, how='all')

    # sort final column output
    taxtable = taxtable[['rank', 'tax_name'] +
                        [r for r in ranks if r in taxtable.columns]]
    # sort rows root-first. FIX: astype('category', categories=...) was
    # removed in pandas 0.25; use an explicit CategoricalDtype instead.
    rank_dtype = pandas.api.types.CategoricalDtype(categories=ranks)
    taxtable['rank'] = taxtable['rank'].astype(rank_dtype)
    taxtable = taxtable.sort_values('rank')

    # write and close db
    taxtable.to_csv(args.out)
    engine.dispose()