def create_mmseqs_taxonomy(args):
    # 'accessions' mode: dump the base accessions found in the header file
    # (records are NUL-delimited; multiple headers per record are SOH-delimited).
    if args.action == 'accessions':
        for header in split_file(args.infile, '\x00'):
            accessions = [x.split(' ')[0].split('.')[0] for x in header.split('\x01')]
            for a in accessions:
                print(a)
        sys.exit()

    tax_db = ncbitax.TaxonomyDb(tax_dir=args.tax_db, load_names=True, load_nodes=True)
    accession_lookup = {}
    start_time = time.time()
    log.info('Loading accession to taxid mapping')
    if args.genomic_joined:
        with ioutil.compressed_open(args.genomic_joined, 'rt') as f:
            for line in f:
                parts = line.split('\t')
                accession = parts[0]
                accession_lookup[accession] = int(parts[1])
    elif args.accession2taxid:
        for fn in args.accession2taxid:
            with ioutil.compressed_open(fn, 'rt') as f:
                for line in f:
                    parts = line.split('\t')
                    base_accession = parts[0]
                    # Skip the header row of NCBI accession2taxid files.
                    if base_accession == 'accession':
                        continue
                    try:
                        taxid = parts[2]
                    except IndexError:
                        log.warning('Malformed accession2taxid line: %s', line.rstrip())
                        continue
                    accession_lookup[base_accession] = int(taxid)
    log.info('Loaded accession to taxid mapping: %.2fs', time.time() - start_time)

    for i, header in enumerate(split_file(args.infile, '\x00')):
        accessions = [x.split(' ')[0].split('.')[0] for x in header.split('\x01')]
        taxids = []
        for a in accessions:
            try:
                taxids.append(accession_lookup[a])
            except KeyError:
                log.info('Taxid for accession not found: %s', a)
        lca = tax_db.coverage_lca(taxids)
        if not lca:
            log.error('LCA not found for taxids %s', taxids)
            lca = 1
        # Record IDs are written 1-based.
        print("{}\t{}".format(i + 1, lca), file=args.output)
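
# `split_file` is used above but not defined in this section. The following is
# a minimal sketch, assuming it streams a file and yields the text between
# occurrences of `delimiter`; the chunked-read strategy here is an assumption,
# not necessarily the original implementation:
def split_file(path, delimiter, chunk_size=1 << 16):
    """Yield delimiter-separated records from `path` without loading it whole."""
    buf = ''
    with open(path, 'rt') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            buf += chunk
            while delimiter in buf:
                record, buf = buf.split(delimiter, 1)
                yield record
    if buf:
        yield buf
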
def main():
    parser = argparse.ArgumentParser(description='Get all viral taxa')
    parser.add_argument('taxid_file')
    parser.add_argument('--tax-dir', required=False)
    args = parser.parse_args()
    db = ncbitax.TaxonomyDb(tax_dir=args.tax_dir, load_nodes=True,
                            load_names=True, load_merged=True)
    with open(args.taxid_file) as f:
        for line in f:
            taxid = int(line)
            if db.is_viral(taxid):
                print(taxid)
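
# Hedged sketch: if this module is meant to be run directly, the conventional
# guard below would dispatch to main(); whether the original module has this
# exact guard (or a richer subcommand dispatcher) is an assumption.
if __name__ == '__main__':
    main()
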
def prepare_karp_fasta(args):
    tax_db = ncbitax.TaxonomyDb(tax_dir=args.tax_db, load_names=True, load_nodes=True)

    if args.genomic_joined:
        # Map accessions to taxids using the joined accession/taxid/gi table.
        accession_lookup = {}
        with open(args.genomic_joined, 'rt') as f:
            for line in f:
                accession, taxid, gi = line.rstrip('\n').split('\t')
                accession_lookup[accession] = int(taxid)
        for seq in SeqIO.parse(args.infile, 'fasta'):
            seq_id = seq.id.split(':')[0]
            taxid = accession_lookup.get(seq_id)
            if not taxid:
                log.info('Sequence ID %s not found in taxonomy db', seq_id)
                continue
            karp_name = full_name(tax_db.parents, tax_db.names, taxid)
            if not karp_name:
                continue
            seq.description = '\t'.join([seq.id, karp_name])
            SeqIO.write(seq, args.output, 'fasta')
        return

    # Otherwise, pull the taxid directly out of each sequence description.
    for seq in SeqIO.parse(args.infile, 'fasta'):
        taxid = parse_taxid(seq.description)
        if taxid == 0:
            continue
        karp_name = full_name(tax_db.parents, tax_db.names, taxid)
        if not karp_name:
            continue
        seq.description = '\t'.join([seq.id, karp_name])
        SeqIO.write(seq, args.output, 'fasta')
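
# `parse_taxid` and `full_name` are referenced above but defined elsewhere.
# Minimal sketches follow; the 'taxid|NNN' header convention and the
# ';'-joined lineage format are assumptions, not the confirmed originals.
def parse_taxid(description):
    """Extract a taxid from a FASTA description; return 0 if none is found."""
    m = re.search(r'taxid\|(\d+)', description)
    return int(m.group(1)) if m else 0

def full_name(parents, names, taxid):
    """Render the root-to-taxid lineage as a single ';'-joined name."""
    lineage = []
    while taxid and taxid != 1:
        name = names.get(taxid)
        if name is None:
            return None
        lineage.append(name)
        taxid = parents[taxid]
    return ';'.join(reversed(lineage))
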
def compile_reports(args):
    assert args.files or args.dir, 'One of --files or --dir must be specified'
    conf = load_config(args.config)
    db = ncbitax.TaxonomyDb(tax_dir=args.tax_dir, load_nodes=True, load_names=True,
                            load_merged=True, scientific_names_only=False)
    with open(args.output, 'wt') as out_f:
        writer = ReportTableWriter(out_f)
        proc = ReportProcessor(db, conf)
        proc.total_reads = {}
        if args.read_counts:
            with open(args.read_counts) as reads_f:
                for row in csv.reader(reads_f, delimiter='\t'):
                    sample, count = row
                    # Strip the '.art' suffix from simulated-read sample names.
                    if sample.endswith('.art'):
                        sample = sample[:-4]
                    proc.total_reads[sample] = int(count)
        if args.classified_counts:
            with open(args.classified_counts) as cc_f:
                proc.classified_counts = read_classified_counts(cc_f)

        taxa = []
        if args.files:
            for fn in args.files:
                basename = os.path.basename(fn)
                with open(fn) as in_f:
                    res = proc.process_file(basename, in_f)
                if res:
                    taxa.extend(res)
        elif args.dir:
            for x in Path(args.dir).iterdir():
                if x.is_dir():
                    continue
                with x.open() as in_f:
                    res = proc.process_file(x.name, in_f)
                if res:
                    taxa.extend(res)
        proc.sum_rank_abundances(taxa)
        for d in taxa:
            writer.write(d)

    with open(args.rank_abundances, 'wt') as out_f:
        for k, v in proc.rank_abundances.items():
            parts = [str(x) for x in list(k) + [v]]
            print('\t'.join(parts), file=out_f)

    if args.missing_parents:
        with open(args.missing_parents, 'wt') as f:
            missing_parents = {k for k, v in proc.parent_path_cache.items() if v is None}
            log.info('Missing parents for %s taxids', len(missing_parents))
            for taxid in sorted(missing_parents):
                if taxid == 0:
                    continue
                print(taxid, file=f)
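
# `read_classified_counts` is assumed to parse a two-column
# sample<TAB>classified_read_count file into a dict; this sketch (and that
# file format) is an assumption based on how the result is used above.
def read_classified_counts(f):
    counts = {}
    for line in f:
        sample, count = line.rstrip('\n').split('\t')
        counts[sample] = int(count)
    return counts
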
def create_metaothello_taxinfo(args):
    taxids = set()
    with open(args.taxids, 'rt') as f:
        for line in f:
            taxids.add(int(line))

    tax_db = ncbitax.TaxonomyDb(tax_dir=args.tax_dir, load_nodes=True,
                                load_names=True, load_merged=True)

    # Walk each taxid up the tree, recording the taxid and name at each major rank.
    ranks = ('genus', 'family', 'order', 'class', 'phylum', 'kingdom')
    taxid_info = {}
    for taxid in taxids:
        inf = taxid_info[taxid] = {
            'species_taxid': taxid,
            'species_name': tax_db.names[taxid],
        }
        while True:
            taxid = tax_db.parents[taxid]
            if taxid == 1:
                break
            rank = tax_db.ranks[taxid]
            if rank in ranks:
                inf[rank + '_taxid'] = taxid
                inf[rank + '_name'] = tax_db.names[taxid]

    def create_index(key):
        return {
            taxid: i
            for i, taxid in enumerate(
                inf.get(key + '_taxid', -1) for inf in taxid_info.values())
        }

    species_index = create_index('species')
    genus_index = create_index('genus')
    family_index = create_index('family')
    order_index = create_index('order')
    class_index = create_index('class')
    phylum_index = create_index('phylum')
    # Kingdom is collected during the walk but is not part of the taxinfo table.

    with open(args.output, 'wt') as f:
        header = '\t'.join([
            'Species_index', 'Species_ID', 'Species_name',
            'Genus_index', 'Genus_ID', 'Genus_name',
            'Family_index', 'Family_ID', 'Family_name',
            'Order_index', 'Order_ID', 'Order_name',
            'Class_index', 'Class_ID', 'Class_name',
            'Phylum_index', 'Phylum_ID', 'Phylum_name',
        ])
        print(header, file=f)
        for taxid in taxids:
            inf = taxid_info[taxid]
            sid = inf.get('species_taxid', -1)
            gid = inf.get('genus_taxid', -1)
            fid = inf.get('family_taxid', -1)
            oid = inf.get('order_taxid', -1)
            cid = inf.get('class_taxid', -1)
            pid = inf.get('phylum_taxid', -1)
            parts = [
                species_index[sid], sid, inf.get('species_name', 'NULL'),
                genus_index[gid], gid, inf.get('genus_name', 'NULL'),
                family_index[fid], fid, inf.get('family_name', 'NULL'),
                order_index[oid], oid, inf.get('order_name', 'NULL'),
                class_index[cid], cid, inf.get('class_name', 'NULL'),
                phylum_index[pid], pid, inf.get('phylum_name', 'NULL'),
            ]
            print('\t'.join(str(x) for x in parts), file=f)
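
# Hypothetical invocation, to show the arguments create_metaothello_taxinfo
# expects (attribute names are taken from the accesses above; the file names
# are made up for illustration):
#
#   args = argparse.Namespace(taxids='species_taxids.txt',
#                             tax_dir='/path/to/ncbi_taxonomy',
#                             output='metaothello_taxinfo.tsv')
#   create_metaothello_taxinfo(args)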