Example #1
0
def _header_accessions(header):
    """Return the version-less accession of every entry in one header record.

    Entries inside a record are separated by the 0x01 byte. For each entry
    the first space-delimited token is taken and everything from the first
    '.' onwards (the version suffix) is dropped.
    """
    return [
        entry.split(' ')[0].split('.')[0] for entry in header.split('\x01')
    ]


def create_mmseqs_taxonomy(args):
    """Write a tab-separated ``index, taxid`` line for every header record.

    Reads ``args.infile`` as records separated by the 0x00 byte (via
    ``split_file``). Two modes, selected by ``args.action``:

    * ``'accessions'``: print every accession found in the headers to
      stdout, one per line, then terminate the process with ``sys.exit()``.
    * anything else: build an accession -> taxid mapping from either
      ``args.genomic_joined`` (tab-separated, accession in column 0, taxid
      in column 1) or the files in ``args.accession2taxid`` (tab-separated,
      accession in column 0, taxid in column 2, header row starting with
      ``accession`` skipped). For each header record the taxids of its
      accessions are handed to ``tax_db.coverage_lca`` and the result is
      written to ``args.output`` as ``<1-based record index>\\t<lca>``.

    Accessions without a known taxid are logged and left out of the LCA
    computation. When no LCA can be determined the taxid ``1`` is written
    instead (presumably the taxonomy root -- confirm against ncbitax).
    """
    if args.action == 'accessions':
        for header in split_file(args.infile, '\x00'):
            for accession in _header_accessions(header):
                print(accession)
        sys.exit()

    tax_db = ncbitax.TaxonomyDb(tax_dir=args.tax_db,
                                load_names=True,
                                load_nodes=True)
    accession_lookup = {}
    start_time = time.time()
    log.info('Loading accession to taxid mapping')
    if args.genomic_joined:
        with ioutil.compressed_open(args.genomic_joined, 'rt') as f:
            for line in f:
                parts = line.split('\t')
                # Header accessions are looked up without their version
                # suffix, so key the mapping the same way. This is a no-op
                # when the file already stores version-less accessions.
                base_accession = parts[0].split('.')[0]
                accession_lookup[base_accession] = int(parts[1])
    elif args.accession2taxid:
        for fn in args.accession2taxid:
            with ioutil.compressed_open(fn, 'rt') as f:
                for line in f:
                    parts = line.split('\t')
                    base_accession = parts[0]
                    if base_accession == 'accession':
                        # Column-title row at the top of the file.
                        continue
                    if len(parts) < 3:
                        # No taxid column: report and skip rather than
                        # reusing the taxid of a previous line.
                        log.error(
                            'Skipping malformed accession2taxid line: %r',
                            line)
                        continue
                    accession_lookup[base_accession] = int(parts[2])
    log.info('Loaded accession to taxid mapping: %.2fs',
             time.time() - start_time)

    for i, header in enumerate(split_file(args.infile, '\x00')):
        taxids = []
        for accession in _header_accessions(header):
            taxid = accession_lookup.get(accession)
            if taxid is None:
                log.info('Taxid for accession not found: %s', accession)
                continue
            taxids.append(taxid)
        # Only ask for an LCA when at least one accession resolved.
        lca = tax_db.coverage_lca(taxids) if taxids else None
        if not lca:
            log.error('LCA not found for taxids %s', taxids)
            lca = 1
        print("{}\t{}".format(i + 1, lca), file=args.output)
Example #2
0
def main():
    """Print every taxid from the input file that the taxonomy flags as viral.

    Command line: a positional ``taxid_file`` holding one integer taxid per
    line, and an optional ``--tax-dir`` passed to ``ncbitax.TaxonomyDb``.
    Matching taxids are printed to stdout in file order.
    """
    cli = argparse.ArgumentParser(description='Get all viral taxons')
    cli.add_argument('taxid_file')
    cli.add_argument('--tax-dir', required=False)
    options = cli.parse_args()

    taxonomy = ncbitax.TaxonomyDb(
        tax_dir=options.tax_dir,
        load_nodes=True,
        load_names=True,
        load_merged=True,
    )

    with open(options.taxid_file) as handle:
        # Lazily convert each line so parsing stays interleaved with output.
        for candidate in map(int, handle):
            if not taxonomy.is_viral(candidate):
                continue
            print(candidate)
Example #3
0
def prepare_karp_fasta(args):
    """Rewrite FASTA descriptions as ``<seq id>\\t<full_name(...)>``.

    Each record of ``args.infile`` gets a taxid, either looked up in the
    ``args.genomic_joined`` table (when given) or parsed from the record
    description with ``parse_taxid``. Records without a usable taxid, or
    for which ``full_name`` returns nothing, are dropped. Every remaining
    record is written to ``args.output`` in FASTA format.
    """
    tax_db = ncbitax.TaxonomyDb(tax_dir=args.tax_db,
                                load_names=True,
                                load_nodes=True)

    def emit(record, record_taxid):
        # Shared tail of both modes: annotate the record and write it out,
        # silently skipping records for which no name can be built.
        label = full_name(tax_db.parents, tax_db.names, record_taxid)
        if label:
            record.description = '\t'.join([record.id, label])
            SeqIO.write(record, args.output, 'fasta')

    if not args.genomic_joined:
        # Taxid is embedded in the FASTA description itself.
        for record in SeqIO.parse(args.infile, 'fasta'):
            embedded_taxid = parse_taxid(record.description)
            if embedded_taxid != 0:
                emit(record, embedded_taxid)
        return

    # Taxid comes from a three-column (accession, taxid, gi) table.
    taxid_by_accession = {}
    with open(args.genomic_joined, 'rt') as table:
        for row in table:
            accession, taxid_field, gi_field = row.split('\t')
            parsed_taxid = int(taxid_field)
            int(gi_field)  # converted for validation only; value is unused
            taxid_by_accession[accession] = parsed_taxid

    for record in SeqIO.parse(args.infile, 'fasta'):
        # Only the part of the id before the first ':' is the accession.
        accession = record.id.split(':')[0]
        mapped_taxid = taxid_by_accession.get(accession)
        if mapped_taxid:
            emit(record, mapped_taxid)
        else:
            log.info('Sequence ID %s not found in taxonomy db', accession)
Example #4
0
def compile_reports(args):
    """Process report files and write the combined tables.

    Inputs are either the explicit paths in ``args.files`` or every
    non-directory entry of ``args.dir``; each is fed to
    ``ReportProcessor.process_file`` together with its base name. Optional
    per-sample totals (``args.read_counts``, two tab-separated columns
    ``sample, count`` with a trailing ``.art`` stripped from the sample) and
    ``args.classified_counts`` are loaded into the processor first.

    Outputs:

    * ``args.output``: every collected row, written via ``ReportTableWriter``.
    * ``args.rank_abundances``: one tab-separated line per entry of
      ``proc.rank_abundances`` (key fields followed by the value).
    * ``args.missing_parents`` (optional): the sorted non-zero taxids whose
      entry in ``proc.parent_path_cache`` is ``None``.

    Raises:
        AssertionError: if neither ``args.files`` nor ``args.dir`` is set.
    """
    assert args.files or args.dir, "One of --files or --dir must be specified"
    conf = load_config(args.config)

    db = ncbitax.TaxonomyDb(tax_dir=args.tax_dir,
                            load_nodes=True,
                            load_names=True,
                            load_merged=True,
                            scientific_names_only=False)

    with open(args.output, 'wt') as out_f:

        writer = ReportTableWriter(out_f)
        proc = ReportProcessor(db, conf)

        proc.total_reads = {}
        if args.read_counts:
            with open(args.read_counts) as reads_f:
                for row in csv.reader(reads_f, delimiter='\t'):
                    sample, count = row
                    if sample.endswith('.art'):
                        sample = sample[:-4]
                    proc.total_reads[sample] = int(count)
        if args.classified_counts:
            with open(args.classified_counts) as cc_f:
                proc.classified_counts = read_classified_counts(cc_f)

        taxa = []
        if args.files:
            for fn in args.files:
                basename = os.path.basename(fn)
                # Close each input as soon as it is processed instead of
                # leaking one open handle per file.
                with open(fn) as in_f:
                    res = proc.process_file(basename, in_f)
                    if res:
                        taxa.extend(res)
        elif args.dir:
            for x in Path(args.dir).iterdir():
                if x.is_dir():
                    continue
                basename = x.name
                with x.open() as in_f:
                    res = proc.process_file(basename, in_f)
                    if res:
                        taxa.extend(res)
        proc.sum_rank_abundances(taxa)
        for d in taxa:
            writer.write(d)
        # Separate name so the handle for args.output is not shadowed.
        with open(args.rank_abundances, 'wt') as abundances_f:
            for k, v in proc.rank_abundances.items():
                parts = [str(x) for x in list(k) + [v]]
                print('\t'.join(parts), file=abundances_f)

        if args.missing_parents:
            with open(args.missing_parents, 'wt') as f:
                missing_parents = {
                    k
                    for k, v in proc.parent_path_cache.items() if v is None
                }
                log.info('Missing parents for %s taxids', len(missing_parents))
                for taxid in sorted(missing_parents):
                    if taxid == 0:
                        continue
                    print(taxid, file=f)
Example #5
0
def create_metaothello_taxinfo(args):
    """Write a tab-separated per-taxid lineage table to ``args.output``.

    ``args.taxids`` is read as one integer taxid per line. For each taxid the
    parent chain in the taxonomy loaded from ``args.tax_dir`` is followed
    until taxid 1 is reached, recording the taxid and name of any ancestor
    ranked genus, family, order, class, phylum or kingdom. The output has a
    header row and then, for every input taxid, an (index, taxid, name)
    triple for species through phylum. A missing rank is written as taxid
    ``-1`` and name ``NULL``.
    """
    with open(args.taxids, 'rt') as f:
        taxids = {int(raw) for raw in f}

    tax_db = ncbitax.TaxonomyDb(tax_dir=args.tax_dir,
                                load_nodes=True,
                                load_names=True,
                                load_merged=True)

    ancestor_ranks = ('genus', 'family', 'order', 'class', 'phylum',
                      'kingdom')

    taxid_info = {}
    for species in taxids:
        info = {
            'species_taxid': species,
            'species_name': tax_db.names[species],
        }
        taxid_info[species] = info

        # Climb towards the root, stopping once taxid 1 is reached.
        node = tax_db.parents[species]
        while node != 1:
            rank = tax_db.ranks[node]
            if rank in ancestor_ranks:
                info[rank + '_taxid'] = node
                info[rank + '_name'] = tax_db.names[node]
            node = tax_db.parents[node]

    def create_index(key):
        # Maps each taxid seen at this rank to the position of the LAST
        # taxid_info entry carrying it (-1 stands in for a missing rank).
        field = key + '_taxid'
        positions = {}
        for position, entry in enumerate(taxid_info.values()):
            positions[entry.get(field, -1)] = position
        return positions

    # The kingdom index is built as well, although no output column uses it.
    indexes = {
        rank: create_index(rank)
        for rank in ('species', ) + ancestor_ranks
    }

    column_ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum')

    with open(args.output, 'wt') as f:
        column_titles = [
            'Species_index', 'Species_ID', 'Species_name',
            'Genus_index', 'Genus_ID', 'Genus_name',
            'Family_index', 'Family_ID', 'Family_name',
            'Order_index', 'Order_ID', 'Order_name',
            'Class_index', 'Class_ID', 'Class_name',
            'Phylum_index', 'Phylum_ID', 'Phylum_name',
        ]
        print('\t'.join(column_titles), file=f)

        for species in taxids:
            info = taxid_info[species]
            fields = []
            for rank in column_ranks:
                rank_taxid = info.get(rank + '_taxid', -1)
                fields.append(indexes[rank][rank_taxid])
                fields.append(rank_taxid)
                fields.append(info.get(rank + '_name', 'NULL'))
            print('\t'.join(map(str, fields)), file=f)