# Example 1
def metaothello_report(args):
    """Generate species- and genus-level abundance tables from metaothello output.

    Reads tab-separated lines from args.input whose columns 2 and 3 hold the
    species and genus taxids, remaps stale taxids against the current NCBI
    taxonomy, and writes one `taxid<TAB>name<TAB>abundance<TAB>rank` line per
    taxon to args.output.
    """
    db = ncbitax.TaxonomyDb.from_args(args,
                                      load_nodes=True,
                                      load_names=True,
                                      load_merged=True)

    def _promote_to_rank(taxid, rank):
        # The metaothello db is old; some of its taxids have since been demoted
        # (e.g. species -> subspecies), so walk the ancestor path up to the
        # requested rank. Returns the input taxid unchanged if no ancestor of
        # that rank is found. (Original looped without stopping at the first
        # match; a node has at most one ancestor per rank, so returning the
        # first hit is equivalent and avoids useless iterations.)
        if taxid > 1 and db.ranks.get(taxid) != rank:
            for ancestor in db.parent_path(taxid) or ():
                if db.ranks[ancestor] == rank:
                    return ancestor
        return taxid

    species_counter = collections.Counter()
    genus_counter = collections.Counter()
    with ioutil.compressed_open(args.input, 'rt') as in_f:
        for line in in_f:
            parts = line.split('\t')
            species_counter[_promote_to_rank(int(parts[1]), 'species')] += 1
            genus_counter[_promote_to_rank(int(parts[2]), 'genus')] += 1

    def _write_rank_report(counter, rank, out_f):
        # Fold all non-positive (unresolved) taxids into taxid 0 = unclassified.
        for taxid in [x for x in counter if x < 1]:
            counter[0] += counter[taxid]
            del counter[taxid]
        total = sum(counter.values())
        for taxid, count in counter.items():
            name = 'unclassified' if taxid == 0 else db.names[taxid]
            print('\t'.join(
                str(x) for x in [taxid, name, count / total, rank]),
                  file=out_f)

    with ioutil.compressed_open(args.output, 'wt') as out_f:
        _write_rank_report(species_counter, 'species', out_f)
        _write_rank_report(genus_counter, 'genus', out_f)
# Example 2
def create_mmseqs_taxonomy(args):
    """Build an mmseqs taxonomy mapping for each header block in args.infile.

    For every '\\x00'-delimited header block, resolve its accessions to taxids
    via either a joined genomic mapping or accession2taxid files, compute the
    coverage LCA, and print `<1-based index>\\t<taxid>` to args.output.

    With args.action == 'accessions', only print the bare accessions and exit.
    """
    if args.action == 'accessions':
        for header in split_file(args.infile, '\x00'):
            for accession in (x.split(' ')[0].split('.')[0]
                              for x in header.split('\x01')):
                print(accession)
        sys.exit()

    tax_db = ncbitax.TaxonomyDb(tax_dir=args.tax_db,
                                load_names=True,
                                load_nodes=True)
    accession_lookup = {}
    start_time = time.time()
    log.info('Loading accession to taxid mapping')
    if args.genomic_joined:
        with ioutil.compressed_open(args.genomic_joined, 'rt') as f:
            for line in f:
                parts = line.split('\t')
                # Bug fix: original assigned to undefined name `base_accession`
                # (NameError) instead of the accession parsed on this line.
                accession_lookup[parts[0]] = int(parts[1])
    elif args.accession2taxid:
        for fn in args.accession2taxid:
            with ioutil.compressed_open(fn, 'rt') as f:
                for line in f:
                    parts = line.split('\t')
                    base_accession = parts[0]
                    if base_accession == 'accession':
                        continue  # skip the header row
                    try:
                        taxid = int(parts[2])
                    except (IndexError, ValueError):
                        # Bug fix: original bare-excepted, printed the line, and
                        # then stored a stale/undefined taxid. Skip bad lines.
                        log.warning('Malformed accession2taxid line: %r', line)
                        continue
                    accession_lookup[base_accession] = taxid
    # Bug fix: '%.2ss' truncated the string repr; '%.2fs' formats the seconds.
    log.info('Loaded accession to taxid mapping: %.2fs',
             time.time() - start_time)

    for i, header in enumerate(split_file(args.infile, '\x00')):
        accessions = [
            x.split(' ')[0].split('.')[0] for x in header.split('\x01')
        ]
        taxids = []
        for a in accessions:
            try:
                taxids.append(accession_lookup[a])
            except KeyError:
                # Bug fix: original logged but then appended the previous
                # accession's taxid (or raised NameError on the first miss).
                log.info('Taxid for accession not found: %s', a)
        lca = tax_db.coverage_lca(taxids)
        if not lca:
            log.error('LCA not found for taxids %s', taxids)
            lca = 1  # fall back to the root taxid
        print("{}\t{}".format(i + 1, lca), file=args.output)
# Example 3
def taxid_report(args):
    """Write a Kraken-style DFS taxonomy report for the taxids in args.input."""
    # Convert the 1-based CLI column number to a 0-based index.
    taxid_ind = args.taxid_column - 1

    db = ncbitax.TaxonomyDb.from_args(args, load_nodes=True, load_names=True, load_merged=True)

    with ioutil.compressed_open(args.input) as in_f, \
            ioutil.compressed_open(args.output, 'wt') as out_f:
        lca_counts = collections.Counter(
            taxid for _, taxid in qname_taxids(db, in_f, out_f, taxid_ind))
        for report_line in db.kraken_dfs_report(lca_counts):
            print(report_line, file=out_f)
    def process_clark(self, fn):
        """Tally (predicted taxid, truth taxid) pairs from a CLARK csv result.

        Skips files whose output is already newer than the input, and ends by
        printing the accumulated hits via self.print_hits.
        """
        bname = os.path.basename(fn)
        bname = re.sub('.csv$', '', bname)
        if self.output_newer(fn, bname):
            return

        hits = collections.Counter()
        with ioutil.compressed_open(fn, 'rt') as in_f:
            header = next(in_f)
            # CLARK-S output has an extra column; its header names the
            # assignment column '1st_assignment'.
            taxid_col = 3 if '1st_assignment' in header else 2

            for row in csv.reader(in_f):
                qname = row[0]
                if row[taxid_col] == 'NA':
                    continue  # unclassified read
                try:
                    taxid = int(row[taxid_col])
                except ValueError:
                    print(row)
                    raise
                mo = ART_QNAME_ACCESSION.match(qname)
                if not mo:
                    # Bug fix: original logged the warning but then crashed on
                    # mo.group(1) with mo == None; skip unmatched qnames.
                    logging.warning('No match for qname: %s', qname)
                    continue
                accession = mo.group(1)
                truth_taxid = self.a2t.taxids[accession]
                hits[(taxid, truth_taxid)] += 1

        self.print_hits(hits, bname)
# Example 5
def blast_report(args):
    """Compute per-read blast LCAs and optionally a Kraken-style report.

    Writes per-read LCA assignments to args.blast_lca (or a temp file when
    only a report is requested), then, if args.blast_report is set, writes a
    DFS taxonomy report of the hit counts.
    """
    tax_db = ncbitax.TaxonomyDb.from_args(args, load_nodes=True, load_names=True, load_merged=True)
    with contextlib.ExitStack() as ctx:
        if args.blast_report and not args.blast_lca:
            # The LCA file is only an intermediate here; stage it in a temp file.
            fd, blast_lca_fn = tempfile.mkstemp('.blastn.lca.tsv')
            os.close(fd)  # bug fix: original discarded (and leaked) the open fd
        else:
            blast_lca_fn = args.blast_lca

        blast_m8_f = ctx.enter_context(ioutil.compressed_open(args.blast_m8, 'rt'))
        blast_lca_f = ctx.enter_context(ioutil.compressed_open(blast_lca_fn, 'wt'))
        hits = blast_lca(tax_db, blast_m8_f, blast_lca_f, min_bit_score=args.min_bit_score,
                         max_expected_value=args.max_expected_value, top_percent=args.top_percent)

        if not args.blast_report:
            return

        blast_report_f = ctx.enter_context(ioutil.compressed_open(args.blast_report, 'wt'))
        # (Removed the redundant truthiness check: an open file object is
        # always truthy.)
        for line in tax_db.kraken_dfs_report(hits, total_reads=args.total_reads):
            print(line, file=blast_report_f)
    def process_kraken(self, fn):
        """Tally (predicted taxid, truth taxid) pairs from a kraken reads file.

        Skips files whose output is already newer than the input, and ends by
        printing the accumulated hits via self.print_hits.
        """
        bname = os.path.basename(fn)
        bname = re.sub('.reads.gz$', '', bname)
        if self.output_newer(fn, bname):
            return

        hits = collections.Counter()
        with ioutil.compressed_open(fn, 'rt') as f:
            for line in f:
                parts = line.split('\t')
                # parts[0] is the classified/unclassified flag; not needed here.
                qname = parts[1]
                taxid = int(parts[2])
                mo = ART_QNAME_ACCESSION.match(qname)
                if not mo:
                    # Bug fix: original logged the warning but then crashed on
                    # mo.group(1) with mo == None; skip unmatched qnames.
                    logging.warning('No match for qname: %s', qname)
                    continue
                accession = mo.group(1)
                truth_taxid = self.a2t.taxids[accession]
                hits[(taxid, truth_taxid)] += 1

        self.print_hits(hits, bname)
    def process_tsv(self, fn, qname_col=0, taxid_col=1, sub=None, skip_lines=0):
        """Tally (predicted taxid, truth taxid) pairs from a generic TSV.

        Args:
            fn: input path (may be compressed).
            qname_col: column index holding the read/query name.
            taxid_col: column index holding the predicted taxid; '-' means
                unclassified (taxmaps convention) and maps to taxid 0.
            sub: optional regex removed from the basename to form the output name.
            skip_lines: number of leading header lines to skip.
        """
        bname = os.path.basename(fn)
        if sub:
            bname = re.sub(sub, '', bname)
        if self.output_newer(fn, bname):
            return

        hits = collections.Counter()
        with ioutil.compressed_open(fn, 'rt') as f:
            for _ in range(skip_lines):
                next(f)
            for line in f:
                parts = line.split('\t')
                qname = parts[qname_col]
                try:
                    # taxmaps writes '-' for unclassified reads.
                    taxid = 0 if parts[taxid_col] == '-' else int(parts[taxid_col])
                except (IndexError, ValueError):
                    print(line)
                    raise
                mo = ART_QNAME_ACCESSION.match(qname)
                if not mo:
                    # Bug fix: original logged the warning but then raised on
                    # mo.group(1) with mo == None; skip unmatched qnames.
                    logging.warning('No match for qname: %s', qname)
                    continue
                accession = mo.group(1)
                truth_taxid = self.a2t.taxids[accession]
                hits[(taxid, truth_taxid)] += 1

        self.print_hits(hits, bname)
# Example 8
def mmseqs_report(args):
    """Aggregate per-read mmseqs taxonomy assignments into species and genus
    abundance tables written to args.output.

    Each input line is `read_id<TAB>taxid<TAB>_<TAB>rank<TAB>tax_name`; lines
    are assumed grouped by read_id. For each read the best species and genus
    taxids are chosen via choose_best_random and counted; abundances are
    reported relative to args.total_reads, with the shortfall emitted as
    taxid 0 / 'unclassified'.
    """
    db = ncbitax.TaxonomyDb.from_args(args,
                                      load_nodes=True,
                                      load_names=True,
                                      load_merged=True)

    total_species_counter = collections.Counter()
    total_genus_counter = collections.Counter()
    taxid_map = {}  # taxid -> tax_name as seen in the input

    def _flush(species_counter, genus_counter):
        # Commit the current read's winning taxids to the run-wide totals.
        best = choose_best_random(species_counter)
        if best:
            total_species_counter[best[0]] += 1
        best = choose_best_random(genus_counter)
        if best:
            total_genus_counter[best[0]] += 1

    last_read_id = None
    species_counter = collections.Counter()
    genus_counter = collections.Counter()
    with ioutil.compressed_open(args.input, 'rt') as in_f:
        for line in in_f:
            read_id, taxid, _, rank, tax_name = line.rstrip().split('\t')
            taxid = int(taxid)
            if read_id != last_read_id:
                # First iteration flushes empty counters, which is a no-op.
                _flush(species_counter, genus_counter)
                species_counter = collections.Counter()
                genus_counter = collections.Counter()
            if rank == 'species':
                species_counter[taxid] += 1
            elif rank == 'genus':
                genus_counter[taxid] += 1

            taxid_map[taxid] = tax_name
            last_read_id = read_id
    # Bug fix: the original never flushed the final read's counters, dropping
    # the last read from the totals.
    if last_read_id is not None:
        _flush(species_counter, genus_counter)

    total_reads = args.total_reads
    species_total = sum(total_species_counter.values())
    genus_total = sum(total_genus_counter.values())

    with ioutil.compressed_open(args.output, 'wt') as out_f:
        print('\t'.join([
            '0', 'unclassified',
            str((total_reads - species_total) / total_reads), 'species'
        ]),
              file=out_f)
        for taxid, n_reads in total_species_counter.most_common():
            abundance = n_reads / total_reads
            print('\t'.join(
                str(x)
                for x in [taxid, taxid_map[taxid], abundance, 'species']),
                  file=out_f)

        print('\t'.join([
            '0', 'unclassified',
            str((total_reads - genus_total) / total_reads), 'genus'
        ]),
              file=out_f)
        for taxid, n_reads in total_genus_counter.most_common():
            abundance = n_reads / total_reads
            print('\t'.join(
                str(x) for x in [taxid, taxid_map[taxid], abundance, 'genus']),
                  file=out_f)