Beispiel #1
0
def createTree(arr):
    """
    Create a tree using the provided list of tuples. In the tree, a node may contain multiple
    child nodes, but it can only have one parent node.

    For every tuple the lineage is walked from the root downwards. Lineage taxids that are not
    in the catalog yet get a new node with weight 0; the tuple's own node is then attached
    below the last lineage entry (or directly below the root when the lineage is empty). When
    the tuple's taxid is already in the catalog, its weight is handed to ``updateWeight`` of the
    existing node instead of attaching a second node.

    :param arr: list of tuples (taxid, weight, lineage), where lineage is a list of taxid
    :return: Tree instance
    :raises ValueError: if a taxid is found below two different parent nodes
    """
    taxadb = TaxID()

    root = Node(0, 0, _get_name_for_taxid(0, taxadb))
    catalog = {}
    for bucket in arr:
        taxid = int(bucket[0])
        leaf = Node(taxid, bucket[1], _get_name_for_taxid(taxid, taxadb))

        # `current` is the node new children are attached to; `parent_taxid` is the taxid
        # the next node on the path is expected to have as its parent.
        current = root
        parent_taxid = root.taxid
        for lin in bucket[2]:
            node = catalog.get(lin)
            if node is None:
                node = Node(lin, 0, _get_name_for_taxid(lin, taxadb))
                current.addChildNode(node)
                catalog[lin] = node
            else:
                # below we check the compatibility of the data
                # each node can only have one parent node
                eid = node.parentNode.taxid
                if eid != parent_taxid:
                    m = "node #{} got two parent nodes #{} and #{}".format(
                        lin, eid, parent_taxid)
                    raise ValueError(m)
            current = node
            parent_taxid = lin

        # Look the leaf up with the same key it is stored under (leaf.taxid); using the raw
        # bucket[0] would miss the entry whenever the input taxid is not already an int.
        existing = catalog.get(leaf.taxid)
        if existing is None:
            current.addChildNode(leaf)
            catalog[leaf.taxid] = leaf
        else:
            # parent_taxid is the last lineage entry, or the root's taxid for an empty
            # lineage, so an empty lineage no longer fails on bucket[2][-1].
            eid = existing.parentNode.taxid
            if eid != parent_taxid:
                m = "node #{} got two parent nodes #{} and #{}".format(
                    bucket[0], eid, parent_taxid)
                raise ValueError(m)
            existing.updateWeight(leaf.weight)

    return Tree(root, catalog, root.weight)
Beispiel #2
0
def harvest(outputs, taxadb):
    """Harvest results for each requested sample and print them as one JSON document.

    :param outputs: command line tokens naming the runs, each either ``SAMPLE_ID=RUNDIR`` or
        ``SAMPLE_ID=`` followed by ``RUNDIR`` as the next token
    :param taxadb: value passed as ``dbname`` to open a sqlite ``TaxID`` handle; when falsy
        it is forwarded to ``harvest_sample`` unchanged
    :raises AssertionError: on a malformed pair, unknown or repeated sample, missing
        ``AWS_PROFILE`` for an ``s3:`` run directory, or missing local ``outputs.json``
    """
    pending = []

    # process command line args
    i = 0
    while i < len(outputs):
        if not outputs[i].endswith("="):
            # single token of the form SAMPLE_ID=RUNDIR
            parts = outputs[i].split("=")
            assert len(parts) == 2, f"invalid SAMPLE_ID=RUNDIR pair: {outputs[i]}"
            sample, rundir = parts
            i += 1
        else:
            # "SAMPLE_ID=" with the run directory supplied as the following token
            assert i + 1 < len(outputs), f"missing path after {outputs[i]}"
            sample, rundir = outputs[i][:-1], outputs[i + 1]
            i += 2

        assert sample in BENCHMARKS["samples"], f"unknown sample {sample}"
        if not rundir.startswith("s3:"):
            assert (
                Path(rundir) / "outputs.json"
            ).is_file(), f"couldn't find outputs.json in {rundir}"
        else:
            assert os.environ.get(
                "AWS_PROFILE", False
            ), f"set environment AWS_PROFILE to read from {rundir}"
        pending.append((sample, rundir))

    if taxadb:
        taxadb = TaxID(dbtype="sqlite", dbname=taxadb)

    # harvest each supplied sample
    collected = {}
    for sample, rundir in pending:
        assert sample not in collected, f"repeated sample {sample}"
        outputs_json = read_outputs_json(rundir)
        entry = harvest_sample(sample, outputs_json, taxadb)
        collected[sample] = entry
        entry["outputs"] = outputs_json

    print(json.dumps(collected, indent=2))
Beispiel #3
0
def main():
    """
    Main program function.

    Extracts the GenBank IDs from the BLAST output file named on the command line, splits
    them into batches and hands the batches, together with the accession and taxid database
    handles, to ``write_results`` which writes the taxonomy file.
    """
    # Get arguments
    args = get_arguments()

    # Step 1
    print("STEP 1: Extracting Genbank IDS from BLAST output...")
    accession = extract_genbank_id(args.blast_output_file)
    # IDs are processed in batches of at most 999.
    # NOTE(review): presumably chosen to stay below SQLite's default limit on bound
    # variables per statement - confirm.
    chunk_size = 999
    chunks = [accession[i:i + chunk_size]
              for i in range(0, len(accession), chunk_size)]
    print("Found {0} ids !".format(len(accession)))

    # Step 2
    print("STEP 2: Writing results to file '{0}'...".format(
        args.taxonomy_file))
    accession_db = AccessionID(dbtype='sqlite', dbname=args.taxadb_file)
    tax_db = TaxID(dbtype='sqlite', dbname=args.taxadb_file)
    # The former un-batched `accession_db.taxid(accession)` call was dropped: its result
    # was never used, and all look-ups go through the batches passed to write_results.
    write_results(chunks, accession_db, tax_db, args.taxonomy_file)
    print("DONE !")
#!/usr/bin/env python3

import sys
from os.path import expanduser
from taxadb.taxid import TaxID


# Handle on the taxadb SQLite database stored under ~/.taxadb/.
handler = TaxID(dbtype="sqlite", dbname=expanduser("~") + "/.taxadb/taxadb.sqlite")

# Read one taxid per line from stdin and print "<taxid>\t<lineage names joined by '; '>".
for taxid in sys.stdin:
    taxid = taxid.strip()

    lineage = handler.lineage_name(taxid, reverse=True)

    # A missing result (None) is treated like an empty lineage so one unresolvable taxid
    # prints an empty lineage column instead of aborting the run with a TypeError on len().
    lineage_s = "; ".join(lineage) if lineage else ""

    print("{}\t{}".format(taxid, lineage_s))
def main():
    """Filter a UniRef FASTA file, keeping only records that belong to the given clades.

    The tax ID is read from the ``TaxID=<digits>`` field of each header line and its lineage
    names are looked up in the taxadb database. A record (header plus the lines that follow
    it) is written to the output when any requested clade name occurs in that lineage.
    Headers without a ``TaxID`` field, or whose lineage cannot be resolved, are dropped.
    A per-clade count of exported records is printed at the end.

    :raises Exception: if the taxadb file given with ``--taxadb`` does not exist
    """
    parser = argparse.ArgumentParser(
        description='Filter a Uniref FASTA file by taxonomy')

    # command line options
    parser.add_argument('-i',
                        '--input_fasta',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_fasta',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-c',
        '--clades',
        type=str,
        required=True,
        help='Comma-separated string of clades to be included.')
    parser.add_argument('-db',
                        '--taxadb',
                        type=str,
                        required=True,
                        help='Path to the taxadb sqlite3 file')
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR:  Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)
    clades = args.clades.split(',')

    record_count = 0
    print_every = 1000

    clade_counter = {clade: 0 for clade in clades}

    # Remembers, for each tax ID already looked up, the clade it matched (None when it
    # matched no clade or its lineage could not be resolved), so each ID is queried once.
    id_cache = {}

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    taxid_pattern = re.compile(r'TaxID=(\d+)')

    # Context managers close both files even if a lookup or write fails part-way.
    with open(args.input_fasta) as fin, \
            open(args.output_fasta, 'wt') as fout:
        keep_entry = False

        for line in fin:
            if line.startswith('>'):
                record_count += 1
                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count),
                          flush=True)

                matched_clade = None
                m = taxid_pattern.search(line)
                if m:
                    tax_id = m.group(1)

                    if tax_id in id_cache:
                        matched_clade = id_cache[tax_id]
                    else:
                        lineage = taxid.lineage_name(tax_id, reverse=True)
                        if lineage is not None:
                            # first requested clade found in the lineage wins
                            matched_clade = next(
                                (clade for clade in clades if clade in lineage),
                                None)
                        id_cache[tax_id] = matched_clade

                keep_entry = matched_clade is not None
                if keep_entry:
                    # Count every exported record, including those answered from the
                    # cache, so the totals below reflect entries rather than unique IDs.
                    clade_counter[matched_clade] += 1

            # non-header lines inherit the decision made for their header
            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")

    for clade, count in clade_counter.items():
        print("\t{0}: {1}".format(clade, count))
Beispiel #6
0
#!/usr/bin/env python3
import sys
from taxadb.taxid import TaxID

# Print the scientific name of every taxid passed on the command line, one per line.
taxid = TaxID()
for arg in sys.argv[1:]:
    print(taxid.sci_name(int(arg)))
def main():
    """Filter a UniRef FASTA file, keeping only records that belong to the given clades.

    For each header line the tax ID is taken from its ``TaxID=<digits>`` field and the
    lineage names are fetched from the taxadb database. The header and the lines following
    it are copied to the output when one of the requested clade names is in the lineage;
    headers with no ``TaxID`` field or no resolvable lineage are skipped. The number of
    exported records per clade is printed once the input has been read.

    :raises Exception: if the taxadb file given with ``--taxadb`` does not exist
    """
    parser = argparse.ArgumentParser( description='Filter a Uniref FASTA file by taxonomy')

    # command line options
    parser.add_argument('-i', '--input_fasta', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_fasta', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-c', '--clades', type=str, required=True, help='Comma-separated string of clades to be included.' )
    parser.add_argument('-db', '--taxadb', type=str, required=True, help='Path to the taxadb sqlite3 file' )
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR:  Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)
    clades = args.clades.split(',')

    record_count = 0
    print_every = 1000

    clade_counter = {clade: 0 for clade in clades}

    # For each tax ID seen so far: the clade it matched, or None when it matched none
    # (or had no resolvable lineage). Avoids repeating the database lookup.
    id_cache = {}

    # Raw string so that '\d' is not treated as an (invalid) string escape.
    taxid_pattern = re.compile(r'TaxID=(\d+)')

    # `with` guarantees both file handles are closed, also on errors.
    with open(args.input_fasta) as fin, open(args.output_fasta, 'wt') as fout:
        keep_entry = False

        for line in fin:
            if line.startswith('>'):
                record_count += 1
                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count), flush=True)

                matched_clade = None
                m = taxid_pattern.search(line)
                if m:
                    tax_id = m.group(1)

                    if tax_id in id_cache:
                        matched_clade = id_cache[tax_id]
                    else:
                        lineage = taxid.lineage_name(tax_id, reverse=True)
                        if lineage is not None:
                            # the first requested clade present in the lineage is used
                            for clade in clades:
                                if clade in lineage:
                                    matched_clade = clade
                                    break
                        id_cache[tax_id] = matched_clade

                keep_entry = matched_clade is not None
                if keep_entry:
                    # Incremented for cached IDs too; otherwise only the first record
                    # of each tax ID would be counted as exported.
                    clade_counter[matched_clade] += 1

            # sequence lines follow the keep/drop decision of their header
            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")

    for clade, count in clade_counter.items():
        print("\t{0}: {1}".format(clade, count))
Beispiel #8
0
# Download TinySeq XML format file from NCBI BioProject 33175:
# https://www.ncbi.nlm.nih.gov/bioproject?db=bioproject&Cmd=DetailsSearch&Term=33175%5Buid%5D
from bs4 import BeautifulSoup as BS
# Parse inside a `with` block so the file handle is closed instead of being left open
# for the rest of the script.
with open('./sequence.fasta.xml') as xml_file:
    soup = BS(xml_file, 'lxml')

# You'll need to build the taxa taxadb database
from taxadb.taxid import TaxID
db = TaxID(dbtype='sqlite', dbname='taxadb.sqlite')

import sys
'''
soup.prettify()[:2100] ==
<html>
 <body>
  <tseqset>
   <tseq>
    <tseq_seqtype value="nucleotide">
    </tseq_seqtype>
    <tseq_accver>
     NR_170543.1
    </tseq_accver>
    <tseq_sid>
     gnl|REF_SSU16S|KU507537:1-1439
    </tseq_sid>
    <tseq_taxid>
     1849015
    </tseq_taxid>
    <tseq_orgname>
     Pseudoarcobacter acticola
    </tseq_orgname>
    <tseq_defline>
Beispiel #9
0
#!/usr/bin/env python3
from taxadb.taxid import TaxID
import fileinput
# For each line supplied through fileinput, print the scientific name of that taxid.
taxid = TaxID()
for raw_line in fileinput.input():
    query = raw_line.rstrip()
    print(taxid.sci_name(query))