Code example #1
    def get_parent_taxa(self):
        """ Get parent taxa

        Returns:
            :obj:`list` of :obj:`Taxon`: list of parent taxa
        """
        if self.id_of_nearest_ncbi_taxon is None:
            return None

        cls = self.__class__
        ncbi_taxa = NCBITaxa()
        lineage = [
            cls(ncbi_id=id)
            for id in ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)
        ]

        if self.additional_name_beyond_nearest_ncbi_taxon:
            base_name = ncbi_taxa.translate_to_names(
                [self.id_of_nearest_ncbi_taxon])[0]
            names = self.additional_name_beyond_nearest_ncbi_taxon[1:].split(
                ' ')
            # One progressively longer name per extra rank; slicing the token
            # list `names` (not the single token `name`) is the intent here.
            for i_rank in range(len(names)):
                lineage.append(
                    cls(name=base_name +
                        ''.join(' ' + n for n in names[0:i_rank + 1])))

        return lineage[0:-1]
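
For context: get_parent_taxa above chains NCBITaxa.get_lineage with translate_to_names. A minimal round trip with ete3, assuming the local taxonomy database has already been fetched (first use downloads ~/.etetoolkit/taxa.sqlite):

from ete3 import NCBITaxa

ncbi = NCBITaxa()
lineage = ncbi.get_lineage(9606)          # taxids from the root down to Homo sapiens
names = ncbi.translate_to_names(lineage)  # scientific names, in the same order
print(list(zip(lineage, names))[-2:])     # [(9605, 'Homo'), (9606, 'Homo sapiens')]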
Code example #2
def main():
    """Make queries against NCBI Taxa databases"""
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa()

    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Update the database if required.
    if args.update is True:
        if args.verbose > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # If a name was provided instead of a TaxID, convert and store it.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        tax_dict = {}
        # If a name was provided, simply add it to dict
        if args.name:
            tax_dict['Name'] = args.name
        # If not, do the opposite conversion to the above and store that
        else:
            tax_dict['Name'] = ncbi.get_taxid_translator(
                [args.taxid])[args.taxid]

        # Continue to populate the taxa dict with other information
        tax_dict['TaxID'] = args.taxid
        tax_dict['Rank'] = ncbi.get_rank([args.taxid])
        tax_dict['Lineage'] = ncbi.get_taxid_translator(
            ncbi.get_lineage(args.taxid))

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script is to get all taxa within a given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # Under python3, zip = izip. In python2, this list could be very large, and memory intensive
    # Suggest the script is run with python3
    if args.verbose > 0:
        for dtn, dt in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % (dtn, dt))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            for id in descendent_taxa:
                ofh.write(str(id) + '\n')
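
Example #2 relies on a get_args() helper that is not shown. A minimal argparse sketch consistent with the flags the script uses (option names are inferred from usage, not taken from the original project):

import argparse

def get_args():
    parser = argparse.ArgumentParser(
        description='Make queries against NCBI Taxa databases')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-t', '--taxid', type=int, help='NCBI taxonomy ID')
    group.add_argument('-n', '--name', help='scientific name to look up')
    parser.add_argument('-u', '--update', action='store_true',
                        help='update the local taxonomy database first')
    parser.add_argument('-o', '--outfile', help='file for descendant taxids')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help='repeat for more verbosity')
    return parser.parse_args()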
Code example #3
File: Kraken-SBT.py Project: tbenavi1/Kraken-SBT
def get_taxonid_to_name(tree):
	ncbi = NCBITaxa()
	taxonid_to_name = {}
	
	for node in tree.traverse():
		taxonid = int(node.name)
		taxonid_to_name[taxonid] = ncbi.translate_to_names([taxonid])[0]
	
	return taxonid_to_name
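
A quick way to exercise Example #3 is with a topology from NCBITaxa itself, whose node names are already taxid strings (the taxids here are illustrative):

from ete3 import NCBITaxa

ncbi = NCBITaxa()
tree = ncbi.get_topology([9606, 10090, 7227])  # human, mouse, fruit fly
taxonid_to_name = get_taxonid_to_name(tree)
print(taxonid_to_name[9606])  # 'Homo sapiens'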
Code example #4
    def __init__(self, id='', name='', ncbi_id=None, cross_references=None):
        """
        Args:
            id (:obj:`str`, optional): identifier
            name (:obj:`str`, optional): name
            ncbi_id (:obj:`int`, optional): NCBI identifier
            cross_references (:obj:`list` of :obj:`CrossReference`, optional): list of cross references
        """

        self.id = id
        self.name = name
        self.id_of_nearest_ncbi_taxon = None
        self.distance_from_nearest_ncbi_taxon = None
        self.additional_name_beyond_nearest_ncbi_taxon = None
        self.cross_references = cross_references or []

        ncbi_taxa = NCBITaxa()

        if ncbi_id:
            self.id_of_nearest_ncbi_taxon = ncbi_id
            self.distance_from_nearest_ncbi_taxon = 0
            self.additional_name_beyond_nearest_ncbi_taxon = ''
            self.name = ncbi_taxa.translate_to_names([ncbi_id])[0]
            if self.name == ncbi_id:
                raise ValueError(
                    'The NCBI taxonomy database does not contain a taxon with id {}'
                    .format(ncbi_id))
        else:
            rank_names = name.split(' ')
            for i_rank in range(len(rank_names)):
                partial_name = ' '.join(rank_names[0:len(rank_names) - i_rank])
                result = ncbi_taxa.get_name_translator([partial_name])
                if result:
                    self.id_of_nearest_ncbi_taxon = result[partial_name][0]
                    self.distance_from_nearest_ncbi_taxon = i_rank
                    self.additional_name_beyond_nearest_ncbi_taxon = ''.join(
                        ' ' + n for n in rank_names[len(rank_names) - i_rank:])
                    self.name = ncbi_taxa.translate_to_names([self.id_of_nearest_ncbi_taxon])[0] \
                        + self.additional_name_beyond_nearest_ncbi_taxon
                    return

            self.name = name
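
The unknown-id check in the constructor above (`if self.name == ncbi_id`) leans on a detail of ete3's behavior: translate_to_names falls back to the input taxid itself when no name is found. A small sketch of that behavior, worth verifying against the installed ete3 version:

from ete3 import NCBITaxa

ncbi = NCBITaxa()
print(ncbi.translate_to_names([9606]))   # ['Homo sapiens']
bogus = 999999999                        # assumed absent from the db
print(ncbi.translate_to_names([bogus]))  # [999999999] -- the id comes back unchanged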
Code example #5
def main(InputMSA, output):
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    headers, seqs = readAlg(InputMSA)
    sys.stdout.write("Annotating headers for %d sequences..." % len(headers))

    # Open the output once; re-opening with 'w+' inside the loop truncated
    # the file on every iteration, keeping only the last record.
    with open(output, 'w') as output_fasta:
        for i in range(len(headers)):
            head_terms = read_header(headers[i])
            lin = ncbi.get_lineage(head_terms["taxid"])
            #sp_name = ncbi.translate_to_names([tid])
            lin_name = ncbi.translate_to_names(lin)
            output_fasta.write(">%s|%s|%s\n%s\n" %
                               (head_terms["header"], lin_name[-1],
                                ", ".join(lin_name[1:]), seqs[i]))
    sys.stdout.write("Done\n")
Code example #6
def main():
    """main"""
    args = get_args()
    infile = args.infile
    email = args.email
    out_file = args.outfile

    Entrez.email = email

    acc_list = open(infile, 'r').read().splitlines()
    acc = ','.join(acc_list)
    handle = Entrez.esummary(db="nuccore", id=acc)
    records = Entrez.read(handle)
    handle.close()

    ncbi = NCBITaxa()

    with open(out_file, 'w') as out_f:
        for record in records:
            acc_version = record["AccessionVersion"]
            tax_id = record["TaxId"]
            lineage = ncbi.get_lineage(tax_id)
            name = ncbi.translate_to_names(lineage)
            print("{}\t{}".format(acc_version, '\t'.join(name)), file=out_f)
Code example #7
def run(args):
    # add lineage profiles/stats

    import re
    from ete3 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []
    # Filled during fuzzy matching below; without these two dicts the
    # fuzzy branch raises NameError.
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in list(name2tax.values())])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %
                 ','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # dict views are not iterators in Python 3, so wrap in iter()
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(
                target_taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit,
                return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t,
             features=[
                 "taxid", "name", "rank", "bgcolor", "sci_name",
                 "collapse_subspecies", "named_lineage"
             ])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print('# ' + '\t'.join([
            "Taxid", "Sci.Name", "Rank", "descendant_taxids",
            "descendant_names"
        ]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(
                taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit)
            print('\t'.join([
                str(taxid),
                translator.get(taxid, taxid),
                ranks.get(taxid, ''), '|'.join(map(str, descendants)),
                '|'.join(map(str, ncbi.translate_to_names(descendants)))
            ]))

    elif args.info:
        print('# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in six.iteritems(translator):
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([
                str(taxid), name,
                ranks.get(taxid, ''), named_lineage, lineage_string
            ]))
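
Example #7 comes from ete3's ncbiquery tool and calls a module-level dump() helper that is not shown. A minimal stand-in with the same shape, assumed rather than taken from the tool itself:

def dump(t, features=None):
    # Serialize the annotated tree as extended Newick (NHX), carrying the
    # requested node features along, and print it to stdout.
    print(t.write(format=1, features=features or []))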
Code example #8
class TaxIDExpander(object):
    def __init__(self,
                 taxdump_filename: str = None,
                 taxdb_filename: str = None) -> None:
        """Constructor for TaxIDExpander

        Args:
            taxdump_filename(str): if specified, refers to a local copy of the NCBI taxdump.tar.gz file
            taxdb_filename(str): if specified will be used to look for a db containing the NCBI database to load.
                                 if both taxdump_filename and taxdb_filename are set, save to taxdb_filename """
        if taxdump_filename is not None:
            taxdump_path = Path(taxdump_filename)
            if not (taxdump_path.exists() and taxdump_path.is_file()):
                raise ValueError(f'{taxdump_filename} must be a readable file')
            if taxdb_filename is not None:
                # we have both a taxdump file and a taxdb file
                # this means we load from taxdump file and save to taxdb file
                self.ncbi = NCBITaxa(taxdump_file=taxdump_filename,
                                     dbfile=taxdb_filename)
            else:
                # we have a taxdump file and no taxdb file
                # this means we load from the taxdump file and let ete3 save to its default location
                self.ncbi = NCBITaxa(taxdump_file=taxdump_filename)
        else:
            if taxdb_filename is not None:
                # we have a taxdb file and no taxdump file
                # this means we load the database from the taxdb file
                taxdb_path = Path(taxdb_filename)
                if not (taxdb_path.exists() and taxdb_path.is_file()):
                    raise ValueError(
                        f'{taxdb_filename} must be a readable file')
                self.ncbi = NCBITaxa(dbfile=taxdb_filename)
            else:
                # we have neither a taxdump file nor a taxdb file
                # this means ete3 loads the database over the network (and cache in local directory)
                # and let ete3 save the taxdb to its default location
                self.ncbi = NCBITaxa()

    def get_lineage(self,
                    taxid: str,
                    only_standard_ranks: Optional[bool] = False
                    ) -> List[Tuple[str, str]]:
        """Return lineage for a given taxonomy ID

        Raises ValueError if taxonomy ID is not found.

        Args:
            taxid(str): NCBI taxonomy ID
            only_standard_ranks(bool): if True only return superkingdom, phylum, class, order, family, genus and species ranks
        Returns:
            list of tuples where the tuples have members (taxon rank, taxon name)"""
        lineage_ids = self.ncbi.get_lineage(taxid)
        names = self.ncbi.get_taxid_translator(lineage_ids)
        ranks = self.ncbi.get_rank(lineage_ids)
        standard_ranks = set([
            'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
            'species'
        ])
        lineage = []
        for id in lineage_ids:
            rank = ranks[id]
            if only_standard_ranks and rank not in standard_ranks:
                continue
            lineage.append((rank, names[id]))
        return lineage

    def get_scientific_name(self, taxid: str):
        results = self.ncbi.translate_to_names([taxid])
        if not results:
            return 'UNKNOWN'
        else:
            return results[0]
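
A hypothetical use of TaxIDExpander (taxid 562 is Escherichia coli; the call pattern follows the class above):

expander = TaxIDExpander()  # no arguments: let ete3 fetch and cache the db
for rank, name in expander.get_lineage('562', only_standard_ranks=True):
    print(rank, name)  # superkingdom Bacteria ... species Escherichia coli
print(expander.get_scientific_name('562'))  # 'Escherichia coli'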
Code example #10
from ete3 import NCBITaxa
ncbi = NCBITaxa()

descendants = ncbi.get_descendant_taxa('Salamandridae')
print(ncbi.translate_to_names(descendants))

descendants = ncbi.get_descendant_taxa('Salamandridae',
                                       collapse_subspecies=True)
print(ncbi.translate_to_names(descendants))

tree = ncbi.get_descendant_taxa('Salamandridae',
                                collapse_subspecies=True,
                                return_tree=True)
print(tree.get_ascii(attributes=['sci_name', 'taxid']))

# ['Notophthalmus viridescens', 'Notophthalmus perstriatus', 'Notophthalmus meridionalis kallerti',
# 'Notophthalmus meridionalis meridionalis', 'Pleurodeles waltl waltl', 'Pleurodeles poireti',
# 'Pleurodeles nebulosus', 'Taricha granulosa', 'Taricha rivularis', 'Taricha torosa torosa',
# 'Taricha torosa sierrae', 'Taricha sp. AMNH A168420', 'Triturus cristatus',
# 'Triturus karelinii arntzeni', 'Triturus karelinii karelinii', 'Triturus carnifex carnifex',
# 'Triturus dobrogicus dobrogicus', 'Triturus dobrogicus macrosomus', 'Triturus marmoratus marmoratus',
# 'Triturus pygmaeus', 'Triturus macedonicus', 'Triturus cristatus x Triturus dobrogicus macrosomus',
# 'Triturus cristatus s.l. AH-2007', "Triturus cf. karelinii 'eastern'", "Triturus cf. karelinii 'western'",
# 'Triturus ivanbureschi', 'Triturus anatolicus', 'Cynops pyrrhogaster', 'Cynops ensicauda',
# 'Cynops orientalis', 'Cynops cyanurus chuxiongensis', 'Cynops cyanurus cyanurus', 'Cynops orphicus',
# 'Cynops fudingensis', 'Cynops glaucus', 'Euproctus montanus', 'Euproctus platycephalus',
# 'Tylototriton taliangensis', 'Tylototriton verrucosus pulcherrima', 'Tylototriton shanjing',
# 'Tylototriton kweichowensis',
# 'Tylototriton sp. MH-2011', 'Tylototriton pseudoverrucosus', 'Tylototriton yangi', 'Tylototriton uyenoi',
# 'Tylototriton shanorum', 'Tylototriton anguliceps', 'Tylototriton daweishanensis',
# 'Tylototriton podichthys', 'Tylototriton himalayanus', 'Tylototriton ngarsuensis',
Code example #11
File: tax2tax.py Project: cbirdlab/makesapdb
def main(in_file, out_file, in_type, out_filetype):

    ncbi = NCBITaxa()
    records = []

    with open(in_file, 'r') as fh:
        queries = fh.read().splitlines()

    if out_filetype == 'JSON' and out_file is not None:
        # Open once; re-opening with 'w' for every record would truncate
        # the file and keep only the last record.
        output = open(out_file, 'w')

    queries = [x.replace("_", " ") for x in queries if x]
    try:
        if in_type == "taxid":
            queries = list(ncbi.get_taxid_translator(queries).values())
        taxons = ncbi.get_name_translator(queries)
        # Probe the first query; a failed lookup raises KeyError/IndexError.
        taxons[queries[0]][0]
    except (KeyError, IndexError, ValueError):
        print("Unable to read keys.")
        print("Are you using the right type ('-t') option?")
        print(
            "Default is 'sciname', but 'taxid' available if using NCBI taxonomic IDs."
        )
        exit(-1)

    count = 0
    for q in queries:
        # Build a fresh dict per query; re-using one dict would leave every
        # entry of `records` pointing at the same (last) data.
        record = {'taxid': taxons[q][0], 'gi': count, 'id': q, 'tax_path': []}
        count = count + 1
        lineage = ncbi.get_lineage(record['taxid'])
        lineage_names = ncbi.translate_to_names(lineage)
        lineage_ranks = ncbi.get_rank(lineage)
        # Map taxid -> name before trimming the lineage, so names stay
        # aligned (indexing lineage_names by the record counter was a bug).
        name_by_taxid = dict(zip(lineage, lineage_names))
        lineage.pop()     # drop the query taxon itself
        lineage.pop(0)    # drop the root node
        for l in lineage:
            tax_path_entry = {}
            tax_path_entry['taxid'] = l
            tax_path_entry['rank_name'] = name_by_taxid[l]
            tax_path_entry['rank'] = lineage_ranks[l]
            lin = ncbi.get_lineage(l)
            tax_path_entry['parent_taxid'] = lin[-2]
            record['tax_path'].append(tax_path_entry)

        if out_file is not None:
            if out_filetype == 'JSON':
                output.write(json.dumps(record) + '\n')
        else:
            sys.stdout.write(json.dumps(record) + '\n')

        records.append(record)

    if out_filetype == 'JSON' and out_file is not None:
        output.close()

    if out_filetype == 'CSV':
        with open(out_file, 'w') as csvfile:
            fieldnames = ['ID', 'GI', 'TAXID']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for record in records:
                writer.writerow({
                    'ID': record['id'],
                    'GI': record['gi'],
                    'TAXID': record['taxid']
                })
Code example #12
        if refaa != '-' and frac > 0.66:
            valid_cols += 1
            variants = a[a[col] != refaa][col].to_dict()
            refaas.append(refaa)
            for sp, var in variants.items():  # .iteritems() is Python 2 only
                sp = sp.split(".")[0]
                spvariants[sp].update([(refaa, var)])

        #if valid_cols > 500:
        #    break

refaacounter = Counter(refaas)

for sp, varcounter in spvariants.items():
    try:
        sp_name = ncbi.translate_to_names([int(sp.split(".")[0])])[0]
    except ValueError:
        sp_name = "oxymonad-%s" % sp
    for varc in varcounter:
        ratio = varcounter[varc] / float(refaacounter[varc[0]])
        if ratio > 0.25:
            print(sp, sp_name, "%s\t%s/%s" % (
                "->".join(varc), varcounter[varc], refaacounter[varc[0]]))
    # #print varcounter.most_common(1)[0], varcounter[('W', 'X')]
    # most_common = varcounter.most_common(1)[0]
    # ratio = (most_common[1] / float(refaacounter[most_common[0][0]]))
    # #if varcounter.most_common(1)[0][1] > 10 and "-" not in varcounter.most_common(1)[0][0]:
    # if ratio > 0.33:# and "-" not in most_common[0]:
    #     print sp, ncbi.translate_to_names([int(sp.split(".")[0])])[0]
    #     for varc in varcounter:
    #         #if varcounter[varc] > 2:#/float(refaacounter[varc[0]]) > 0.2:
Code example #13
class preprocess():
    def __init__(self, organism, input, name, outdir, reference, paired,
                 input2, log, verbose, map, outlier, trim, kraken, db,
                 taxon_id, n_results):
        self.organism = organism
        self.input = input
        self.name = name
        self.outdir = outdir
        self.reference = reference
        self.paired = paired
        self.input2 = input2
        self.log = log
        self.verbose = verbose
        self.outlier = outlier
        self.map = map
        self.trim = trim
        self.kraken = kraken
        self.db = db
        self.taxid = taxon_id
        self.n_results = n_results
        self.logger = logging.getLogger()
        self.outlier_file = open(outdir + '/outlier_list.txt', 'a')
        self.ncbi = NCBITaxa()
        self.descendants = set(
            self.ncbi.get_descendant_taxa(self.taxid, intermediate_nodes=True))

    """Shell Execution"""

    def runCommand(self, command, directory, write_output):
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   cwd=directory)
        out, err = process.communicate()

        if out:
            if write_output:
                return out
            self.logger.info("Standard output: \n" + out.decode('utf-8') +
                             "\n")
        if err:
            self.logger.info("Standard error: \n" + err.decode('utf-8') + "\n")

    """Running Refseq_masher"""

    def refseq_masher(self):
        self.ifVerbose("Running Refseq_masher matching")
        self.runCommand([
            'refseq_masher', 'matches', '-o', self.name + '.match',
            '--output-type', 'tab', self.input, '-n',
            str(self.n_results + 5)
        ],
                        os.path.join(self.outdir, 'mash'),
                        write_output=False)

    """Running Kraken"""

    def run_kraken(self):
        self.ifVerbose("Running Kraken")

        gzip = ""
        if self.input[-3:] == ".gz":
            gzip = "--gzip-compressed"

        if self.paired:
            self.runCommand([
                'kraken', '--db', self.db, '--paired', '--output',
                self.name + '.kraken', '--fastq-input',
                "%s" % gzip, self.input, self.input2
            ],
                            os.path.join(self.outdir, 'kraken'),
                            write_output=False)
        else:
            self.runCommand([
                'kraken', '--db', self.db, '--output', self.name + '.kraken',
                '--fastq-input',
                "%s" % gzip, self.input
            ],
                            os.path.join(self.outdir, 'kraken'),
                            write_output=False)

    """Parse Kraken resuts"""

    def parse_kraken_results(self):
        self.ifVerbose("Parsing Kraken results")

        kraken = {}  # Store classification for each read

        with open(os.path.join(self.outdir, 'kraken/%s.kraken' % self.name),
                  'r') as classification:
            for line in classification:
                classified, read_id, tax_id, length, details = line.strip(
                ).split("\t")
                kraken[read_id] = tax_id

        # Classify each read
        kraken_class = {}

        with open(os.path.join(self.outdir, 'kraken/%s.log' % self.name),
                  'w') as log:
            for read_id, tax_id in kraken.items():
                if int(tax_id) == 0:
                    kraken_class[read_id] = "unclassified"
                elif int(tax_id) in self.descendants or int(tax_id) == int(
                        self.taxid):
                    kraken_class[read_id] = "target"
                else:
                    kraken_class[read_id] = "other"
                    log.write(
                        "%s was trimmed because it was classified as %s (%s)\n"
                        % (read_id, self.ncbi.translate_to_names(
                            [int(tax_id)])[0], tax_id))

        return kraken_class

    """Trim fastq reads not belonging to target organism"""

    def kraken_trim(self):
        kraken = self.parse_kraken_results()

        # Write new fastq file
        files = [self.input, self.input2]
        for fastq_in in files:
            with gzip.open(fastq_in, 'r') as f_in:
                fastq_out = os.path.split(fastq_in)[1]
                if fastq_out[-3:] == ".gz":  # Eliminate .gz from filename
                    fastq_out = fastq_out[:-3]
                with open(
                        os.path.join(self.outdir,
                                     'kraken_trim/%s' % fastq_out),
                        'w') as f_out:
                    self.ifVerbose(
                        "Trimming reads from %s that do not belong to the target organism"
                        % fastq_out)
                    for line in f_in:
                        # Split ID with space, then remove "/1" or "/2" if it exists and ignore initial @
                        read_id = line.decode('utf-8').split(" ")[0].split(
                            "/")[0][1:]
                        if read_id in kraken and kraken[read_id] != "other":
                            f_out.write(line.decode('utf-8'))
                            for i in range(3):
                                f_out.write(f_in.readline().decode('utf-8'))
                        else:
                            for i in range(3):
                                f_in.readline()

            # Zip output files
            self.runCommand([
                'gzip',
                os.path.join(self.outdir, 'kraken_trim/%s' % fastq_out)
            ],
                            None,
                            write_output=False)

    """Run Trim_galore to preprocess fastq files"""

    def trim_galore(self):
        self.ifVerbose("Trimming fastq files using Trim_galore")

        if self.paired:
            self.runCommand([
                'trim_galore', '--fastqc_args', "\"--outdir " +
                os.path.join(self.outdir, "trimmed_fastq/fastqc") + "\"",
                '--gzip', '-o',
                os.path.join(self.outdir, "trimmed_fastq"), '--paired',
                self.input, self.input2
            ],
                            directory=None,
                            write_output=False)

            self.input = os.path.join(
                os.path.join(self.outdir, "trimmed_fastq"),
                self.name + "_1_val_1.fq.gz")
            # Only input2 should point at the _2 file; the original chained
            # assignment (`self.input2 = self.input = ...`) clobbered self.input.
            self.input2 = os.path.join(
                os.path.join(self.outdir, "trimmed_fastq"),
                self.name + "_2_val_2.fq.gz")

        else:
            self.runCommand([
                'trim_galore', '--fastqc', '--gzip', '-o',
                os.path.join(self.outdir, "trimmed_fastq"), self.input
            ],
                            directory=None,
                            write_output=False)

            self.input = os.path.join(
                os.path.join(self.outdir, "trimmed_fastq"),
                self.name + "_val.fq.gz")

    """Mapping with Smalt"""

    def smalt_map(self):
        self.ifVerbose("Mapping reads to reference using Smalt")
        if self.paired:
            self.runCommand([
                'smalt', 'map', '-i', '1000', '-j', '20', '-l', 'pe', '-o',
                self.name + ".BAM", 'reference', self.input, self.input2
            ],
                            os.path.join(self.outdir, 'mapping'),
                            write_output=False)
        else:
            self.runCommand([
                'smalt', 'map', '-o', self.name + ".BAM", 'reference',
                self.input
            ],
                            os.path.join(self.outdir, 'mapping'),
                            write_output=False)

    """Mapping with BWA"""

    def bwa_map(self):
        self.ifVerbose("Mapping reads to reference using BWA")
        with open(os.path.join(self.outdir, 'mapping/%s.SAM' % self.name),
                  'wb') as sam:
            with open(os.path.join(self.outdir, 'mapping/%s.BAM' % self.name),
                      'wb') as bam:
                if self.paired:
                    sam_output = self.runCommand(
                        ['bwa', 'mem', 'reference', self.input, self.input2],
                        os.path.join(self.outdir, 'mapping'),
                        write_output=True)
                    sam.write(sam_output)
                    bam_output = self.runCommand(
                        ['samtools', 'view', '-Sb', self.name + ".SAM"],
                        os.path.join(self.outdir, 'mapping'),
                        write_output=True)
                    bam.write(bam_output)
                else:
                    # Shell redirection ('>') does not work inside an argv
                    # list; stdout is captured via write_output instead.
                    sam_output = self.runCommand(
                        ['bwa', 'mem', 'reference', self.input],
                        os.path.join(self.outdir, 'mapping'),
                        write_output=True)
                    sam.write(sam_output)
                    bam_output = self.runCommand(
                        ['samtools', 'view', '-Sb', self.name + ".SAM"],
                        os.path.join(self.outdir, 'mapping'),
                        write_output=True)
                    bam.write(bam_output)

    """Sort BAM files using Samtools"""

    def samtools(self):
        self.ifVerbose("Sorting BAM files using Samtools")
        self.runCommand([
            'samtools', 'sort', '-o', self.name + '_sorted.BAM',
            self.name + '.BAM'
        ],
                        os.path.join(self.outdir, 'mapping'),
                        write_output=False)

    """Checking mapping quality with Qualimap"""

    def qualimap(self):
        self.ifVerbose("Running qualimap BAM QC")

        self.runCommand([
            'qualimap', 'bamqc', '-bam',
            os.path.join(self.outdir, 'mapping/' + self.name + '_sorted.BAM'),
            '-outformat', 'HTML', '-outdir',
            os.path.join(self.outdir, "qualimap/" + self.name)
        ],
                        directory=None,
                        write_output=False)

    """Parse through report file obtained from Qualimap or Refseq_masher"""

    def parser(self, refseq, qualimap):
        if self.outlier:
            outlier_flag = False
            class_flag = False
            distance_flag = False
            map_flag = False
            quality_flag = False
            n_count = 0

            if refseq:
                self.ifVerbose("Parsing Refseq_masher report")
                with open(
                        os.path.join(os.path.join(self.outdir, 'mash'),
                                     self.name + '.match')) as csvfile:
                    for row in csv.DictReader(csvfile, delimiter='\t'):
                        if (int(row['taxid']) != int(self.taxid)
                                and int(row['taxid']) not in self.descendants):
                            outlier_flag = True
                            class_flag = int(row['taxid'])
                            if n_count >= self.n_results:
                                break
                            else:
                                n_count += 1
                        elif float(row['distance']) > 0.05:
                            outlier_flag = True
                            distance_flag = True
                            break

            if qualimap:
                self.ifVerbose("Parsing Qualimap report")
                report = open(self.outdir + '/qualimap/' + self.name +
                              '/genome_results.txt')

                for line in report:
                    if "number of mapped reads" in line:
                        mapped_percentage = line.split()[-1].strip('()%')
                    if "mean mapping quality" in line:
                        mean_mapping_quality = line.split()[-1]

                if float(mapped_percentage) < 90:
                    outlier_flag = True
                    map_flag = True
                elif float(mean_mapping_quality) < 10:
                    outlier_flag = True
                    quality_flag = True

            if outlier_flag:
                self.ifVerbose("%s is an outlier" % self.name)
                if class_flag:
                    self.outlier_file.write(
                        "%s\tClassified as %s (%d)\n" %
                        (self.name, self.ncbi.translate_to_names(
                            [class_flag])[0], class_flag))
                if distance_flag:
                    self.outlier_file.write(
                        "%s\tDistance is greater than 0.05\n" % self.name)
                if map_flag:
                    self.outlier_file.write(
                        "%s\tMapping percentage is lower than 90%%\n" %
                        self.name)
                if quality_flag:
                    self.outlier_file.write(
                        "%s\tMean mapping quality is lower than 10\n" %
                        self.name)

    """Move outlier sample"""

    def mark_outlier(self):
        # Renamed from `outlier`: the self.outlier flag assigned in __init__
        # shadowed a method of the same name on instances.
        self.ifVerbose("%s is an outlier" % self.name)
        # self.runCommand(['cp', self.input, os.path.join(self.outdir, 'outliers')], directory=None, write_output=False)
        # if self.paired:
        #     self.runCommand(['cp', self.input2, os.path.join(self.outdir, 'outliers')], directory=None, write_output=False)
        self.outlier_file.write("%s\n" % self.name)

    def ifVerbose(self, msg):
        if self.verbose:
            self.logger.info(msg)
Code example #14
def main():
    if len(sys.argv) != 4:
        mes = 'Usage: python {} "phylum,genus" <id2taxonid.table> <id2taxonid2taxon.table>\n'
        sys.stderr.write(mes.format(os.path.basename(sys.argv[0])))
        sys.exit(1)

    REQUIRED_RANK = [i.strip() for i in sys.argv[1].strip().split(',')]
    infile = sys.argv[2]
    outfile = sys.argv[3]

    if infile == '-':
        infile = '/dev/stdin'

    if outfile == '-':
        outfile = '/dev/stdout'

    ncbi = NCBITaxa()
    rank_lis_needed = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    with open(infile) as fp, open(outfile, 'w') as fw:
        cnt = 0
        for line in fp:
            line = line.rstrip()
            id, taxonid = line.split()
            try:
                lineage_num = ncbi.get_lineage(taxonid)
            except ValueError as e:
                mes = '*** Invalid TaxonID (not in NCBI Taxonomy db): {}\n'
                sys.stderr.write(mes.format(taxonid))
                continue
            lineage_tax = ncbi.translate_to_names(lineage_num)

            rank_dict = ncbi.get_rank(lineage_num)
            st_rank = set(rank_dict.values())

            if not set(REQUIRED_RANK).issubset(st_rank):
                mes = ('*** Irregular taxon format with TaxonID ({})'
                       '(skipped): \n{}\n{}\n')
                sys.stderr.write(
                    mes.format(taxonid, repr(lineage_tax), repr(st_rank)))
                cnt += 1
                continue

            n_temp = -1
            index_temp = -1
            lis_lineage_taxon = []
            for n, idd in enumerate(lineage_num):
                rank = rank_dict[idd]
                if rank in set(rank_lis_needed):
                    index = rank_lis_needed.index(rank)
                    if index != index_temp + 1:
                        # pad ranks missing from this lineage with 'Other'
                        skipped = index - 1 - index_temp
                        lis_lineage_taxon.extend(['Other'] * skipped)

                    #assert n_temp < n, 'n_temp: {}; n: {}'.format(n_temp, n)
                    lis_lineage_taxon.append(lineage_tax[n])
                    n_temp = n
                    index_temp = index

            l1 = len(lis_lineage_taxon)
            l2 = len(rank_lis_needed)

            if l1 != l2:
                lis_lineage_taxon.extend(['Other'] * (l2 - l1))

            assert len(lis_lineage_taxon) == len(rank_lis_needed)
            fw.write('{}\t{}\t{}\n'.format(id, taxonid,
                                           '\t'.join(lis_lineage_taxon)))

        mes = ('*** Number of taxonid with irregular taxonomy '
               '(without {} info): {}\n')
        sys.stderr.write(mes.format(', '.join(REQUIRED_RANK), cnt))
Code example #15
    print("Reading NCBI Taxa...")
    ncbi = NCBITaxa()
    print("Done...")

    if len(sys.argv) < 2:
        print("\nNeed exactly two parameters! None given...\n")
        print("Documentation:")
        print(__doc__)
        sys.exit(9)

    #root_taxon = 'Leptospira alexanderi'
    root_taxon = sys.argv[2]
    lineage = ncbi.get_descendant_taxa(root_taxon, intermediate_nodes=True)
    root_taxon_id = ncbi.get_name_translator([root_taxon])[root_taxon][0]
    lineage.append(root_taxon_id)
    names = ncbi.translate_to_names(lineage)
    seqs_by_taxon = dict()
    for name in names:
        seqs_by_taxon[name] = []
    if DEBUG: print("Total # of Taxons: %s " % (len(seqs_by_taxon)))
    if DEBUG: print("First 10 taxons: %s" % seqs_by_taxon.keys()[:10])

    FASTAFILE = sys.argv[1]
    FASTAFILE = os.path.expanduser(FASTAFILE)
    OUTFILE = os.path.splitext(FASTAFILE)[0] + '_' + root_taxon.replace(
        ' ', '_') + os.path.splitext(FASTAFILE)[1]

    if not os.path.isfile(FASTAFILE):
        raise OSError(2, 'No such file or directory:', FASTAFILE)
    else:
        with open(FASTAFILE, 'r') as f:
Code example #16
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import json
from sys import argv
from ete3 import Tree
from ete3 import NCBITaxa
ncbi = NCBITaxa()
ncbi.update_taxonomy_database()

descendants = ncbi.get_descendant_taxa(argv[1], collapse_subspecies=False) #2759 - eukarya
names = ncbi.translate_to_names(descendants)

def Back(linn):
    linn=re.sub(r"^la ", "", linn,flags=re.UNICODE)
    #linn = r"la " + linn
    #- final
    linn = re.sub(r"\bku'a la\b", "x", linn,flags=re.UNICODE)
    #linn = re.sub(r"\bx\b", "ku'a la", linn,flags=re.UNICODE)
    #- special words
    linn = re.sub(r"c$", "", linn,flags=re.UNICODE)
    linn = re.sub(r"c(?!\')\b", "", linn,flags=re.UNICODE)
    linn = re.sub(r"y$", "", linn,flags=re.UNICODE)
    linn = re.sub(r"y(?!\')\b", "", linn,flags=re.UNICODE)
    #linn = re.sub(r"([aeiou])$", "\g<1>c", linn,flags=re.UNICODE)
    #linn = re.sub(r"([aeiou]) ", "\g<1>c ", linn,flags=re.UNICODE)
    #linn = re.sub(r"c(?![\'])(\b|$)", "cyc", linn,flags=re.UNICODE)
    linn = re.sub(r"kau'", "q", linn,flags=re.UNICODE)
    #treat 'q'
    #- add consonant to the end
Code example #17
def main():

    # =============== #
    # 	PARAMETERS    #
    # =============== #

    args = docopt(__doc__)

    ai_features = args['--aifeatures']
    output_dir = get_outdir(args['--output'])
    fasta_inputfile = args['--fastafile']
    blast_inputfile = args['--blastfile']
    groups_yaml = args['--tax_groups']
    config_yaml = args['--config_file']
    ortho_groups = args['--ortho_groups']
    cfg_file = args['--cfg']

    # =============== #
    # 	MAIN          #
    # =============== #
    """
    0. Setting up
    """

    print("[+] Setting up")

    # Check if programs in path
    check_programs("blastdbcmd", "mafft")

    # Create folders
    fasta_folder = os.path.join(output_dir, "fastagroups")
    get_outdir(fasta_folder)

    mafft_folder = os.path.join(output_dir, "mafftgroups")
    get_outdir(mafft_folder)

    tmp_folder = os.path.join(output_dir, "tmp")
    get_outdir(tmp_folder)

    # Load proteome in memory
    record_dict = SeqIO.index(fasta_inputfile, "fasta")

    # Create taxonomic groups
    orgtag = "@StudiedOrganism"

    stream = open(groups_yaml, 'r')
    toi_egp = yaml.safe_load(stream)
    stream.close()

    if not cfg_file:
        cfg_file = os.path.join(sys.path[0], "depot", "taxonomy.yaml")
    stream = open(cfg_file, 'r')
    config_groups = yaml.safe_load(stream)
    stream.close()

    stream = open(config_yaml, 'r')
    config_opts = yaml.safe_load(stream)
    stream.close()

    threads = config_opts["max_threads"]
    trim = config_opts["trimal"]
    ai_cutoff = config_opts["ai_cutoff"]
    percent_identity = config_opts["percent_identity"]
    cutoffextend = config_opts["cutoffextend"]
    min_num_hits = config_opts["min_num_hits"]
    percentage_similar_hits = config_opts["percentage_similar_hits"]
    mode = config_opts["mode"]
    mafft_options = config_opts["mafft_options"]
    trimal_options = config_opts["trimal_options"]

    if trim:
        check_programs("trimal")
        trim_folder = os.path.join(output_dir, "trim")
        get_outdir(trim_folder)
    else:
        trim_folder = ""

    #Setting up NCBI Taxonomy
    ncbi = NCBITaxa()
    """
    1. Select HGT
    """

    query_dict_set = {}
    queries_info = {}

    with open(ai_features, 'r', encoding='utf8') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')

        # Read the header row once to locate the needed columns
        header = next(reader)
        i_query = header.index('query name')
        i_notoi = header.index('donor')
        i_toi = header.index('recipient')
        i_ai = header.index('AI')
        i_hgt = header.index('HGTindex')
        i_nbhits = header.index('query hits number')

        for row in reader:
            L_notoi = row[i_notoi].rstrip('\n').rsplit(':', 4)
            L_toi = row[i_toi].rstrip('\n').rsplit(':', 4)
            if (row[i_notoi] != '::::'):  #Skipping hits with only TOI
                if (float(row[i_nbhits]) >= min_num_hits
                        and float(L_notoi[2]) <= percent_identity
                        and float(row[i_ai]) >= ai_cutoff):
                    notoi_pos = int(L_notoi[1])
                    if (row[i_toi] == '::::'):
                        toi_pos = 0
                    else:
                        toi_pos = int(L_toi[1])
                    #Select at least 50 hits
                    last_pos = min(
                        max(max(toi_pos, notoi_pos) + cutoffextend, 50),
                        int(row[i_nbhits]))
                    queries_info[row[i_query]] = {'pos': last_pos}
                    query_dict_set[row[i_query]] = set()

    print("[!] Selected " + str(len(query_dict_set)) + " HGT candidates")
    """
    2. Parse Blast
    """

    print("[+] Parsing Blast file and grouping similar queries")

    extract_hit_id_set = set()

    with open_file(blast_inputfile) as fhr_bl:
        for line in fhr_bl:
            if ('#' not in line):
                L_hitqline = line.rstrip('\n').split('\t')
                query_id = L_hitqline[0]
                # Queries that pass the initial selection
                if query_id in queries_info:
                    if (len(query_dict_set[query_id]) <=
                            queries_info[query_id]["pos"]):
                        # Assign query_hit_id before validating it; the
                        # original checked it one line before defining it.
                        query_hit_id = L_hitqline[1]
                        if "@" in query_id:
                            sys.exit("@ symbol is not allowed: " + query_id)
                        if "@" in query_hit_id:
                            sys.exit("@ symbol is not allowed: " +
                                     query_hit_id)
                        extract_hit_id_set.add(query_hit_id)
                        query_dict_set[query_id].add(query_hit_id)  # GK

    # Group hits
    G = nx.Graph()

    if ortho_groups:
        num_groups = 0
        with open_file(ortho_groups) as fhr_og:
            for line in fhr_og:
                num_groups += 1
                members = line.split()
                for i in range(1, len(members), 1):
                    G.add_node(members[i])
                    if i > 1:
                        G.add_edge(members[i], members[i - 1])
        print("[!] Found " + str(num_groups) + " groups")

    else:
        for protein_id, hits in query_dict_set.items():
            G.add_node(protein_id)
            for protein_id_other, hitsc in query_dict_set.items():
                if protein_id != protein_id_other:
                    u = len(set.intersection(hits, hitsc))
                    m = min(len(hits), len(hitsc))
                    if (u / m) >= percentage_similar_hits:
                        G.add_edge(protein_id, protein_id_other)
        print("[!] Formed " + str(len(list(nx.connected_components(G)))) +
              " groups")
    """
    3. Extract hits
    """

    print("[+] Extracting hits from DB")

    extract_id_path = os.path.join(tmp_folder, "extract_id.txt")
    fhw_extract_id = open(extract_id_path, 'w')
    fhw_extract_id.write('\n'.join(extract_hit_id_set) + '\n')
    fhw_extract_id.close()

    setnrfa_path = os.path.join(tmp_folder, "setnr.fa")
    fhw_setnrfa = open(setnrfa_path, 'w')
    setnrlog_path = os.path.join(tmp_folder, "setnr.log")

    if mode == "nr":
        blastdbcmd_command = 'blastdbcmd -db ' + config_opts[
            "nr_db_path"] + ' -dbtype prot -entry_batch ' + extract_id_path + ' -target_only -outfmt ">%a@%T\n%s" -logfile ' + setnrlog_path + ' -out ' + setnrfa_path
        subprocess.call(blastdbcmd_command, shell=True)
    else:  # GK This is specific to SwissProt for now, have to test for UniProt in the future
        if mode == "sp":
            db_re = re.compile("OX=\d*")
        elif mode == "ur90":
            db_re = re.compile("TaxID=\d*")
        else:
            sys.exit(mode + " is not a valid mode")
        with open_file(config_opts["sp_fasta_path"]) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                if record.id in extract_hit_id_set:
                    ox, taxid = db_re.search(
                        record.description).group().split("=")
                    fhw_setnrfa.write(">" + record.id + "@" + taxid + "\n")
                    seq = str(record.seq)
                    fhw_setnrfa.write(seq + "\n")

    fhw_setnrfa.close()

    # Load hits to memory
    hits_dict = {}
    record_to_taxid = {}

    with open_file(setnrfa_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            id, taxid = record.id.rstrip('\n').split('@')
            record_to_taxid[id] = taxid
            hits_dict[id] = str(record.seq)
    """
    4. Write fasta
    """
    print("[+] Writing fasta files")

    taxonomy_nexus_path = os.path.join(tmp_folder, "taxonomy_nexus.txt")
    taxonomy_nexus = open(taxonomy_nexus_path, 'w')
    groups_tsv_path = os.path.join(output_dir, "groups.tsv")
    groups_tsv = open(groups_tsv_path, 'w')

    group_id = 1
    final_number_of_candidates = 0
    final_number_of_groups = 0

    group_dict = {}
    number_of_lost_taxids = 0
    number_of_lost_records = 0

    for subgraph in nx.connected_components(G):

        num_of_seqs_in_group = 0
        grouped_hits = []
        queries_to_fasta = []
        queries_to_group = []

        for q in subgraph:
            queries_to_fasta.append(q)
            queries_to_group.append(q)
            grouped_hits.extend(query_dict_set[q])

        grouped_hits = set(grouped_hits)

        if len(grouped_hits) >= min_num_hits:
            group_name_file = "gp" + str(group_id) + '.fa'
            gp_pathname = os.path.join(fasta_folder, group_name_file)
            fw_gp = open(gp_pathname, 'w')
            final_number_of_groups += 1

            for q in queries_to_fasta:
                final_number_of_candidates += 1
                fw_gp.write('>' + q + orgtag + '\n' + str(record_dict[q].seq) +
                            '\n')
                num_of_seqs_in_group += 1

            for record_id in grouped_hits:
                if record_id in hits_dict:
                    taxid = record_to_taxid[record_id]
                    try:
                        ncbi.get_lineage(taxid)
                        taxid_found = True
                    except ValueError:
                        # ete3 raises ValueError for taxids missing from the db
                        taxid_found = False
                    if not taxid or not taxid_found:
                        # actually print a file containing lost taxids
                        number_of_lost_taxids += 1
                        selectname = "Unknown"
                    else:
                        lnode = set(ncbi.get_lineage(taxid))
                        lname = ncbi.translate_to_names(
                            lnode)  # This does not output them in order
                        #print(ncbi.get_rank(lnode)) Maybe try this with {1: 'no rank', 2: 'superkingdom'}
                        taxonomy_nexus.write(
                            str(record_id) + "\t" + str(lname) + "\n")

                        egp_hit = list(
                            lnode.intersection(set(toi_egp["EGP"].keys())))
                        toi_hit = list(
                            lnode.intersection(set(toi_egp["TOI"].keys())))
                        cfg_hit = list(
                            lnode.intersection(
                                set(config_groups["Other"].keys())))
                        kdom_hit = list(
                            lnode.intersection(
                                set(config_groups["Kingdom"].keys())))

                        if egp_hit:
                            selectname = "EGP-" + toi_egp["EGP"][egp_hit[0]]
                        elif toi_hit:
                            selectname = "TOI-" + toi_egp["TOI"][toi_hit[0]]
                        elif cfg_hit:
                            selectname = config_groups["Other"][cfg_hit[0]]
                        elif kdom_hit:
                            selectname = config_groups["Kingdom"][kdom_hit[0]]
                        else:
                            selectname = "Unknown"

                    fw_gp.write(">" + record_id + "@" + selectname + "\n")
                    fw_gp.write(hits_dict[record_id] + "\n")
                    num_of_seqs_in_group += 1
                else:
                    # actually print a file containing lost gids
                    number_of_lost_records += 1

            groups_tsv.write(group_name_file + '\t' +
                             str(num_of_seqs_in_group) + '\t' +
                             '\t'.join(queries_to_group) + '\n')
            group_dict[group_name_file] = num_of_seqs_in_group
            fw_gp.close()
            group_id += 1

    groups_tsv.close()

    print("[!] Skipped " + str(number_of_lost_records) + " hits and " +
          str(number_of_lost_taxids) + " taxids.")
    """
    5. Align fasta
    """

    print("[+] Aligning fasta files")

    jobs = threads
    p = Pool(jobs)

    job_list = []

    for group_name, value in sorted(group_dict.items(),
                                    key=itemgetter(1),
                                    reverse=True):
        g_list = [
            group_name, fasta_folder, mafft_folder, trim, trim_folder,
            mafft_options, trimal_options
        ]
        job_list.append(g_list)

    i = 0
    for i, _ in enumerate(p.imap_unordered(run_mafft, job_list), 1):
        progress(i, 1, len(job_list))

    print("[!] Finished with " + str(final_number_of_candidates) +
          " HGT candidates in " + str(final_number_of_groups) + " groups")
Code example #18
File: workflow.py Project: alienzj/EukCC
    def inferLineage(self, places):
        """
        infer the lineage from looking at the location of placement
        looking at the leaves and their tax id
        and looking at the lineages of all these
        """
        if self.cfg["touch"]:
            return
        ncbi = NCBITaxa()
        # fetch file and load taxinformation
        seqinfo = self.config.pkgfile("concat.refpkg", "seq_info")
        taxids = {}
        si = base.readCSV(seqinfo)
        # make dictionary
        for r in si:
            taxids[r["seqname"]] = r["tax_id"]

        # for each placement:
        logging.debug("Infering lineages now")
        for p in places:
            # get the GCA names
            children = p["sisters"]
            # fetch lineages for all
            lngs = []
            for c in children:
                try:
                    lngs.append(ncbi.get_lineage(taxids[c]))
                except ValueError as e:
                    logging.warning(e)

            # find common elements (skip placements with no usable taxids,
            # which would otherwise raise IndexError on lngs[0])
            if not lngs:
                continue
            common = set(lngs[0])
            for l in lngs[1:]:
                common = common & set(l)

            # common lineage
            lng = []
            for v in lngs[0]:
                if v not in common:
                    break
                # add common elements
                lng.append(v)

            nodetaxid = lng[-1]
            # now we can make it pretty
            if not self.cfg["fullineage"]:
                # limit to the desired ranks
                desired_ranks = [
                    "superkingdom",
                    "kingdom",
                    "phylum",
                    "class",
                    "order",
                    "family",
                    "genus",
                    "species",
                ]
                lineage2ranks = ncbi.get_rank(lng)
                ranks2lineage = dict(
                    (rank, taxid) for (taxid, rank) in lineage2ranks.items())
                ranks = {
                    "{}_id".format(rank): ranks2lineage.get(rank, "NA")
                    for rank in desired_ranks
                }
                lng = [i for i in lng if i in ranks.values()]
            # get translator and make string
            named = ncbi.translate_to_names(lng)
            # save to placed object
            p["lineage"] = "_".join(named)
            # replace spaces in names with dots
            p["lineage"] = p["lineage"].replace(" ", ".")
            p["taxidlineage"] = "_".join([str(x) for x in lng])
            p["taxid"] = nodetaxid

        return ()
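The heart of inferLineage is taking the longest shared prefix of the sister leaves' NCBI lineages, i.e. their lowest common ancestor. A minimal sketch of that computation with ete3 (the two taxids, human 9606 and mouse 10090, are arbitrary illustrations):

from ete3 import NCBITaxa

ncbi = NCBITaxa()
lineages = [ncbi.get_lineage(t) for t in (9606, 10090)]

# Intersect all lineages, then keep the prefix of the first lineage that
# every other lineage shares; its last element is the common ancestor.
common = set(lineages[0]).intersection(*lineages[1:])
shared = []
for taxid in lineages[0]:
    if taxid not in common:
        break
    shared.append(taxid)

print(ncbi.translate_to_names(shared))  # should end with 'Euarchontoglires'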
Code example #19
0
        # Excerpt: this block runs inside a loop over alignment columns, where
        # refaa is the column's reference residue and frac its conservation.
        # It assumes Counter/defaultdict from collections and an ete3 NCBITaxa
        # instance named ncbi, defined earlier in the file (not shown here).
        if refaa != '-' and frac > 0.66:
            valid_cols += 1
            variants = a[a[col] != refaa][col].to_dict()
            refaas.append(refaa)
            for sp, var in variants.items():
                sp = sp.split(".")[0]
                spvariants[sp].update([(refaa, var)])

        # if valid_cols > 500:
        #     break

refaacounter = Counter(refaas)

for sp, varcounter in spvariants.items():
    try:
        sp_name = ncbi.translate_to_names([int(sp.split(".")[0])])[0]
    except ValueError:
        sp_name = "oxymonad-%s" % sp
    for varc in varcounter:
        ratio = varcounter[varc] / float(refaacounter[varc[0]])
        if ratio > 0.25:
            print(sp, sp_name, "%s\t%s/%s" % ("->".join(varc), varcounter[varc], refaacounter[varc[0]]))
    # # print(varcounter.most_common(1)[0], varcounter[('W', 'X')])
    # most_common = varcounter.most_common(1)[0]
    # ratio = most_common[1] / float(refaacounter[most_common[0][0]])
    # # if varcounter.most_common(1)[0][1] > 10 and "-" not in varcounter.most_common(1)[0][0]:
    # if ratio > 0.33:  # and "-" not in most_common[0]:
    #     print(sp, ncbi.translate_to_names([int(sp.split(".")[0])])[0])
    #     for varc in varcounter:
    #         # if varcounter[varc] > 2:  # /float(refaacounter[varc[0]]) > 0.2:
    #         try:
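The snippet's bookkeeping boils down to two Counters: one over reference residues, and one per species over (reference, variant) pairs. A self-contained sketch with made-up observations, detached from the alignment parsing above:

from collections import Counter, defaultdict

spvariants = defaultdict(Counter)
refaas = []

# (species taxid, reference aa, observed aa) triples, e.g. from an alignment.
observations = [("9606", "W", "X"), ("9606", "W", "X"), ("9606", "L", "L")]
for sp, refaa, var in observations:
    refaas.append(refaa)
    if var != refaa:
        spvariants[sp].update([(refaa, var)])

refaacounter = Counter(refaas)
for sp, varcounter in spvariants.items():
    for (ref, var), n in varcounter.items():
        # fraction of columns with this reference residue showing the variant
        print("%s: %s->%s in %d/%d columns" % (sp, ref, var, n, refaacounter[ref]))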
Code example #20
0
 # host string reported by the external command, stripped of quoting artifacts
 host_gb = out.decode("utf-8").rstrip().replace('"', '').replace("[u'", "")
 cleaned_host_gb = host_gb.split(' (', 1)[0]
 if cleaned_host_gb == "mosquito" or cleaned_host_gb == "mosquitoes":
     cleaned_host_gb = "Culicoidea"
 if cleaned_host_gb != '':
     liste.extend([ids, cleaned_host_gb])
     liste_host.append(liste)
     name2taxid = ncbi.get_name_translator([cleaned_host_gb])
     fieldnames.append(host_gb)
     if cleaned_host_gb in name2taxid:
         host_id = int(name2taxid[cleaned_host_gb][0])
         lineage = ncbi.get_lineage(host_id)
         fieldnames.append(host_id)
 if cleaned_host_gb != '' and cleaned_host_gb in name2taxid:
     lineage2name = ncbi.translate_to_names(lineage)
     lineage2ranks = ncbi.get_rank(lineage)
     Dic_lineage2name = dict(zip(lineage, lineage2name))
     for (taxid, rank) in lineage2ranks.items():
         if rank in desired_ranks:
             if not rank in ranks2lineage:
                 ranks2lineage[rank] = [taxid]
             else:
                 ranks2lineage[rank].append(taxid)
     ranks2names = {}
     for rank in ranks2lineage:
         # join the names for this rank with ";" separators
         ranks2names[rank] = ";".join(
             Dic_lineage2name[taxid] for taxid in ranks2lineage[rank])
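The same name-to-ranks round trip can be written as a small standalone helper: translate a host name to a taxid, fetch its lineage, then keep only the desired ranks, named. A sketch under the assumption that the name resolves uniquely ('Culicoidea', as in the mosquito special case above):

from ete3 import NCBITaxa

ncbi = NCBITaxa()
desired_ranks = ["superkingdom", "phylum", "class", "order", "family"]

name2taxid = ncbi.get_name_translator(["Culicoidea"])
if "Culicoidea" in name2taxid:
    lineage = ncbi.get_lineage(name2taxid["Culicoidea"][0])
    taxid2name = dict(zip(lineage, ncbi.translate_to_names(lineage)))
    for taxid, rank in ncbi.get_rank(lineage).items():
        if rank in desired_ranks:
            print(rank, taxid2name[taxid])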
Code example #21
0
File: phylotree.py Project: KirkVM/kdatapack
    def update_leafcdsdict_fromxr(self, xrpathstr):  # ,fields=['pdbids','ecs','subfam','extragbs'],searchby='gbacc'):
        """Annotate each leaf with taxonomy names taken from an xarray dataset."""
        mds = xr.open_dataset(xrpathstr)
        ncbitaxa = NCBITaxa()
        # The same lookup applies at every rank: keep the accessions whose
        # taxid is known at that rank, translate those taxids to names, then
        # map each leaf accession to its name (or 'Unknown').
        for rank in ('species', 'class', 'phylum', 'genus', 'superkingdom'):
            knownra = mds.taxra[~np.isnan(mds.taxra.loc[:, rank].values)]
            known_accs = knownra.dbseq.values
            known_names = ncbitaxa.translate_to_names(knownra.loc[:, rank].values)
            knowndict = {ka: kn for ka, kn in zip(known_accs, known_names)}
            self.leaf_cds_dict[rank] = [
                knowndict.get(gbacc, 'Unknown')
                for gbacc in self.leaf_cds_dict['gbacc']
            ]
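Each per-rank block above is the same translate-once, look-up-many pattern; it also works outside xarray. A tiny standalone sketch (accessions and taxids invented for illustration):

from ete3 import NCBITaxa

ncbi = NCBITaxa()

# accession -> taxid, with None marking leaves whose taxid is unknown
leaf_taxids = {"ACC1": 9606, "ACC2": 562, "ACC3": None}

known = {acc: t for acc, t in leaf_taxids.items() if t is not None}
names = ncbi.translate_to_names(list(known.values()))
acc2name = dict(zip(known.keys(), names))

# Unknown leaves fall back to 'Unknown', as in update_leafcdsdict_fromxr.
annotations = {acc: acc2name.get(acc, "Unknown") for acc in leaf_taxids}
print(annotations)  # e.g. {'ACC1': 'Homo sapiens', 'ACC2': 'Escherichia coli', 'ACC3': 'Unknown'}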