Esempi in Python per get_taxid_translator

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: common.ncbi

Metodo/funzione: get_taxid_translator

Esempi su hotexamples.com: 4

get_taxid_translator in Python: 4 esempi trovati. Questi sono i migliori esempi reali in Python per common.ncbi.get_taxid_translator, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Esempio n. 1

Mostra file

File: ete_ncbiquery.py Progetto: daisieh/ete

def main(argv):
    """Run the NCBI taxonomy query tool with the given command-line arguments.

    Depending on the options, this function:

      * translates species/taxa names into taxids and prints one
        tab-separated line per name (score, name, matched name, taxid);
      * with ``--info``, prints the name and lineage of every requested
        taxid and reports taxids without a translation on stderr;
      * with ``--taxonomy``, builds the topology returned by
        ``ncbi.get_topology``, annotates its nodes and saves it as four
        ``your_ncbi_query*.nw`` Newick files (relative paths);
      * with ``--reftree``, annotates the leaves of the given tree with
        ``taxid``, ``sci_name`` and ``ncbi_track`` and prints the tree.

    If none of ``--taxonomy``, ``--info`` or ``--reftree`` is given, the
    usage message is printed and the process exits with status 0.

    :param argv: list of command-line argument strings passed to argparse.
    """
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",  dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    parser.add_argument("-t", "--taxid", dest="taxid", nargs="+",
                        type=int,
                        help="""taxids (space separated)""")

    parser.add_argument("-tf", "--taxid_file", dest="taxid_file",
                        type=str,
                        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-r", "--reftree", dest="reftree",
                        type=str,
                        help="""tree file containing taxids as node names.""")

    parser.add_argument("--reftree_attr", dest="reftree_attr",
                        type=str, default="name",
                        help="""Where taxid should be read from""")

    parser.add_argument("-n", "--name", dest="names", nargs="+",
                        type=str,
                        help="""species or taxa names (comma separated)""")

    # The help text used to say "taxids" (copy-pasted from --taxid_file).
    parser.add_argument("-nf", "--names_file", dest="names_file",
                        type=str,
                        help="""file containing a list of names (one per line)""")

    parser.add_argument("-x", "--taxonomy", dest="taxonomy",
                        action="store_true",
                        help=("returns a pruned version of the NCBI taxonomy"
                              " tree containing target species"))

    parser.add_argument("--show_tree", dest="show_tree",
                        action="store_true",
                        help="""shows the NCBI taxonomy tree of the provided species""")

    parser.add_argument("--collapse_subspecies", dest="collapse_subspecies",
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    parser.add_argument("--rank_limit", dest="rank_limit",
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))

    parser.add_argument("--full_lineage", dest="full_lineage",
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))

    parser.add_argument("-i", "--info", dest="info",
                        action="store_true",
                        help="""shows NCBI information about the species""")

    parser.add_argument("--fuzzy", dest="fuzzy", type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    args = parser.parse_args(argv)
    # Nothing to do unless at least one output mode was requested.
    if not args.taxonomy and not args.info and not args.reftree:
        parser.print_usage()
        sys.exit(0)

    if args.fuzzy:
        # Fuzzy search needs a raw connection so the Levenshtein SQLite
        # extension can be loaded on it further down.
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)

    all_names = set([])
    all_taxids = []

    if args.names_file:
        # Close the file deterministically instead of leaking the handle.
        with open(args.names_file, "rU") as names_handle:
            all_names.update(map(strip, names_handle.read().split("\n")))
    if args.names:
        # Names may contain spaces, so argparse tokens are re-joined and the
        # result is split on commas.
        all_names.update(map(strip, " ".join(args.names).split(",")))
    all_names.discard("")

    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            # Enabling and loading the extension does not depend on the name
            # being searched, so it is done once rather than per iteration.
            # NOTE(review): this function uses both MODULE_PATH and
            # module_path; confirm both are defined at module level.
            c.enable_load_extension(True)
            c.execute("select load_extension('%s')" % os.path.join(module_path,
                                        "SQLite-Levenshtein/levenshtein.sqlext"))
            for name in not_found:
                tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" %sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print("\t".join(map(str, [score, name, realname.capitalize(), taxid])))

    if args.taxid_file:
        with open(args.taxid_file, "rU") as taxid_handle:
            all_taxids.extend(map(strip, taxid_handle.read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(list(set([getattr(n, args.reftree_attr) for n in reftree.iter_leaves()])))

    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        # translate_merged returns the taxid collection to use from here on,
        # plus a mapping that is consulted when printing each taxid below.
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" %len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print("\t".join(map(str, [merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ])))

        # Compare as strings because taxids read from files are text while
        # translator keys may be of a different type.
        for notfound in set(map(str, all_taxids)) - set(str(k) for k in translator.iterkeys()):
            sys.stderr.write("%s NOT FOUND\n" % notfound)

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" %len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid as a feature before the node name is
            # rewritten to "<scientific name>{<taxid>}".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse() if n.rank == "species"
                             if int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # Create a copy of the species node so the species itself
                    # appears as a sister of its former descendants.
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" %n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        if args.show_tree:
            t.show()

        # The message now names the files that are actually written below.
        print("\n\n  ===== Newick files saved as 'your_ncbi_query.*' ===== ")
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"],
                outfile="your_ncbi_query.extended.nw")
        # The last file uses plain taxids as leaf names.
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name = translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print(reftree.write(features=["taxid", "sci_name", "ncbi_track"]))

Esempio n. 2

Mostra file

File: ete_diff.py Progetto: daisieh/ete

def main(argv):
    """Compare one or more target trees against a reference tree.

    Parses ``argv``, loads the reference tree given with ``-r`` and, for each
    target tree file, computes a difference table with ``treediff`` and shows
    it in the format selected by ``--report``. With ``--ncbi``, leaf names
    that parse as integers are replaced by the names returned by
    ``ncbi.get_taxid_translator`` before the comparison.

    The parsed arguments are stored in the module-level ``args`` variable.

    :param argv: list of command-line argument strings passed to argparse.
    """
    global args
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees", type=str, nargs="+", help="a list of target tree files")

    parser.add_argument("-r", dest="reftree", type=str, help="The reference tree to compare with")

    parser.add_argument(
        "--ref_attr",
        dest="ref_attr",
        default="name",
        help=("Defines the attribute in REFERENCE tree that will be used" " to perform the comparison"),
    )

    parser.add_argument(
        "--target_attr",
        dest="target_attr",
        default="name",
        help=("Defines the attribute in TARGET tree that will be used" " to perform the comparison"),
    )

    # store_false: the flag turns OFF the value later passed as
    # reduce_matrix to treediff. The help text gained the space that was
    # missing between its two concatenated parts.
    parser.add_argument(
        "--fullsearch",
        dest="fullsearch",
        action="store_false",
        help=("Enable this option if duplicated attributes (i.e. name)" " exist in reference or target trees."),
    )

    parser.add_argument("--quite", dest="quite", action="store_true", help="Do not show process information")

    parser.add_argument(
        "--report",
        dest="report",
        choices=["topology", "diffs", "diffs_tab", "summary"],
        default="topology",
        help="Different format for the comparison results",
    )

    parser.add_argument(
        "--ncbi",
        dest="ncbi",
        action="store_true",
        help="If enabled, it will use the ETE ncbi_taxonomy module to for ncbi taxid translation",
    )

    parser.add_argument(
        "--color", dest="color", action="store_true", help="If enabled, it will use colors in some of the report"
    )

    args = parser.parse_args(argv)

    if args.quite:
        logging.basicConfig(format="%(message)s", level=logging.WARNING)
    else:
        logging.basicConfig(format="%(message)s", level=logging.INFO)

    t1 = Tree(args.reftree)
    if args.ncbi:
        # Imported lazily so the taxonomy module is only needed with --ncbi.
        from common import ncbi

        ncbi.connect_database()

    for ttree in args.target_trees:
        t2 = Tree(ttree)

        if args.ncbi:
            taxids = set([getattr(leaf, args.ref_attr) for leaf in t1.iter_leaves()])
            taxids.update([getattr(leaf, args.target_attr) for leaf in t2.iter_leaves()])
            taxid2name = ncbi.get_taxid_translator(taxids)
            for leaf in t1.get_leaves() + t2.get_leaves():
                try:
                    leaf.name = taxid2name.get(int(leaf.name), leaf.name)
                except ValueError:
                    # Leaf name is not an integer taxid: keep it unchanged.
                    pass

        difftable = treediff(t1, t2, args.ref_attr, args.target_attr, reduce_matrix=args.fullsearch)
        if args.report == "topology":
            show_difftable_topo(difftable, args.ref_attr, args.target_attr, usecolor=args.color)
        elif args.report == "diffs":
            show_difftable(difftable)
        elif args.report == "diffs_tab":
            show_difftable_tab(difftable)
        elif args.report == "summary":
            # This branch used to test for "table", which is not among the
            # --report choices, so the summary report could never run. It
            # also unpacked seven names from a two-element slice.
            rf, rf_max = t1.robinson_foulds(t2, attr_t1=args.ref_attr, attr_t2=args.target_attr)[:2]
            show_difftable_summary(difftable, rf, rf_max)

Esempio n. 3

Mostra file

File: ete_diff.py Progetto: tarah28/ete

def main(argv):
    """Compare one or more target trees against a reference tree.

    Parses ``argv``, loads the reference tree given with ``-r`` and, for each
    target tree file, computes a difference table with ``treediff`` and shows
    it in the format selected by ``--report``. With ``--ncbi``, leaf names
    that parse as integers are replaced by the names returned by
    ``ncbi.get_taxid_translator`` before the comparison.

    The parsed arguments are stored in the module-level ``args`` variable.

    :param argv: list of command-line argument strings passed to argparse.
    """
    global args
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees",
                        type=str,
                        nargs="+",
                        help='a list of target tree files')

    parser.add_argument("-r",
                        dest='reftree',
                        type=str,
                        help='The reference tree to compare with')

    parser.add_argument(
        "--ref_attr",
        dest="ref_attr",
        default="name",
        help=("Defines the attribute in REFERENCE tree that will be used"
              " to perform the comparison"))

    parser.add_argument(
        "--target_attr",
        dest="target_attr",
        default="name",
        help=("Defines the attribute in TARGET tree that will be used"
              " to perform the comparison"))

    # store_false: the flag turns OFF the value later passed as
    # reduce_matrix to treediff. The help text gained the space that was
    # missing between its two concatenated parts.
    parser.add_argument(
        "--fullsearch",
        dest="fullsearch",
        action="store_false",
        help=("Enable this option if duplicated attributes (i.e. name)"
              " exist in reference or target trees."))

    parser.add_argument("--quite",
                        dest="quite",
                        action="store_true",
                        help="Do not show process information")

    parser.add_argument("--report",
                        dest="report",
                        choices=["topology", "diffs", "diffs_tab", "summary"],
                        default="topology",
                        help="Different format for the comparison results")

    parser.add_argument(
        "--ncbi",
        dest="ncbi",
        action="store_true",
        help=
        "If enabled, it will use the ETE ncbi_taxonomy module to for ncbi taxid translation"
    )

    parser.add_argument(
        "--color",
        dest="color",
        action="store_true",
        help="If enabled, it will use colors in some of the report")

    args = parser.parse_args(argv)

    if args.quite:
        logging.basicConfig(format='%(message)s', level=logging.WARNING)
    else:
        logging.basicConfig(format='%(message)s', level=logging.INFO)

    t1 = Tree(args.reftree)
    if args.ncbi:
        # Imported lazily so the taxonomy module is only needed with --ncbi.
        from common import ncbi
        ncbi.connect_database()

    for ttree in args.target_trees:
        t2 = Tree(ttree)

        if args.ncbi:
            taxids = set(
                [getattr(leaf, args.ref_attr) for leaf in t1.iter_leaves()])
            taxids.update(
                [getattr(leaf, args.target_attr) for leaf in t2.iter_leaves()])
            taxid2name = ncbi.get_taxid_translator(taxids)
            for leaf in t1.get_leaves() + t2.get_leaves():
                try:
                    leaf.name = taxid2name.get(int(leaf.name), leaf.name)
                except ValueError:
                    # Leaf name is not an integer taxid: keep it unchanged.
                    pass

        difftable = treediff(t1,
                             t2,
                             args.ref_attr,
                             args.target_attr,
                             reduce_matrix=args.fullsearch)
        if args.report == "topology":
            show_difftable_topo(difftable,
                                args.ref_attr,
                                args.target_attr,
                                usecolor=args.color)
        elif args.report == "diffs":
            show_difftable(difftable)
        elif args.report == "diffs_tab":
            show_difftable_tab(difftable)
        elif args.report == "summary":
            # This branch used to test for 'table', which is not among the
            # --report choices, so the summary report could never run. It
            # also unpacked seven names from a two-element slice.
            rf, rf_max = t1.robinson_foulds(
                t2, attr_t1=args.ref_attr, attr_t2=args.target_attr)[:2]
            show_difftable_summary(difftable, rf, rf_max)

Esempio n. 4

Mostra file

File: ete_ncbiquery.py Progetto: daisieh/ete

def main(argv):
    """Run the NCBI taxonomy query tool with the given command-line arguments.

    Depending on the options, this function:

      * translates species/taxa names into taxids and prints one
        tab-separated line per name (score, name, matched name, taxid);
      * with ``--info``, prints the name and lineage of every requested
        taxid and reports taxids without a translation on stderr;
      * with ``--taxonomy``, builds the topology returned by
        ``ncbi.get_topology``, annotates its nodes and saves it as four
        ``your_ncbi_query*.nw`` Newick files (relative paths);
      * with ``--reftree``, annotates the leaves of the given tree with
        ``taxid``, ``sci_name`` and ``ncbi_track`` and prints the tree.

    If none of ``--taxonomy``, ``--info`` or ``--reftree`` is given, the
    usage message is printed and the process exits with status 0.

    :param argv: list of command-line argument strings passed to argparse.
    """
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",
                        dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    parser.add_argument("-t",
                        "--taxid",
                        dest="taxid",
                        nargs="+",
                        type=int,
                        help="""taxids (space separated)""")

    parser.add_argument(
        "-tf",
        "--taxid_file",
        dest="taxid_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-r",
                        "--reftree",
                        dest="reftree",
                        type=str,
                        help="""tree file containing taxids as node names.""")

    parser.add_argument("--reftree_attr",
                        dest="reftree_attr",
                        type=str,
                        default="name",
                        help="""Where taxid should be read from""")

    parser.add_argument("-n",
                        "--name",
                        dest="names",
                        nargs="+",
                        type=str,
                        help="""species or taxa names (comma separated)""")

    # The help text used to say "taxids" (copy-pasted from --taxid_file).
    parser.add_argument(
        "-nf",
        "--names_file",
        dest="names_file",
        type=str,
        help="""file containing a list of names (one per line)""")

    parser.add_argument("-x",
                        "--taxonomy",
                        dest="taxonomy",
                        action="store_true",
                        help=("returns a pruned version of the NCBI taxonomy"
                              " tree containing target species"))

    parser.add_argument(
        "--show_tree",
        dest="show_tree",
        action="store_true",
        help="""shows the NCBI taxonomy tree of the provided species""")

    parser.add_argument("--collapse_subspecies",
                        dest="collapse_subspecies",
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    parser.add_argument("--rank_limit",
                        dest="rank_limit",
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))

    parser.add_argument("--full_lineage",
                        dest="full_lineage",
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))

    parser.add_argument("-i",
                        "--info",
                        dest="info",
                        action="store_true",
                        help="""shows NCBI information about the species""")

    parser.add_argument("--fuzzy",
                        dest="fuzzy",
                        type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    args = parser.parse_args(argv)
    # Nothing to do unless at least one output mode was requested.
    if not args.taxonomy and not args.info and not args.reftree:
        parser.print_usage()
        sys.exit(0)

    if args.fuzzy:
        # Fuzzy search needs a raw connection so the Levenshtein SQLite
        # extension can be loaded on it further down.
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)

    all_names = set([])
    all_taxids = []

    if args.names_file:
        # Close the file deterministically instead of leaking the handle.
        with open(args.names_file, "rU") as names_handle:
            all_names.update(map(strip, names_handle.read().split("\n")))
    if args.names:
        # Names may contain spaces, so argparse tokens are re-joined and the
        # result is split on commas.
        all_names.update(map(strip, " ".join(args.names).split(",")))
    all_names.discard("")

    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            # Enabling and loading the extension does not depend on the name
            # being searched, so it is done once rather than per iteration.
            # NOTE(review): this function uses both MODULE_PATH and
            # module_path; confirm both are defined at module level.
            c.enable_load_extension(True)
            c.execute("select load_extension('%s')" % os.path.join(
                module_path, "SQLite-Levenshtein/levenshtein.sqlext"))
            for name in not_found:
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print("\t".join(
                map(str,
                    [score, name, realname.capitalize(), taxid])))

    if args.taxid_file:
        with open(args.taxid_file, "rU") as taxid_handle:
            all_taxids.extend(map(strip, taxid_handle.read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            list(
                set([
                    getattr(n, args.reftree_attr)
                    for n in reftree.iter_leaves()
                ])))

    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        # translate_merged returns the taxid collection to use from here on,
        # plus a mapping that is consulted when printing each taxid below.
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" % len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print("\t".join(
                map(str, [
                    merge_conversion.get(int(taxid), taxid), name,
                    named_lineage, lineage
                ])))

        # Compare as strings because taxids read from files are text while
        # translator keys may be of a different type.
        for notfound in set(map(str, all_taxids)) - set(
                str(k) for k in translator.iterkeys()):
            sys.stderr.write("%s NOT FOUND\n" % notfound)

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" % len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid as a feature before the node name is
            # rewritten to "<scientific name>{<taxid>}".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [
                n for n in t.traverse() if n.rank == "species"
                if int(n.taxid) in all_taxids
            ]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # Create a copy of the species node so the species itself
                    # appears as a sister of its former descendants.
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        if args.show_tree:
            t.show()

        # The message now names the files that are actually written below.
        print("\n\n  ===== Newick files saved as 'your_ncbi_query.*' ===== ")
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9,
                features=[
                    "taxid", "name", "rank", "bgcolor", "sci_name",
                    "collapse_subspecies", "named_lineage"
                ],
                outfile="your_ncbi_query.extended.nw")
        # The last file uses plain taxids as leaf names.
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print(reftree.write(features=["taxid", "sci_name", "ncbi_track"]))