Example #1
def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    for line in open(trees_file):
        if not line.strip() or line.startswith('#'):
            continue

        t = PhyloTree(line, sp_naming_function=spname)
        #t.set_outgroup(t.get_midpoint_outgroup())

        for lf in t:
            lf.add_features(coded_name=lf.name)
            if lf.name in NAME2SP:
                lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])

        t.dist = 0
        ncbi.connect_database()
        name2sp = ncbi.get_name_translator(t.get_species())
        for lf in t.iter_leaves():
            lf.add_features(taxid=name2sp.get(lf.species, 0))

        t.set_outgroup(t.search_nodes(taxid=9606)[0])
        ncbi.annotate_tree(t, attr_name='taxid')
        t.set_outgroup(
            t.get_common_ancestor(
                [lf for lf in t if outgroup_lineage in lf.named_lineage]))
        ncbi.annotate_tree(t, attr_name='taxid')

        #print t.write(features=[])
        #print t.write()
        yield t
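
The generator above relies on three names defined elsewhere in the original script: spname (the sp_naming_function), NAME2SP (a mapping from coded leaf names to readable species labels) and ncbi (ETE's NCBI taxonomy helper providing connect_database, get_name_translator and annotate_tree). A minimal, hypothetical sketch of that scaffolding and of how the generator could be driven; every definition below is an illustrative assumption, not the original code:

from ete2 import PhyloTree          # assumed ETE2-era import, matching the Python 2 print syntax

NAME2SP = {}                        # assumed: coded leaf name -> readable species label

def spname(node_name):
    # assumed sp_naming_function: use the species code after the last '_'
    return node_name.split("_")[-1]

# 'ncbi' (connect_database / get_name_translator / annotate_tree) comes from
# ETE's NCBI taxonomy utilities in the original script and is not redefined here.

if __name__ == "__main__":
    import sys
    for tree in translate_ids(sys.argv[1]):
        print tree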
Example #2
def translate_ids(trees_file, outgroup_lineage="Bacteria"):
    for line in open(trees_file):
        if not line.strip() or line.startswith('#'):
            continue

        t = PhyloTree(line, sp_naming_function=spname)
        #t.set_outgroup(t.get_midpoint_outgroup())

        for lf in t:
            lf.add_features(coded_name=lf.name)
            if lf.name in NAME2SP:
                lf.name = "%s {%s}" % (lf.name, NAME2SP[lf.name])
         
        t.dist = 0
        ncbi.connect_database()
        name2sp = ncbi.get_name_translator(t.get_species())
        for lf in t.iter_leaves():
            lf.add_features(taxid=name2sp.get(lf.species, 0))

        t.set_outgroup(t.search_nodes(taxid=9606)[0])
        ncbi.annotate_tree(t, attr_name='taxid')
        t.set_outgroup(t.get_common_ancestor([lf for lf in t if outgroup_lineage in lf.named_lineage]))
        ncbi.annotate_tree(t, attr_name='taxid')
            
        #print t.write(features=[])
        #print t.write()
        yield t
Example #3
      seqA   MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA
      seqB   MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA
      seqC   MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA
      seqD   MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ---
             LTNVSHQFMA LTNVSH
             LTNVSH---- ------
             LTNVSH---- ------
             -------FMA LTNVSH
"""
# Load a tree and link it to an alignment. As usual, 'alignment' can
# be the path to a file or data in text format.
t = PhyloTree("(((seqA,seqB),seqC),seqD);",
              alignment=fasta_txt,
              alg_format="fasta")

# We can now access the sequence of every leaf node
print "These are the nodes and their sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
#seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
#
# The associated alignment can be changed at any time
t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip")
# Let's check that sequences have changed
print "These are the nodes and its re-linked sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAHQ----------FMALTNVSH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAHQFMALTNVSH----------
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAHQFMALTNVSHQFMALTNVSH
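
As the comments above note, the alignment argument also accepts a file path. A small sketch of that form, reusing the PhyloTree class already loaded above and a throwaway FASTA file; the sequences here are made up purely for illustration:

import tempfile

# write a toy alignment to disk and link it to the same topology by path
toy_fasta = ">seqA\nMAEL-K\n>seqB\nMAEI-K\n>seqC\nMA-LQK\n>seqD\nMAELQK\n"
tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".fasta", delete=False)
tmp.write(toy_fasta)
tmp.close()

t2 = PhyloTree("(((seqA,seqB),seqC),seqD);",
               alignment=tmp.name, alg_format="fasta")
print t2.get_leaves()[0].name, t2.get_leaves()[0].sequence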
Example #4
def main(argv):
    
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                            formatter_class=argparse.RawDescriptionHelpFormatter)


    input_args = parser.add_argument_group("INPUT OPTIONS")
    input_args.add_argument("source_trees", metavar='source_trees', type=str, nargs="*",
                   help='a list of source tree files')
    
    input_args.add_argument("--source_file", dest="source_file", 
                        type=str, 
                        help="""path to a file containing many source trees, one per line""")

    input_args.add_argument("-r", dest="reftree", 
                        type=str, required=True,
                        help="""Reference tree""")

    input_args.add_argument("--ref_tree_attr", dest="ref_tree_attr", 
                            type=str, default="name",
                            help=("attribute in ref tree used as leaf name"))
    
    input_args.add_argument("--src_tree_attr", dest="src_tree_attr", 
                            type=str, default="name",
                            help=("attribute in source tree used as leaf name"))

    input_args.add_argument("--min_support_ref",
                            type=float, default=0.0,
                        help=("min support for branches to be considered from the ref tree"))
    input_args.add_argument("--min_support_src",
                        type=float, default=0.0,
                        help=("min support for branches to be considered from the source tree"))

    
    output_args = parser.add_argument_group("OUTPUT OPTIONS")
    
    output_args.add_argument("-o", dest="output", 
                            type=str,
                            help="""Path to the tab delimited report file""")

    
    opt_args = parser.add_argument_group("DISTANCE OPTIONS")
    

    opt_args.add_argument("--outgroup", dest="outgroup", 
                        nargs = "+",
                        help="""outgroup used to root reference and source trees before distance computation""")
  
    opt_args.add_argument("--expand_polytomies", dest="polytomies", 
                        action = "store_true",
                        help="""expand politomies if necessary""")
  
    opt_args.add_argument("--unrooted", dest="unrooted", 
                        action = "store_true",
                        help="""compare trees as unrooted""")

    opt_args.add_argument("--min_support", dest="min_support", 
                        type=float, default=0.0,
                        help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"))

    opt_args = parser.add_argument_group("PHYLOGENETICS OPTIONS")
    
    opt_args.add_argument("--extract_species",
                        action = "store_true",
                        help="When used, leaf names in the reference and source trees are assumed to represent species."
                          " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name,"
                          " it can be automatically extracted by providing a Perl regular expression that extract a "
                          " valid species code (see --sp_regexp). Such information will be also used to detect duplication"
                          " events. ")

    opt_args.add_argument("--sp_regexp", 
                          type=str,
                         help=("Specifies a Perl regular expression to automatically extract species names"
                          " from the name string in source trees. If not used, leaf names are assumed to represent species names."
                          " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'."))
        
    opt_args.add_argument("--collateral", 
                        action='store_true', 
                        help=(""))

    
    args = parser.parse_args(argv)
    print __DESCRIPTION__
    reftree = args.reftree
    if args.source_file and args.source_trees:
        print >>sys.stderr, 'The use of --source_file and source_trees at the same time is not supported.'
        sys.exit(1)
        
    if args.source_file:
        source_trees = tree_iterator(args.source_file)
    else:
        source_trees = args.source_trees
        
    ref_tree = Tree(reftree)

    if args.ref_tree_attr:
        for lf in ref_tree.iter_leaves():
            lf._origname = lf.name
            if args.ref_tree_attr not in lf.features:
                print lf
            lf.name = getattr(lf, args.ref_tree_attr)
    
    if args.outgroup:
        if len(args.outgroup) > 1:
            out = ref_tree.get_common_ancestor(args.outgroup)
        else:
            out = ref_tree.search_nodes(name=args.outgroup[0])[0]
        ref_tree.set_outgroup(out)
                     

    HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF', 'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist")
    COL_WIDTHS = [20, 20] + [9] * 10  # on-screen column widths, needed in the loop below even with -o
    if args.output:
        OUT = open(args.output, "w")
        print >>OUT, '# ' + ctime()
        print >>OUT, '# ' + ' '.join(sys.argv) 
        print >>OUT, '#'+'\t'.join(HEADER)
    else:
        print '# ' + ctime()
        print '# ' + ' '.join(sys.argv) 
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')
        
                
    prev_tree = None
    ref_fname = os.path.basename(args.reftree)
    for counter, tfile in enumerate(source_trees):
        if args.source_file:
            seedid, tfile = tfile
        else:
            seedid = None
           
        if args.extract_species:

            if args.sp_regexp:
                SPMATCHER = re.compile(args.sp_regexp)
                get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0]
            else:
                get_sp_name = lambda x: x
                
            tt = PhyloTree(tfile, sp_naming_function = get_sp_name)
        else:
            tt = Tree(tfile)

        if args.src_tree_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.src_tree_attr)
            
        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)
        
        if args.source_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' %counter                          


            
        r = tt.compare(ref_tree, 
                       ref_tree_attr=args.ref_tree_attr,
                       source_tree_attr=args.src_tree_attr,
                       min_support_ref=args.min_support_ref,
                       min_support_source = args.min_support_src,
                       unrooted=args.unrooted,
                       has_duplications=args.extract_species)

                          

        print_table([map(istr, [fname[-30:], ref_fname[-30:], r['effective_tree_size'], r['norm_rf'],
                               r['rf'], r['max_rf'], r["source_edges_in_ref"],
                               r["ref_edges_in_source"], r['source_subtrees'], r['treeko_dist']])],
                    fix_col_width = COL_WIDTHS, wrap_style='cut')
                          

    if args.output:
        OUT.close()
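
The script also assumes two small helpers that are not shown here: tree_iterator (whose items are unpacked as (seedid, tfile) pairs when --source_file is used) and istr (used to stringify the report-table cells). A hypothetical sketch inferred from their call sites, not taken from the original source:

def tree_iterator(trees_file):
    # assumed: yield (seed_id, newick_text) pairs, one tree per non-empty line
    for nline, line in enumerate(open(trees_file)):
        line = line.strip()
        if line and not line.startswith("#"):
            yield str(nline), line

def istr(value):
    # assumed: compact string form for table cells
    return "%0.3f" % value if isinstance(value, float) else str(value)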
Example #5
        if args.ref_tree:
            print "Reading ref tree from", args.ref_tree
            reft = Tree(args.ref_tree, format=1)

        else:
            reft = None

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees = t.split_by_dups()
                print "Subparts:", len(subtrees), time.time() - t1
            else:
                subtrees = [t]
Example #6
def main(argv):

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    input_args = parser.add_argument_group("INPUT OPTIONS")
    input_args.add_argument("source_trees",
                            metavar='source_trees',
                            type=str,
                            nargs="*",
                            help='a list of source tree files')

    input_args.add_argument(
        "--source_file",
        dest="source_file",
        type=str,
        help="""path to a file containing many source trees, one per line""")

    input_args.add_argument("-r",
                            dest="reftree",
                            type=str,
                            required=True,
                            help="""Reference tree""")

    input_args.add_argument("--ref_tree_attr",
                            dest="ref_tree_attr",
                            type=str,
                            default="name",
                            help=("attribute in ref tree used as leaf name"))

    input_args.add_argument(
        "--src_tree_attr",
        dest="src_tree_attr",
        type=str,
        default="name",
        help=("attribute in source tree used as leaf name"))

    input_args.add_argument(
        "--min_support_ref",
        type=float,
        default=0.0,
        help=("min support for branches to be considered from the ref tree"))
    input_args.add_argument(
        "--min_support_src",
        type=float,
        default=0.0,
        help=(
            "min support for branches to be considered from the source tree"))

    output_args = parser.add_argument_group("OUTPUT OPTIONS")

    output_args.add_argument("-o",
                             dest="output",
                             type=str,
                             help="""Path to the tab delimited report file""")

    opt_args = parser.add_argument_group("DISTANCE OPTIONS")

    opt_args.add_argument(
        "--outgroup",
        dest="outgroup",
        nargs="+",
        help=
        """outgroup used to root reference and source trees before distance computation"""
    )

    opt_args.add_argument("--expand_polytomies",
                          dest="polytomies",
                          action="store_true",
                          help="""expand politomies if necessary""")

    opt_args.add_argument("--unrooted",
                          dest="unrooted",
                          action="store_true",
                          help="""compare trees as unrooted""")

    opt_args.add_argument(
        "--min_support",
        dest="min_support",
        type=float,
        default=0.0,
        help=
        ("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"
         ))

    opt_args = parser.add_argument_group("PHYLOGENETICS OPTIONS")

    opt_args.add_argument(
        "--extract_species",
        action="store_true",
        help=
        "When used, leaf names in the reference and source trees are assumed to represent species."
        " If target trees are gene-trees whose species information is encoded as a part of the leaf sequence name,"
        " it can be automatically extracted by providing a Perl regular expression that extract a "
        " valid species code (see --sp_regexp). Such information will be also used to detect duplication"
        " events. ")

    opt_args.add_argument(
        "--sp_regexp",
        type=str,
        help=
        ("Specifies a Perl regular expression to automatically extract species names"
         " from the name string in source trees. If not used, leaf names are assumed to represent species names."
         " Example: use this expression '[^_]+_(.+)' to extract HUMAN from the string 'P53_HUMAN'."
         ))

    opt_args.add_argument("--collateral", action='store_true', help=(""))

    args = parser.parse_args(argv)
    print __DESCRIPTION__
    reftree = args.reftree
    if args.source_file and args.source_trees:
        print >> sys.stderr, 'The use of --source_file and source_trees at the same time is not supported.'
        sys.exit(1)

    if args.source_file:
        source_trees = tree_iterator(args.source_file)
    else:
        source_trees = args.source_trees

    ref_tree = Tree(reftree)

    if args.ref_tree_attr:
        for lf in ref_tree.iter_leaves():
            lf._origname = lf.name
            if args.ref_tree_attr not in lf.features:
                print lf
            lf.name = getattr(lf, args.ref_tree_attr)

    if args.outgroup:
        if len(args.outgroup) > 1:
            out = ref_tree.get_common_ancestor(args.outgroup)
        else:
            out = ref_tree.search_nodes(name=args.outgroup[0])[0]
        ref_tree.set_outgroup(out)

    HEADER = ("source tree", 'ref tree', 'common\ntips', 'normRF', 'RF',
              'maxRF', "%reftree", "%genetree", "subtrees", "treeko\ndist")
    COL_WIDTHS = [20, 20] + [9] * 10  # on-screen column widths, needed in the loop below even with -o
    if args.output:
        OUT = open(args.output, "w")
        print >> OUT, '# ' + ctime()
        print >> OUT, '# ' + ' '.join(sys.argv)
        print >> OUT, '#' + '\t'.join(HEADER)
    else:
        print '# ' + ctime()
        print '# ' + ' '.join(sys.argv)
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

    prev_tree = None
    ref_fname = os.path.basename(args.reftree)
    for counter, tfile in enumerate(source_trees):
        if args.source_file:
            seedid, tfile = tfile
        else:
            seedid = None

        if args.extract_species:

            if args.sp_regexp:
                SPMATCHER = re.compile(args.sp_regexp)
                get_sp_name = lambda x: re.search(SPMATCHER, x).groups()[0]
            else:
                get_sp_name = lambda x: x

            tt = PhyloTree(tfile, sp_naming_function=get_sp_name)
        else:
            tt = Tree(tfile)

        if args.src_tree_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.src_tree_attr)

        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)

        if args.source_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' % counter

        r = tt.compare(ref_tree,
                       ref_tree_attr=args.ref_tree_attr,
                       source_tree_attr=args.src_tree_attr,
                       min_support_ref=args.min_support_ref,
                       min_support_source=args.min_support_src,
                       unrooted=args.unrooted,
                       has_duplications=args.extract_species)

        print_table([
            map(istr, [
                fname[-30:], ref_fname[-30:], r['effective_tree_size'],
                r['norm_rf'], r['rf'], r['max_rf'], r["source_edges_in_ref"],
                r["ref_edges_in_source"], r['source_subtrees'],
                r['treeko_dist']
            ])
        ],
                    fix_col_width=COL_WIDTHS,
                    wrap_style='cut')

    if args.output:
        OUT.close()
        
Example #7
        if args.ref_tree:
            print "Reading ref tree from", args.ref_tree
            reft = Tree(args.ref_tree, format=1)

        else:
            reft = None
        
        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track)
                
        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees =  t.split_by_dups()
                print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]
Example #8
def main(argv):
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections. 
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().
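    # For instance, a single option combining several of the fields listed above
    # could be declared as follows (hypothetical flag, shown for illustration
    # only; it is not an option of this script):
    #
    #   parser.add_argument("-n", "--niters", dest="niters", metavar="N",
    #                       type=int, default=10, choices=[10, 100, 1000],
    #                       help="number of iterations to run")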
    
    parser.add_argument("--show", dest="show_tree",
                        action="store_true", 
                        help="""Display tree after the analysis.""")
    
    parser.add_argument("--render", dest="render",
                        action="store_true", 
                        help="""Render tree.""")

    parser.add_argument("--dump", dest="dump",
                        action="store_true", 
                        help="""Dump analysis""")

    parser.add_argument("--explore", dest="explore",
                        type=str,
                        help="""Reads a previously analyzed tree and visualize it""")
    
    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument("-t", "--tree", dest="target_tree",  nargs="+",
                        type=str,
                        help="""Tree file in newick format""")

    input_args.add_argument("-tf", dest="tree_list_file",
                        type=str, 
                        help="File with the list of tree files")
    
    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str,
                        help="If taxid is part of the leaf name, delimiter used to split the string")

    parser.add_argument("--sp_field", dest="sp_field", type=int, default=0,
                        help="field position for taxid after splitting leaf names")
    
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only", dest="rf_only",
                        action = "store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument("--outgroup", dest="outgroup",
                        type=str, nargs="+",
                        help="A list of node names defining the trees outgroup")
    
    parser.add_argument("--is_sptree", dest="is_sptree",
                        action = "store_true",
                        help="Assumes no duplication nodes in the tree")
    
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str,
                        help="")
    
    parser.add_argument("--tax2track", dest="tax2track", type=str,
                        help="")
    
    parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true",
                        help="")
    
    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None
    
    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >>sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout 
        ts.mode = "r"
        t.show(tree_style=ts)
        print >>sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()
    
    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >>sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    print >>OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >>sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

        
    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)
        
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
       
        
        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track)
            
        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees =  t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

          
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE)
            
            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf = 0, 0, 0, 0, 0
            broken_clades, broken_sizes = set(), []  # empty placeholders so the reporting code below still works
            
        ndups = 0
        nsubtrees = len(subtrees)
           
        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf
            
        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0 # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" %ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes))
        iter_values = [os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                       rf_std, rf_max, common_names] 
        print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) 
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems =  sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems), "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" %fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout 
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1
                    
                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))
            
    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER)
            
    if args.output:
        OUT.close()
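
This example depends on several helpers defined elsewhere in the original module (annotate_tree_with_taxa, analyze_subtrees, tree_compare, ncbi_layout, print_table, name2color, color, ...). As one illustration, a minimal, hypothetical version of the color helper used above to highlight fixed and newly broken clades in the terminal; the real implementation may differ:

# assumed ANSI escape codes for the two colors used above
_ANSI_CODES = {"green": "\033[32m", "red": "\033[31m"}

def color(text, color_name):
    # wrap text in an ANSI color escape; fall back to plain text for unknown names
    code = _ANSI_CODES.get(color_name)
    return "%s%s\033[0m" % (code, text) if code else text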
Example #9
      seqA   MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA
      seqB   MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA
      seqC   MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA
      seqD   MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ---
             LTNVSHQFMA LTNVSH
             LTNVSH---- ------
             LTNVSH---- ------
             -------FMA LTNVSH
"""
# Load a tree and link it to an alignment. As usual, 'alignment' can
# be the path to a file or data in text format.
t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta")

# We can now access the sequence of every leaf node
print "These are the nodes and their sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
#seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
#
# The associated alignment can be changed at any time
t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip")
# Let's check that sequences have changed
print "These are the nodes and its re-linked sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAHQ----------FMALTNVSH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAHQFMALTNVSH----------
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAHQFMALTNVSHQFMALTNVSH
Example #10
def main(argv):
    
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",  dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    input_args = parser.add_argument_group('TAXID INPUT OPTIONS')
    
    input_args.add_argument("-i", "--taxid", dest="taxid", nargs="+",  
                        type=int, 
                        help="""taxids (space separated)""")

    input_args.add_argument("-if", "--taxid_file", dest="taxid_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")

    input_args.add_argument("-t", "--reftree", dest="reftree",   
                        type=str, 
                        help="""Read taxids from the provided tree.""")
    
    input_args.add_argument("--reftree_attr", dest="reftree_attr",   
                        type=str, default="name",
                        help="""tree attribute encoding for taxid numbers.""")

    name_input_args = parser.add_argument_group('NAME INPUT OPTIONS')
    
    name_input_args.add_argument("-n", "--name", dest="names", nargs="+",  
                        type=str, 
                        help="""species or taxa names (comma separated)""")

    name_input_args.add_argument("-nf", "--names_file", dest="names_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")
    name_input_args.add_argument("--fuzzy", dest="fuzzy", type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    output_args = parser.add_argument_group('OUTPUT OPTIONS')
    
    output_args.add_argument("-x", "--taxonomy", dest="taxonomy",   
                        type=str, 
                        help=("dump a pruned version of the NCBI taxonomy"
                              " tree containing target species into the specified file"))

    output_args.add_argument("-l", "--list", dest="info_list",   
                        type=str, 
                        help="""dump NCBI taxonmy information for each target species into the specified file. """)

    output_args.add_argument("-a", "--annotated", dest="annotated_tree",   
                        type=str, 
                        help="dump the annotated tree of the input reftree provided with -t into the specified file.")                             
    
    output_args.add_argument("--collapse_subspecies", dest="collapse_subspecies",   
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    output_args.add_argument("--rank_limit", dest="rank_limit",   
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))
    
    output_args.add_argument("--full_lineage", dest="full_lineage",   
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))
        
    args = parser.parse_args(argv)

    taxid_source = args.taxid or args.taxid_file or args.reftree
    name_source = args.names or args.names_file
    if not taxid_source and not name_source:
        parser.error('At least one input source is required')
    if taxid_source and name_source:
        parser.error('taxid and name options are mutually exclusive')
        
    if not args.taxonomy and not args.info_list and not args.annotated_tree:
        parser.error('At least one output option is required')
        
    
    ncbi = NCBITaxa(args.dbfile)
        
    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(map(strip, open(args.names_file, "rU").read().split("\n")))
        
    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))
        
    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations in %s.name_translation.txt ... ")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" %sim
                    
        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(map(str, [score, name, realname.capitalize(), taxid]))
            
    if args.taxid_file:
        all_taxids.extend(map(strip, open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)
        
    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(list(set([getattr(n, args.reftree_attr) for n in reftree.iter_leaves()])))
       
    if all_taxids and args.info_list:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi._translate_merged(all_taxids)
        outfile = args.info_list+".info.txt"
        log.info("Dumping %d taxid translations in %s ..." %(len(all_taxids), outfile))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        OUT = open(outfile, "w")
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print >>OUT, "\t".join(map(str, [merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ]))
            
        OUT.close()    
        for notfound in set(map(str, all_taxids)) - set(str(k) for k in translator.iterkeys()):
            print >>sys.stderr, notfound, "NOT FOUND"
    
    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." %(len(all_taxids), args.taxonomy))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s{%s}" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
            
        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse() if n.rank == "species"
                             if int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" %n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")
                
            
        t.write(format=9, outfile=args.taxonomy+".names.nw")
        t.write(format=8, outfile=args.taxonomy+".allnames.nw")
        t.write(format=9, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"],
                outfile=args.taxonomy+".full_annotation.nw")
        
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile=args.taxonomy+".taxids.nw")
        t.write(format=8, outfile=args.taxonomy+".alltaxids.nw")


    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name = translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi._translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)
            
        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
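
This example leaves its imports, its __DESCRIPTION__ string and its log object to the surrounding module. A hypothetical Python 2 header and entry point that would make it self-contained; the names below are assumptions (the original clearly targets an older ETE release whose NCBI helper exposes get_sp_lineage and translate_to_names), not the script's actual preamble:

import sys
import logging
from string import strip                 # Python 2: strip() is used above as a plain function
from argparse import ArgumentParser
from ete3 import PhyloTree, NCBITaxa     # assumed import; adjust to the ETE version actually used

log = logging.getLogger("ncbi_query")
logging.basicConfig(level=logging.INFO)

__DESCRIPTION__ = ""                     # the original module defines its own description text

if __name__ == "__main__":
    main(sys.argv[1:])

A command line such as: python this_script.py -i 9606 10090 -x ncbi_dump (script name hypothetical) would then satisfy the input/output checks at the top of main.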
Example #11
def main(argv):

    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",
                        dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    input_args = parser.add_argument_group('TAXID INPUT OPTIONS')

    input_args.add_argument("-i",
                            "--taxid",
                            dest="taxid",
                            nargs="+",
                            type=int,
                            help="""taxids (space separated)""")

    input_args.add_argument(
        "-if",
        "--taxid_file",
        dest="taxid_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")

    input_args.add_argument("-t",
                            "--reftree",
                            dest="reftree",
                            type=str,
                            help="""Read taxids from the provided tree.""")

    input_args.add_argument(
        "--reftree_attr",
        dest="reftree_attr",
        type=str,
        default="name",
        help="""tree attribute encoding for taxid numbers.""")

    name_input_args = parser.add_argument_group('NAME INPUT OPTIONS')

    name_input_args.add_argument(
        "-n",
        "--name",
        dest="names",
        nargs="+",
        type=str,
        help="""species or taxa names (comma separated)""")

    name_input_args.add_argument(
        "-nf",
        "--names_file",
        dest="names_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")
    name_input_args.add_argument(
        "--fuzzy",
        dest="fuzzy",
        type=float,
        help=("Tries a fuzzy (and SLOW) search for those"
              " species names that could not be translated"
              " into taxids. A float number must be provided"
              " indicating the minimum string similarity."))

    output_args = parser.add_argument_group('OUTPUT OPTIONS')

    output_args.add_argument(
        "-x",
        "--taxonomy",
        dest="taxonomy",
        type=str,
        help=("dump a pruned version of the NCBI taxonomy"
              " tree containing target species into the specified file"))

    output_args.add_argument(
        "-l",
        "--list",
        dest="info_list",
        type=str,
        help=
        """dump NCBI taxonmy information for each target species into the specified file. """
    )

    output_args.add_argument(
        "-a",
        "--annotated",
        dest="annotated_tree",
        type=str,
        help=
        "dump the annotated tree of the input reftree provided with -t into the specified file."
    )

    output_args.add_argument(
        "--collapse_subspecies",
        dest="collapse_subspecies",
        action="store_true",
        help=("When used, all nodes under the the species rank"
              " are collapsed, so all species and subspecies"
              " are seen as sister nodes"))

    output_args.add_argument(
        "--rank_limit",
        dest="rank_limit",
        type=str,
        help=("When used, all nodes under the provided rank"
              " are discarded"))

    output_args.add_argument(
        "--full_lineage",
        dest="full_lineage",
        action="store_true",
        help=("When used, topology is not pruned to avoid "
              " one-child-nodes, so the complete lineage"
              " track leading from root to tips is kept."))

    args = parser.parse_args(argv)

    taxid_source = args.taxid or args.taxid_file or args.reftree
    name_source = args.names or args.names_file
    if not taxid_source and not name_source:
        parser.error('At least one input source is required')
    if taxid_source and name_source:
        parser.error('taxid and name options are mutually exclusive')

    if not args.taxonomy and not args.info_list and not args.annotated_tree:
        parser.error('At least one output option is required')

    ncbi = NCBITaxa(args.dbfile)

    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(
            map(strip,
                open(args.names_file, "rU").read().split("\n")))

    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))

    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations in %s.name_translation.txt ... ")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(
                map(str,
                    [score, name, realname.capitalize(), taxid]))

    if args.taxid_file:
        all_taxids.extend(
            map(strip,
                open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            list(
                set([
                    getattr(n, args.reftree_attr)
                    for n in reftree.iter_leaves()
                ])))

    if all_taxids and args.info_list:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi._translate_merged(all_taxids)
        outfile = args.info_list + ".info.txt"
        log.info("Dumping %d taxid translations in %s ..." %
                 (len(all_taxids), outfile))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        OUT = open(outfile, "w")
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print >> OUT, "\t".join(
                map(str, [
                    merge_conversion.get(int(taxid), taxid), name,
                    named_lineage, lineage
                ]))

        OUT.close()
        for notfound in set(map(str, all_taxids)) - set(
                str(k) for k in translator.iterkeys()):
            print >> sys.stderr, notfound, "NOT FOUND"

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa in %s.*.nw ..." %
                 (len(all_taxids), args.taxonomy))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [
                n for n in t.traverse() if n.rank == "species"
                if int(n.taxid) in all_taxids
            ]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        t.write(format=9, outfile=args.taxonomy + ".names.nw")
        t.write(format=8, outfile=args.taxonomy + ".allnames.nw")
        t.write(format=9,
                features=[
                    "taxid", "name", "rank", "bgcolor", "sci_name",
                    "collapse_subspecies", "named_lineage"
                ],
                outfile=args.taxonomy + ".full_annotation.nw")

        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile=args.taxonomy + ".taxids.nw")
        t.write(format=8, outfile=args.taxonomy + ".alltaxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi._translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
Example #12
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    parser.add_argument("--show",
                        dest="show_tree",
                        action="store_true",
                        help="""Display tree after the analysis.""")

    parser.add_argument("--render",
                        dest="render",
                        action="store_true",
                        help="""Render tree.""")

    parser.add_argument("--dump",
                        dest="dump",
                        action="store_true",
                        help="""Dump analysis""")

    parser.add_argument(
        "--explore",
        dest="explore",
        type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument("-t",
                            "--tree",
                            dest="target_tree",
                            nargs="+",
                            type=str,
                            help="""Tree file in newick format""")

    input_args.add_argument("-tf",
                            dest="tree_list_file",
                            type=str,
                            help="File with the list of tree files")

    parser.add_argument("--tax",
                        dest="tax_info",
                        type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument(
        "--sp_delimiter",
        dest="sp_delimiter",
        type=str,
        help=
        "If taxid is part of the leaf name, delimiter used to split the string"
    )

    parser.add_argument(
        "--sp_field",
        dest="sp_field",
        type=int,
        default=0,
        help="field position for taxid after splitting leaf names")

    parser.add_argument("--ref",
                        dest="ref_tree",
                        type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only",
                        dest="rf_only",
                        action="store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        type=str,
        nargs="+",
        help="A list of node names defining the trees outgroup")

    parser.add_argument("--is_sptree",
                        dest="is_sptree",
                        action="store_true",
                        help="Assumes no duplication nodes in the tree")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")

    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")

    parser.add_argument("--dump_tax_info",
                        dest="dump_tax_info",
                        action="store_true",
                        help="")

    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None

    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >> sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        print >> sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()

    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >> sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print >> sys.stderr, "Loaded %d taxonomy tracks and %d names" % (
        len(tax2track), len(tax2name))
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Clade sizes", "RF (avg)", "RF (med)",
              "RF (std)", "RF (max)", "Shared tips")
    print >> OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >> sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees = t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(
                t, subtrees, show_tree=SHOW_TREE)

            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            # No NCBI consensus analysis requested: report zeros; broken_clades
            # must stay a set and broken_sizes a list for the code below.
            valid_subtrees, broken_subtrees, ncbi_mistakes = 0, 0, 0
            broken_branches, total_rf = 0, 0
            broken_clades, broken_sizes = set(), []

        ndups = 0
        nsubtrees = len(subtrees)

        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf

        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(
                map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0  # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" % ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(
                        set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append(
                        (partial_rf[0] / float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            # TreeKO distance: size-weighted average of the normalized RF
            # values (see the standalone sketch after this example).
            rf = numpy.sum(avg_rf) / float(sum_size)
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" % (numpy.mean(broken_sizes),
                                               numpy.median(broken_sizes),
                                               numpy.std(broken_sizes))
        iter_values = [
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std,
            rf_max, common_names
        ]
        print >> OUT, '|'.join(
            map(lambda x: str(x).strip().ljust(15), iter_values))
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems),
                                "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" % fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %
                  problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, fixed_string,
            problems_string
        ])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1

                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))

    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes",
              "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)

    if args.output:
        OUT.close()
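
# --- Illustrative sketch (not part of the example above) ---
# The size-weighted average Robinson-Foulds score computed in the loop above
# (the "TreeKO distance" comment) boils down to
#     rf = sum_i (RF_i / maxRF_i) * size_i / sum_i size_i
# This standalone helper reproduces that arithmetic from precomputed
# (RF_i, maxRF_i, size_i) triplets; the input values below are made up.
import numpy

def weighted_avg_rf(partial_results):
    """partial_results: iterable of (rf, max_rf, subtree_size) triplets."""
    normalized = [(rf / float(max_rf)) * size
                  for rf, max_rf, size in partial_results if max_rf]
    total_size = float(sum(size for _, max_rf, size in partial_results
                           if max_rf))
    if not total_size:
        return 0.0, 0.0, 0.0
    # Weighted mean, plus std/median of the per-subtree weighted terms,
    # mirroring rf, rf_std and rf_med in the script above.
    return (numpy.sum(normalized) / total_size,
            numpy.std(normalized),
            numpy.median(normalized))

# Example with hypothetical values:
#   weighted_avg_rf([(2, 10, 5), (0, 8, 4), (4, 12, 6)])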