def main():
    """Prune a tree so it only holds taxa present in a Phyml dataset.

    Command-line entry point. Loads the tree and the Phyml named on the
    command line, collects every tree taxon that the Phyml does not list,
    hands that list to ``stk._sub_taxa_in_tree`` and writes the resulting
    tree to ``new_file`` with ``writeNexus``. The taxa left in the tree are
    then printed in sorted order, one per line.
    """
    cli = argparse.ArgumentParser(
        prog="Remove taxa taxonomy",
        description=
        """Removes taxa from a taxonomy (or indeed any) tree that aren't in a dataset""",
    )
    cli.add_argument('-v', '--verbose', action='store_true', default=False,
                     help="Verbose output: mainly progress reports.")
    # The three positionals all have the same shape, so declare them together.
    for dest, text in (('input_file', "Your input Phyml"),
                       ('input_tree', "Your input tree files"),
                       ('new_file', "The new tree file")):
        cli.add_argument(dest, metavar=dest, nargs=1, help=text)

    options = cli.parse_args()
    phyml_path = options.input_file[0]
    tree_path = options.input_tree[0]
    out_path = options.new_file[0]

    # The tree to prune and the taxa it currently contains.
    newick = stk.import_tree(tree_path)
    present = stk._getTaxaFromNewick(newick)

    # Taxa known to the dataset.
    dataset_taxa = stk.get_all_taxa(stk.load_phyml(phyml_path))

    # Anything in the tree that the dataset does not mention has to go.
    unwanted = [name for name in present if name not in dataset_taxa]
    pruned = stk._sub_taxa_in_tree(newick, unwanted)

    stk._parse_tree(pruned).writeNexus(fName=out_path)

    # Report what survived.
    remaining = stk._getTaxaFromNewick(pruned)
    remaining.sort()
    for name in remaining:
        print(name)
def main():
    """Write an iTOL colour-strip dataset that colours taxa by taxonomic group.

    Command-line entry point. Taxa come from a Phyml (``.phyml`` extension),
    from a plain text list with one taxon per line or, with ``--tree``, from
    a tree file whose taxon order is kept. The taxonomy CSV is read with its
    first row as the header; the column whose name matches ``--level``
    (case-insensitively) gives each taxon's group and the first column gives
    the taxon name. Every distinct group gets its own colour from
    ``get_colours``. Taxa that are missing from the taxonomy, or whose group
    is "-", are written in black.

    Exits with status 1 if the requested level is not a column of the
    taxonomy CSV.
    """
    parser = argparse.ArgumentParser(
        prog="Create colours for iTOL",
        description=
        "Generate a asthetically pleasing colour scheme for iToL based" +
        " on a Phyml and a taxonomy csv file",
    )
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help="Verbose output: mainly progress reports.",
                        default=False)
    parser.add_argument('--scheme',
                        choices=['pastel', 'bright', 'dark', 'faded'],
                        default='pastel',
                        help="Choose a colour scheme")
    parser.add_argument(
        '--level',
        choices=[
            'Genus', 'Subfamily', 'Family', 'Superfamily', 'Infraorder',
            'Suborder', 'Order'
        ],
        default='Family',
        help=
        "Which taxonomic level to colour at. Note that not all will return data. Family and Order will always work."
    )
    parser.add_argument(
        '--tree',
        help=
        "Give a tree to colour and the colour will go around the tree, rather than be sorted alphabetically",
        action='store_true',
        default=False,
    )
    parser.add_argument(
        'input_file',
        metavar='input_file',
        nargs=1,
        help=
        "Your Phyml *or* a taxa lis *or* a tree file (use --tree in this case)t"
    )
    parser.add_argument('input_taxonomy',
                        metavar='input_taxonomy',
                        nargs=1,
                        help="Your taxonomy")
    parser.add_argument('output_file',
                        metavar='output_file',
                        nargs=1,
                        help="The output file for iToL")
    args = parser.parse_args()
    level = args.level
    input_file = args.input_file[0]
    input_taxonomy = args.input_taxonomy[0]
    output_file = args.output_file[0]

    # (saturation, value) handed to get_colours for each scheme; argparse's
    # ``choices`` guarantees the key is present.
    scheme_settings = {
        'pastel': (0.5, 0.95),
        'bright': (0.99, 0.99),
        'dark': (0.6, 0.8),
        'faded': (0.25, 0.8),
    }
    saturation, value = scheme_settings[args.scheme]

    if args.tree:
        tree_data = stk.import_tree(input_file)
        # rather than simply grabbing taxa, just go through in "tree order":
        # strip the Newick punctuation and split what is left on commas
        for symbol in "();":
            tree_data = tree_data.replace(symbol, "")
        taxa = [name.strip() for name in tree_data.split(",")]
    else:
        # grab taxa in dataset - ignore if tree
        fileExtension = os.path.splitext(input_file)[1]
        if fileExtension == '.phyml':
            print("Parsing PHYML")
            XML = stk.load_phyml(input_file)
            taxa = stk.get_all_taxa(XML)
        else:
            # Plain taxa list. Blank lines are skipped so they do not turn
            # into empty taxon names in the iTOL file.
            with open(input_file, "r") as f:
                taxa = [line.strip() for line in f if line.strip()]

    print(len(taxa))

    # taxon name (spaces replaced by underscores) -> group at ``level``
    taxonomy = {}
    with open(input_taxonomy, 'r') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        # An empty file has no header and simply yields an empty taxonomy.
        if header is not None:
            wanted = level.lower()
            columns = [column.lower() for column in header]
            if wanted not in columns:
                print("Error finding the desired level in your taxonomy file.")
                print("You asked for: " + wanted)
                print("Your taxonomy contains: " + " ".join(header))
                # Non-zero status so calling scripts can see the failure.
                sys.exit(1)
            index = columns.index(wanted)
            for row in reader:
                # Blank or truncated rows carry no value for this level.
                if len(row) <= index:
                    continue
                # the replace is to make sure we're consistent across
                # PHYML, tree and taxonomy
                taxonomy[row[0].replace(" ", "_")] = row[index]

    print(len(taxonomy))
    values = _uniquify(taxonomy.values())
    colours = get_colours(len(values),
                          format="HEX",
                          saturation=saturation,
                          value=value)
    output_colours = dict(
        (group, colours[i]) for i, group in enumerate(values))

    with open(output_file, "w") as f:
        # write header
        f.write("""DATASET_COLORSTRIP
#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL,""")
        f.write(level + "\n")
        f.write("""
#dataset color (can be changed later)
COLOR,#000000

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#
COLOR_BRANCHES,1

#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA

""")
        for t in taxa:
            group = taxonomy.get(t.replace(" ", "_"))
            if group is None or group == "-":
                # unknown taxon or no group at this level: plain black
                f.write(t + ",#000000\n")
            else:
                f.write(t + ",#" + output_colours[group] + "," + group +
                        "\n")
def main():
    """Plot which taxa occur in which source trees as a scatter matrix.

    Command-line entry point. Every (taxon, tree) pair where the taxon is in
    the tree becomes one point: taxa run along the x axis and trees along
    the y axis, ordered so that trees containing the most taxa get the
    smallest y values. With ``--taxonomy`` the taxa are ordered by their
    group at ``--level``; taxa whose group is blank are left out and taxa
    that cannot be found in the taxonomy are grouped under the placeholder
    'zzzzz'. The figure is saved to ``output_file``.
    """
    parser = argparse.ArgumentParser(
        prog="plot tree-taxa matrix",
        description="""Plot a matrix of trees against taxa""",
    )
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help="Verbose output: mainly progress reports.",
                        default=False)
    parser.add_argument(
        '-t',
        '--taxonomy',
        help=
        "Use taxonomy to sort the taxa on the axis. Supply a STK taxonomy file",
    )
    parser.add_argument(
        '--level',
        choices=['family', 'superfamily', 'infraorder', 'suborder', 'order'],
        default='family',
        help="""What level to group the taxonomy at. Default is family. 
                    Note data for a particular levelmay be missing in taxonomy."""
    )
    parser.add_argument('input_file',
                        metavar='input_file',
                        nargs=1,
                        help="Your pyhml")
    parser.add_argument('output_file',
                        metavar='output_file',
                        nargs=1,
                        help="The output graphics. .png, .pdf, or .svg")

    args = parser.parse_args()
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    taxonomy = args.taxonomy
    level = args.level

    XML = stk.load_phyml(input_file)
    if taxonomy is not None:
        taxonomy = stk.load_taxonomy(taxonomy)

    all_taxa = stk.get_all_taxa(XML)

    # taxon -> indices of the trees it appears in; keyed on the full taxon
    # list so trees may mention taxa that the taxonomy filter drops below
    taxa_tree_matrix = dict((t, []) for t in all_taxa)

    if taxonomy is not None:
        tax_data = {}
        for t in all_taxa:
            taxon = t.replace("_", " ")
            try:
                if taxonomy[taxon][level] == "":
                    # skip this
                    continue
                tax_data[t] = taxonomy[taxon][level]
            except KeyError:
                print("Couldn't find " + t +
                      " in taxonomy. Adding as null data")
                tax_data[t] = 'zzzzz'  # it's at the end...

        unique = sorted(set(tax_data.values()))
        print("Groups are:")
        print(unique)

        # Bucket the taxa by group in one pass, then lay the buckets out in
        # sorted group order.
        members = {}
        for t in tax_data:
            members.setdefault(tax_data[t], []).append(t)

        new_all_taxa = []
        counts = []
        for u in unique:
            new_all_taxa.extend(members[u])
            counts.append(len(members[u]))
        all_taxa = new_all_taxa

        # cumulate counts; a running total also copes with there being no
        # groups at all, where indexing counts[0] would fail
        count_cumulate = []
        running = 0
        for c in counts:
            running += c
            count_cumulate.append(running)

        print(count_cumulate)

    trees = stk.obtain_trees(XML)
    for i, t in enumerate(trees):
        for taxon in stk.get_taxa_from_tree(XML, t, sort=True):
            taxa_tree_matrix[taxon.replace(" ", "_")].append(i)

    # One entry per (plotted taxon, tree) membership, used to count how many
    # plotted taxa each tree contains.
    y = []
    for taxon in all_taxa:
        y.extend(sorted(set(taxa_tree_matrix[taxon])))

    tree_count = Counter(y)
    tree_dict = dict(tree_count)
    # trees with the most taxa first
    tree_order = sorted(tree_dict.items(), key=lambda x: x[1], reverse=True)

    new_x = []
    new_y = []
    for column, taxon in enumerate(all_taxa):
        member_of = set(taxa_tree_matrix[taxon])
        for row, (tree_index, _) in enumerate(tree_order):
            if tree_index in member_of:
                new_x.append(column)
                new_y.append(row)

    fig = figure(figsize=(22, 17), dpi=90)
    fig.subplots_adjust(left=0.3)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(new_x, new_y, 50, marker='o', c='k', lw=0)
    ax.set_xlim(0, len(all_taxa))
    ax.set_ylim(0, len(trees))
    xlabel('Taxa')
    ylabel('Tree Number')
    savefig(output_file, dpi=90)
def main():
    """Remove poorly constrained taxa from a supertree or from a Phyml.

    Command-line entry point. For every candidate taxon the source trees in
    the Phyml are scanned, and each tree containing the taxon counts as a
    polytomy placement when ``stk._get_all_siblings`` returns more than
    three entries for it, otherwise as a resolved placement. A taxon is
    removed when all of its placements are polytomies (this includes a
    taxon found in no source tree) or, unless ``--poly_only`` is given, when
    exactly one placement is resolved.

    If ``input_tree`` names a tree, the candidates are that tree's taxa and
    the pruned tree is written to ``output_tree`` as Nexus. If it is 'NULL'
    or 'None', the candidates are all taxa of the Phyml and the result of
    ``stk.substitute_taxa`` is written instead. ``--delete_list``
    additionally writes the sorted names of the removed taxa, one per line.
    """
    parser = argparse.ArgumentParser(
         prog="remove poorly contrained taxa",
         description="""Remove taxa that appea in one source tree only.""",
         )
    parser.add_argument(
            '-v', 
            '--verbose', 
            action='store_true', 
            help="Verbose output: mainly progress reports.",
            default=False
            )
    parser.add_argument(
            '--delete_list', 
            help="Produce a deleted taxa list. Give filename."
            )
    parser.add_argument(
            '--poly_only', 
            default=False,
            action='store_true',
            help="Restrict removal of taxa that are in polytomies only in source trees. Default"+
                 " to removal those in polytomies *and* only in one other tree."
            )
    parser.add_argument(
            '--tree_only', 
            default=False,
            action='store_true',
            help="Restrict removal of taxa that only occur in one source tree. Default"+
                 " to removal those in polytomies *and* only in one other tree."
            )
    parser.add_argument(
            'input_phyml', 
            metavar='input_phyml',
            nargs=1,
            help="Your input phyml"
            )
    parser.add_argument(
            'input_tree', 
            metavar='input_tree',
            nargs=1,
            help="Your tree - can be NULL or None"
            )
    parser.add_argument(
            'output_tree', 
            metavar='output_tree',
            nargs=1,
            help="Your output tree or phyml - if input_tree is none, this is the Phyml"
            )

    args = parser.parse_args()
    # NOTE(review): --tree_only is parsed but args.tree_only is never read
    # below, so the flag currently has no effect on which taxa are removed.
    delete_list_file = args.delete_list
    poly_only = args.poly_only
    input_tree = args.input_tree[0]
    if input_tree in ('NULL', 'None'):
        input_tree = None
    output_file = args.output_tree[0]
    input_phyml = args.input_phyml[0]

    XML = stk.load_phyml(input_phyml)
    # load tree
    if input_tree is not None:
        supertree = stk.import_tree(input_tree)
        taxa = stk._getTaxaFromNewick(supertree)
    else:
        supertree = None
        taxa = stk.get_all_taxa(XML)

    # Gather every source tree string and its taxa once. None of this
    # depends on the taxon being examined, so it is kept out of the
    # per-taxon loop below.
    xml_root = stk._parse_xml(XML)
    find = etree.XPath("//source")
    source_trees = []
    for s in find(xml_root):
        for tr in s.xpath("source_tree/tree/tree_string"):
            newick = tr.xpath("string_value")[0].text
            source_trees.append(
                (newick, set(stk._getTaxaFromNewick(newick))))

    delete_list = []

    # loop over the candidate taxa and get some stats
    for t in taxa:
        nTrees = 0
        nPoly = 0

        for newick, tree_taxa in source_trees:
            if t not in tree_taxa:
                continue
            nTrees += 1
            tree_obj = stk._parse_tree(newick, fixDuplicateTaxa=True)
            siblings = stk._get_all_siblings(tree_obj.node(t))

            # check where it occurs - polytomies only?
            if len(siblings) > 3:  # 2?
                nPoly += 1

        # every tree containing the taxon is either a polytomy placement
        # or a resolved one
        nResolved = nTrees - nPoly

        # decide whether to delete this taxon
        if nPoly == nTrees:
            # all in polytomies
            delete_list.append(t)
        elif not poly_only and nResolved == 1:
            # only 1 resolved and rest (if any) polytomies
            delete_list.append(t)

    print("Taxa: " + str(len(taxa)))
    print("Deleting: " + str(len(delete_list)))

    if supertree is not None:
        # done, so delete the problem taxa from the supertree
        for t in delete_list:
            supertree = stk._sub_taxa_in_tree(supertree, t)
        output = stk._amalgamate_trees({'Tree_1': supertree},
                                       format='nexus')
    else:
        output = stk.substitute_taxa(XML, delete_list)

    # write the pruned supertree or Phyml
    with open(output_file, "w") as f:
        f.write(output)

    if delete_list_file is not None:
        # write the list of removed taxa
        delete_list.sort()
        with open(delete_list_file, "w") as f:
            f.write('\n'.join(delete_list))
# Example #5
# 0
def main():
    """Plot which characters are available for which taxa as a scatter matrix.

    Command-line entry point. For every source tree in the Phyml, the
    characters of that tree are credited to each taxon the tree contains.
    One point is drawn per (taxon, character) pair, with taxa on the x axis
    and characters on the y axis. With ``--taxonomy`` the taxa are ordered
    by their group at ``--level``; taxa whose group is blank are left out
    and taxa that cannot be found in the taxonomy are grouped under the
    placeholder 'zzzzz'. A few long character names are shortened for the
    axis labels. The figure is saved to ``output_file``.
    """
    parser = argparse.ArgumentParser(
         prog="plot chracter taxa matrix",
         description="""Plot a matrix of character availability against taxa""",
         )
    parser.add_argument(
            '-v', 
            '--verbose', 
            action='store_true', 
            help="Verbose output: mainly progress reports.",
            default=False
            )
    parser.add_argument(
            '-t', 
            '--taxonomy', 
            help="Use taxonomy to sort the taxa on the axis. Supply a STK taxonomy file",
            )
    parser.add_argument(
            '--level',
            choices=['family','superfamily','infraorder','suborder','order'],
            default='family',
            help="""What level to group the taxonomy at. Default is family. 
                    Note data for a particular levelmay be missing in taxonomy."""
            )
    parser.add_argument(
            'input_file', 
            metavar='input_file',
            nargs=1,
            help="Your pyhml"
            )
    parser.add_argument(
            'output_file', 
            metavar='output_file',
            nargs=1,
            help="The output graphics. .png, .pdf, or .svg"
            )

    args = parser.parse_args()
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    taxonomy = args.taxonomy
    level = args.level

    XML = stk.load_phyml(input_file)
    if taxonomy is not None:
        taxonomy = stk.load_taxonomy(taxonomy)

    all_taxa = stk.get_all_taxa(XML)
    all_chars_d = stk.get_all_characters(XML)
    # flatten the per-key character lists into one list for the y axis
    all_chars = []
    for c in all_chars_d:
        all_chars.extend(all_chars_d[c])

    if taxonomy is not None:
        tax_data = {}
        for t in all_taxa:
            taxon = t.replace("_", " ")
            try:
                if taxonomy[taxon][level] == "":
                    # skip this
                    continue
                tax_data[t] = taxonomy[taxon][level]
            except KeyError:
                print("Couldn't find " + t +
                      " in taxonomy. Adding as null data")
                tax_data[t] = 'zzzzz'  # it's at the end...

        unique = sorted(set(tax_data.values()))
        print("Groups are:")
        print(unique)

        # Bucket the taxa by group in one pass, then lay the buckets out in
        # sorted group order.
        members = {}
        for t in tax_data:
            members.setdefault(tax_data[t], []).append(t)

        new_all_taxa = []
        counts = []
        for u in unique:
            new_all_taxa.extend(members[u])
            counts.append(len(members[u]))
        all_taxa = new_all_taxa

        # cumulate counts; a running total also copes with there being no
        # groups at all, where indexing counts[0] would fail
        count_cumulate = []
        running = 0
        for c in counts:
            running += c
            count_cumulate.append(running)

        print(count_cumulate)

    # taxon -> set of characters seen in any tree containing that taxon
    taxa_character_matrix = dict((t, set()) for t in all_taxa)

    trees = stk.obtain_trees(XML)
    for t in trees:
        chars = stk.get_characters_from_tree(XML, t, sort=True)
        for taxon in stk.get_taxa_from_tree(XML, t, sort=True):
            taxon = taxon.replace(" ", "_")
            # taxa dropped by the taxonomy filter have no entry
            if taxon in taxa_character_matrix:
                taxa_character_matrix[taxon].update(chars)

    # create a map
    x = []
    y = []
    for i, taxon in enumerate(all_taxa):
        available = taxa_character_matrix[taxon]
        for j, character in enumerate(all_chars):
            if character in available:
                x.append(i)
                y.append(j)

    # do a substitution of character names to tidy things up: a name that
    # starts (case-insensitively) with one of these prefixes is replaced by
    # the short label next to it
    label_prefixes = (
        ('mitochondrial carrier; adenine nucleotide translocator', "ANT"),
        ('mitochondrially encoded 12s', '12S'),
        ('complete mitochondrial genome', 'Mitogenome'),
        ('mtdna', "mtDNA restriction sites"),
        ('h3 histone', 'H3'),
        ('mitochondrially encoded cytochrome', 'COI'),
        ('rna, 28s', '28S'),
        ('rna, 18s', '18S'),
        ('mitochondrially encoded 16s', '16S'),
    )
    for i, name in enumerate(all_chars):
        lowered = name.lower()
        for prefix, label in label_prefixes:
            if lowered.startswith(prefix):
                all_chars[i] = label
                break

    fig = figure(figsize=(22, 17), dpi=90)
    fig.subplots_adjust(left=0.3)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x, y, 50, marker='o', c='r', lw=0)
    yticks(range(0, len(all_chars)), all_chars)
    ax.set_xlim(0, len(all_taxa))
    ax.set_ylim(0, len(all_chars))
    xlabel('Taxa')
    ylabel('Characters')
    savefig(output_file, dpi=90)
# Example #6
# 0
def main():
    """Build a CSV taxonomy for a set of taxa using the EOL and ITIS web APIs.

    Command-line entry point. Taxa come from a Phyml (``.phyml`` extension)
    or from a plain text list with one taxon per line. For each taxon the
    first EOL search hit is taken, one of its taxon concepts is chosen (the
    first one, or the first whose provider name contains ``--pref_db``) and
    the ranks of that hierarchy entry's ancestors are recorded together with
    the provider name. Each distinct genus found this way is then looked up
    in ITIS, and ranks not listed in ``current_taxonomy_levels`` are merged
    into every taxon of that genus. The result is written with
    ``stk.save_taxonomy``.
    """
    # Imported here because only this entry point needs it.
    from contextlib import closing

    parser = argparse.ArgumentParser(
        prog="Create a taxonomy",
        description=
        "Generate a taxonomy from Phyml. Fills in most taxonomic levels. Uses EOL and ITIS",
    )
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help="Verbose output: mainly progress reports.",
                        default=False)
    parser.add_argument(
        '--pref_db',
        help="Preferred database. Need to be able to list avialable databases?"
    )
    parser.add_argument('input_file',
                        metavar='input_file',
                        nargs=1,
                        help="Your input taxa list or phyml")
    parser.add_argument('output_file',
                        metavar='output_file',
                        nargs=1,
                        help="The output file. A CSV-based taxonomy")

    args = parser.parse_args()
    verbose = args.verbose
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    pref_db = args.pref_db

    def open_url(url):
        """Open url and return the response object; the caller closes it."""
        return urllib2.build_opener().open(urllib2.Request(url))

    # grab taxa in dataset
    fileExtension = os.path.splitext(input_file)[1]
    if fileExtension == '.phyml':
        XML = stk.load_phyml(input_file)
        taxa = stk.get_all_taxa(XML)
    else:
        # Plain taxa list. Blank lines are skipped so no lookup is made for
        # an empty name.
        with open(input_file, "r") as f:
            taxa = [line.strip() for line in f if line.strip()]

    taxonomy = {}

    for taxon in taxa:
        taxon = taxon.replace("_", " ")
        if verbose:
            # same output as the Python 2 statement: print "Looking up ", taxon
            print("Looking up  %s" % (taxon,))
        # get the data from EOL on taxon
        # What about synonyms?
        taxonq = quote_plus(taxon)
        URL = "http://eol.org/api/search/1.0.json?q=" + taxonq
        with closing(open_url(URL)) as f:
            data = json.load(f)
        # check if there's some data
        if len(data['results']) == 0:
            taxonomy[taxon] = {}
            continue
        ID = str(data['results'][0]['id'])  # take first hit

        # Now look for taxonomies
        URL = "http://eol.org/api/pages/1.0/" + ID + ".json"
        with closing(open_url(URL)) as f:
            data = json.load(f)
        concepts = data['taxonConcepts']
        if len(concepts) == 0:
            taxonomy[taxon] = {}
            continue
        TID = str(concepts[0]['identifier'])  # take first hit
        currentdb = str(concepts[0]['nameAccordingTo'])
        # Switch to the preferred provider if one was asked for and is
        # present. The provider name is only replaced together with the
        # identifier, so the two always describe the same concept even when
        # nothing matches.
        if pref_db is not None:
            wanted = pref_db.lower()
            for db in concepts:
                provider = db['nameAccordingTo'].lower()
                if wanted in provider:
                    TID = str(db['identifier'])
                    currentdb = provider
                    break

        # now get taxonomy
        URL = "http://eol.org/api/hierarchy_entries/1.0/" + TID + ".json"
        with closing(open_url(URL)) as f:
            data = json.load(f)
        this_taxonomy = {}
        this_taxonomy['provider'] = currentdb
        for a in data['ancestors']:
            try:
                this_taxonomy[a['taxonRank']] = a['scientificName']
            except KeyError:
                continue
        try:
            if not data['taxonRank'].lower() == 'species':
                # higher taxa, add it in to the taxonomy!
                this_taxonomy[data['taxonRank'].lower()] = taxon
        except KeyError:
            # NOTE(review): an entry without 'taxonRank' is left out of the
            # taxonomy altogether, ancestors included - confirm this is
            # intended rather than keeping what was gathered.
            continue
        taxonomy[taxon] = this_taxonomy

    if verbose:
        print("Done basic taxonomy, getting more info from ITIS")

    # fill in the rest of the taxonomy
    # get all genera
    genera = [
        taxonomy[t]['genus'] for t in taxonomy if 'genus' in taxonomy[t]
    ]

    genera = _uniquify(genera)
    for g in genera:
        if verbose:
            # same output as the Python 2 statement: print "Looking up ", g
            print("Looking up  %s" % (g,))
        try:
            URL = "http://www.itis.gov/ITISWebService/jsonservice/searchByScientificName?srchKey=" + quote_plus(
                g.strip())
        except Exception:
            # best effort: a genus name that cannot be turned into a query
            # is skipped (KeyboardInterrupt/SystemExit still propagate)
            continue
        with closing(open_url(URL)) as f:
            string = unicode(f.read(), "ISO-8859-1")
        data = json.loads(string)
        names = data['scientificNames']
        # no usable hit: empty list, or a list holding a single null
        if not names or names[0] is None:
            continue
        tsn = names[0]["tsn"]
        URL = "http://www.itis.gov/ITISWebService/jsonservice/getFullHierarchyFromTSN?tsn=" + str(
            tsn)
        with closing(open_url(URL)) as f:
            try:
                string = unicode(f.read(), "ISO-8859-1")
            except Exception:
                # best effort: skip this genus if the body cannot be read
                continue
        data = json.loads(string)
        this_taxonomy = {}
        for entry in data['hierarchyList']:
            rank = entry['rankName'].lower()
            if rank not in current_taxonomy_levels:
                this_taxonomy[rank] = entry['taxonName']

        # merge the extra ranks into every taxon of this genus
        for t in taxonomy:
            if taxonomy[t].get('genus') == g:
                taxonomy[t].update(this_taxonomy)

    stk.save_taxonomy(taxonomy, output_file)