def _pruneTreesToComplement(file1, file2):
    """Prune two trees down to the taxa they share and return them as Newick.

    ``p4.var.trees`` is reset, *file1* and *file2* are read with ``p4.read``
    and the first two entries of ``p4.var.trees`` are used. Leaves that occur
    in only one of the two trees are removed from that tree.

    :param file1: first input handed to ``p4.read``.
    :param file2: second input handed to ``p4.read``.
    :returns: ``(newick1, newick2, numberOfSharedTaxa)`` when the trees share
        more than three taxa; otherwise the sentinel
        ``('nope', 'nope', 'nope')``.
    :raises TCTError: if the two trees end up with a different number of
        leaves after pruning.
    """
    p4.var.trees = []

    p4.read(file1)
    p4.read(file2)
    tree1 = p4.var.trees[0]
    tree2 = p4.var.trees[1]

    # Sets give constant-time membership tests and direct set algebra.
    tree1_taxa = set(n.name for n in tree1.nodes if n.isLeaf)
    tree2_taxa = set(n.name for n in tree2.nodes if n.isLeaf)
    numberOfSharedTaxa = len(tree1_taxa & tree2_taxa)

    # Callers compare the first element against 'nope', so keep the sentinel.
    if numberOfSharedTaxa <= 3:
        return ('nope', 'nope', 'nope')

    # Delete the missing taxa from each tree so that the leaf sets intersect.
    for taxon in tree1_taxa - tree2_taxa:
        tree1.removeNode(taxon, alsoRemoveBiRoot=False)
    for taxon in tree2_taxa - tree1_taxa:
        tree2.removeNode(taxon, alsoRemoveBiRoot=False)

    leaves_left_1 = sum(1 for n in tree1.nodes if n.isLeaf)
    leaves_left_2 = sum(1 for n in tree2.nodes if n.isLeaf)
    if leaves_left_1 != leaves_left_2:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise TCTError('Something unexpected went wrong with the taxon '
                       'pruning. After pruning trees have different number of taxa.')

    # Only the first line of each Newick dump is returned.
    t1 = tree1.writeNewick(toString=True).split('\n')[0]
    t2 = tree2.writeNewick(toString=True).split('\n')[0]
    return (t1, t2, numberOfSharedTaxa)
Example #2
0
    def __init__(self, inal=None):
        """Wrap an alignment so that phylogenetic inference can be run on it.

        :param inal: An alignment file path or string handed to ``p4.read``
            (most formats are accepted). When omitted, a new
            ``p4.Alignment()`` is used instead.

        """
        # The alignment itself comes from the p4 phylogenetic library.
        if inal is not None:
            # After reading, use the last entry of p4.var.alignments.
            p4.read(inal)
            self.data = p4.var.alignments[-1]
        else:
            self.data = p4.Alignment()

        # Attach a discrete state model built around this wrapper.
        self.model = model.DiscreteStateModel(self)

        # Registry of temporary file paths.
        self.paths = {}
Example #3
0
    def __init__(self, inal=None):
        """Create a wrapper around an alignment for phylogenetic inference.

        :param inal: Path (or string) of an alignment to load through
            ``p4.read``; most formats are accepted. If left as ``None`` the
            wrapper starts from a fresh ``p4.Alignment()``.

        """
        # Obtain the alignment object from the p4 phylogenetic library.
        if inal is None:
            alignment = p4.Alignment()
        else:
            p4.read(inal)  # Read the alignment file/string.
            alignment = p4.var.alignments[-1]
        self.data = alignment

        # Augment the alignment with a discrete state model.
        self.model = model.DiscreteStateModel(self)

        # Temporary files are tracked here.
        self.paths = {}
def setuploop(file_1, file_2, outfile):
    """Compare every tree from *file_1* against every tree from *file_2*.

    Both files are read into ``p4.var.trees`` (which is reset first). For
    each cross-file pair, deep copies of the two trees are passed to
    ``_pruneTreesToComplement``. Unless that returns the ``'nope'`` sentinel,
    the symmetric difference of the two Newick strings is computed with
    dendropy and one tab-separated line
    ``file_1, file_2, symmetric difference, shared taxa`` is appended to
    *outfile*.

    :param file_1: first tree file handed to ``p4.read``.
    :param file_2: second tree file handed to ``p4.read``.
    :param outfile: path of the file that result lines are appended to.
    """
    p4.var.trees = []
    p4.read(file_1)
    first_total = len(p4.var.trees)
    p4.read(file_2)
    combined_total = len(p4.var.trees)

    for left in range(first_total):
        for right in range(first_total, combined_total):
            left_copy = copy.deepcopy(p4.var.trees[left])
            right_copy = copy.deepcopy(p4.var.trees[right])

            newick_a, newick_b, shared = _pruneTreesToComplement(
                left_copy, right_copy)
            if newick_a == 'nope':
                # Sentinel from the pruning step: nothing to compare.
                continue

            dendro_a = dendropy.Tree.get_from_string(newick_a, 'newick')
            dendro_b = dendropy.Tree.get_from_string(newick_b, 'newick')
            # Raw symmetric difference; not normalised by node count.
            distance = dendro_a.symmetric_difference(dendro_b)

            record = '\t'.join(
                (file_1, file_2, str(distance), str(shared))) + '\n'
            with open(outfile, 'a') as sink:
                sink.write(record)
def treeFinderMAPAnalysis(alignment, groups,
                          gamma=True, invariant=True, bootstrap=False,
                          nreplicates=100,
                          remove_files=False, run_analysis=True, verbose=False):
    """
    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein data from
      which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *invariant*: include a proportion of invariant sites
    - *run_analysis*: run the analysis (``tf`` must be in $PATH, otherwise a
      P4Error is raised); if False, only print the TreeFinder commands
    - *remove_files*: remove analysis files. Only available if run_analysis=True
    - *verbose*: report progress, draw the resulting tree and print its
      likelihood

    Returns ``(result_tree, rd)`` where *rd* is a dict of the values parsed
    from the TreeFinder report, or ``(None, None)`` when *run_analysis* is
    False.

    Raises P4Error on invalid arguments, when ``tf`` cannot be found, or when
    TreeFinder exits with a non-zero return code.

    """

    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)

    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)

    # Report the offending argument by name; formatting the value itself is
    # uninformative and fails outright when the value is a tuple.
    flags = [("gamma", gamma), ("invariant", invariant),
             ("bootstrap", bootstrap), ("remove_files", remove_files),
             ("run_analysis", run_analysis), ("verbose", verbose)]
    for flag_name, flag_value in flags:
        if not isinstance(flag_value, bool):
            gm.append("%s value must be either True or False" % flag_name)
            raise P4Error(gm)

    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)

    if run_analysis and not func.which2("tf"):
        gm.append("tf (treefinder) is not in your $PATH. "
                  "Cannot run analysis")
        raise P4Error(gm)

    datafile_name = "tf_data.phy"

    #tf commands
    tls = """ReconstructPhylogeny[
             "%(datafile)s",
             SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s,
             WithEdgeSupport->%(bootstrap)s%(nreplicates)s
             ],
             "%(outfile)s",SaveReport"""
    od = {}
    od["datafile"] = datafile_name
    # Rate-heterogeneity suffix of the substitution model.
    if gamma and invariant:
        od["ifH"] = ":GI[Optimum]"
    elif gamma:
        od["ifH"] = ":G[Optimum]"
    elif invariant:
        od["ifH"] = ":I[Optimum]"
    else:
        od["ifH"] = ""
    if bootstrap:
        od["bootstrap"] = "True"
        od["nreplicates"] = ",NReplicates->%i" % nreplicates
    else:
        od["bootstrap"] = "False"
        od["nreplicates"] = ""
    od["outfile"] = "tf_reconstruction.output"
    od["map"] = ",".join(['"%s"' % group.upper() for group in groups])

    if not run_analysis:
        # Nothing is executed or written: just show the commands.
        print(tls % od)
        return (None, None)

    #Write data file
    alignment.writePhylip(datafile_name)

    #Write control file
    tl_file = "tf_control.tl"
    with open(tl_file, "w") as fh:
        fh.write(tls % od)

    # When not verbose, TreeFinder's stderr is discarded; make sure the
    # null-device handle is closed again whatever happens.
    devnull = None if verbose else open(os.devnull, "w")
    try:
        child = subprocess.Popen("tf %s" % tl_file,
                                 stderr=subprocess.STDOUT if verbose else devnull,
                                 shell=True)

        if verbose:
            # No newline, so that "done." ends up on the same line.
            sys.stdout.write("Running TreeFinder, this could take some time... ")
            sys.stdout.flush()

        # Blocks until TreeFinder has exited.
        child.communicate()
    finally:
        if devnull is not None:
            devnull.close()

    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        msg = "TreeFinder returned error code %s"
        gm.append(msg % (child.returncode))
        raise P4Error(gm)

    # The values of interest are all on the second line of the report.
    with open(od["outfile"], "r") as fh:
        line = fh.readlines()[1]

    # Each field runs from just after its "Key->" marker up to the character
    # preceding the next marker (the separator is dropped by the "- 1").
    rd = {}
    #Likelihood
    rd["Likelihood"] = float(line[line.index("Likelihood->")+12:line.index(",")])
    #Tree
    rd["Phylogeny"] = line[line.index("Phylogeny->")+11:
                           line.index("SubstitutionModel->")-1]
    #SubstitutionModel
    rd["SubstitutionModel"] = line[line.index("SubstitutionModel->")+19:
                                   line.index("OSubstitutionModel->")-1]
    #OSubstitutionModel
    rd["OSubstitutionModel"] = line[line.index("OSubstitutionModel->")+20:
                                    line.index("OEdgeOptimizationOff->")-1]
    #NSites
    rd["Nsites"] = int(line[line.index("NSites->")+8:
                            line.index("NParameters->")-1])
    #NParameters
    rd["NParameters"] = int(line[line.index("NParameters->")+13:
                                 line.index("AIC->")-1])
    #AIC
    rd["AIC"] = float(line[line.index("AIC->")+5:line.index("AICc->")-1])
    #AICc
    rd["AICc"] = float(line[line.index("AICc->")+6:line.index("HQ->")-1])
    #HQ
    rd["HQ"] = float(line[line.index("HQ->")+4:line.index("BIC->")-1])
    #BIC
    rd["BIC"] = float(line[line.index("BIC->")+5:line.index("Checksum->")-1])
    #LikelihoodTime
    rd["LikelihoodTime"] = float(line[line.index("LikelihoodTime->")+16:
                                      line.index("LikelihoodMemory->")-1])
    #LikelihoodMemory (last field: the final three characters are trimmed)
    rd["LikelihoodMemory"] = int(line[line.index("LikelihoodMemory->")+18:-3])

    #Make a tree object: turn the reported tree text into a Newick string
    tree = rd["Phylogeny"].replace("{", "(")
    tree = tree.replace("}", ")")
    tree = tree.replace("\"", "")
    tree = tree + ";"
    if bootstrap:
        #Tree viewer has the brlen before bootstrap value plus an extra colon
        # turn "xxx):0.00001:87.999,yyy" into "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        repl = r")\2:\1"
        tree = re.sub(patt, repl, tree)

    # The tree is read from a string, so silence the "no file" warning and
    # always put the previous setting back, even if read() fails.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()
    if bootstrap:
        #Reformat the support values without decimals
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        for leftover in (tl_file, datafile_name, od["outfile"]):
            os.remove(leftover)

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])

    return result_tree, rd
def treeFinderMAPAnalysis(alignment,
                          groups,
                          gamma=True,
                          invariant=True,
                          bootstrap=False,
                          nreplicates=100,
                          remove_files=False,
                          run_analysis=True,
                          verbose=False):
    """
    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein data from
      which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *invariant*: include a proportion of invariant sites
    - *run_analysis*: run the analysis (``tf`` must be in $PATH, otherwise a
      P4Error is raised); if False, only print the TreeFinder commands
    - *remove_files*: remove analysis files. Only available if run_analysis=True
    - *verbose*: report progress, draw the resulting tree and print its
      likelihood

    Returns ``(result_tree, rd)`` where *rd* is a dict of the values parsed
    from the TreeFinder report, or ``(None, None)`` when *run_analysis* is
    False.

    Raises P4Error on invalid arguments, when ``tf`` cannot be found, or when
    TreeFinder exits with a non-zero return code.

    """

    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)

    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)

    # Report the offending argument by name; formatting the value itself is
    # uninformative and fails outright when the value is a tuple.
    flags = {
        "gamma": gamma,
        "invariant": invariant,
        "bootstrap": bootstrap,
        "remove_files": remove_files,
        "run_analysis": run_analysis,
        "verbose": verbose,
    }
    for flag_name in ("gamma", "invariant", "bootstrap", "remove_files",
                      "run_analysis", "verbose"):
        if not isinstance(flags[flag_name], bool):
            gm.append("%s value must be either True or False" % flag_name)
            raise P4Error(gm)

    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)

    if run_analysis and not func.which2("tf"):
        gm.append("tf (treefinder) is not in your $PATH. "
                  "Cannot run analysis")
        raise P4Error(gm)

    datafile_name = "tf_data.phy"

    #tf commands
    tls = """ReconstructPhylogeny[
             "%(datafile)s",
             SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s,
             WithEdgeSupport->%(bootstrap)s%(nreplicates)s
             ],
             "%(outfile)s",SaveReport"""
    od = {}
    od["datafile"] = datafile_name
    # Rate-heterogeneity suffix of the substitution model.
    if gamma and invariant:
        od["ifH"] = ":GI[Optimum]"
    elif gamma:
        od["ifH"] = ":G[Optimum]"
    elif invariant:
        od["ifH"] = ":I[Optimum]"
    else:
        od["ifH"] = ""
    if bootstrap:
        od["bootstrap"] = "True"
        od["nreplicates"] = ",NReplicates->%i" % nreplicates
    else:
        od["bootstrap"] = "False"
        od["nreplicates"] = ""
    od["outfile"] = "tf_reconstruction.output"
    od["map"] = ",".join(['"%s"' % group.upper() for group in groups])

    if not run_analysis:
        # Nothing is executed or written: just show the commands.
        print(tls % od)
        return (None, None)

    #Write data file
    alignment.writePhylip(datafile_name)

    #Write control file
    tl_file = "tf_control.tl"
    with open(tl_file, "w") as fh:
        fh.write(tls % od)

    # Verbose: fold TreeFinder's stderr into stdout. Otherwise discard it
    # without opening (and leaking) a handle on the null device.
    direct = subprocess.STDOUT if verbose else subprocess.DEVNULL
    child = subprocess.Popen("tf %s" % tl_file, stderr=direct, shell=True)

    if verbose:
        print("Running TreeFinder, this could take some time...", end=' ')
        sys.stdout.flush()

    # Blocks until TreeFinder has exited.
    child.communicate()

    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        msg = "TreeFinder returned error code %s"
        gm.append(msg % (child.returncode))
        raise P4Error(gm)

    # The values of interest are all on the second line of the report.
    with open(od["outfile"], "r") as fh:
        line = fh.readlines()[1]

    def field(key, next_key):
        # Text between "key" and the character preceding "next_key"
        # (the separator in front of the next marker is dropped).
        start = line.index(key) + len(key)
        return line[start:line.index(next_key) - 1]

    rd = {}
    #Likelihood (ends at the first comma of the line)
    rd["Likelihood"] = float(line[line.index("Likelihood->") +
                                  12:line.index(",")])
    rd["Phylogeny"] = field("Phylogeny->", "SubstitutionModel->")
    rd["SubstitutionModel"] = field("SubstitutionModel->",
                                    "OSubstitutionModel->")
    rd["OSubstitutionModel"] = field("OSubstitutionModel->",
                                     "OEdgeOptimizationOff->")
    rd["Nsites"] = int(field("NSites->", "NParameters->"))
    rd["NParameters"] = int(field("NParameters->", "AIC->"))
    rd["AIC"] = float(field("AIC->", "AICc->"))
    rd["AICc"] = float(field("AICc->", "HQ->"))
    rd["HQ"] = float(field("HQ->", "BIC->"))
    rd["BIC"] = float(field("BIC->", "Checksum->"))
    rd["LikelihoodTime"] = float(field("LikelihoodTime->",
                                       "LikelihoodMemory->"))
    #LikelihoodMemory (last field: the final three characters are trimmed)
    rd["LikelihoodMemory"] = int(line[line.index("LikelihoodMemory->") +
                                      18:-3])

    #Make a tree object: turn the reported tree text into a Newick string
    tree = rd["Phylogeny"].replace("{", "(")
    tree = tree.replace("}", ")")
    tree = tree.replace("\"", "")
    tree = tree + ";"
    if bootstrap:
        #Tree viewer has the brlen before bootstrap value plus an extra colon
        # turn "xxx):0.00001:87.999,yyy" into "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        repl = r")\2:\1"
        tree = re.sub(patt, repl, tree)

    # The tree is read from a string, so silence the "no file" warning and
    # always put the previous setting back, even if read() fails.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()
    if bootstrap:
        #Reformat the support values without decimals
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        for leftover in (tl_file, datafile_name, od["outfile"]):
            os.remove(leftover)

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])

    return result_tree, rd
# Command-line inputs: argv[1] is the file handed to p4.read below; argv[2] is
# kept in `n`, which is not used in the visible part of this script.
in_file = argv[1]
n = argv[2]

# don't check for empty sequences or sites since
# p4 does not consider those in the test anyways
p4.var.doCheckForAllGapColumns = False

# Banner naming the input file being processed.
print(
'''

========== calculating test stats for {} ========== 

'''.format(in_file)
)

p4.read(in_file)

# Use the first alignment p4 registered.
a = p4.var.alignments[0]
# Starting tree: distance matrix from the alignment, then a tree from that
# matrix (presumably p-distances and BioNJ -- confirm against the p4 docs).
dm = a.pDistances()
t = dm.bionj()
# NOTE(review): Data() is built without arguments; presumably it picks up the
# alignment(s) already read into p4.var -- confirm.
d = p4.Data()
t.data = d
# Model set-up on the tree: 'empirical' composition, 'ones' rate matrix,
# 4 gamma categories, gdasrv starting at 0.5, pInvar at 0.0 with free=0
# (free=1 / free=0 presumably mark a parameter as optimised / fixed -- confirm).
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)
# Optimise, then name the tree and pickle it.
t.optLogLike()
t.name = 'homogOpt'
t.tPickle()
Example #8
0
    print "usage: "+sys.argv[0]+" <starting tree>"
    sys.exit(0)

def f5(seq, idfun=None):
    """Return the items of *seq* without duplicates, in first-seen order.

    Two items count as duplicates when ``idfun`` maps them to the same
    (hashable) marker; only the first item for each marker is kept. With no
    ``idfun`` the items themselves are used as markers.
    """
    key_of = idfun if idfun is not None else (lambda x: x)
    already = set()
    unique = []
    for element in seq:
        key = key_of(element)
        if key not in already:
            already.add(key)
            unique.append(element)
    return unique

# Read the starting tree named on the command line and use the first tree
# p4 registered.
p4.read(sys.argv[1])
t = p4.var.trees[0]
di = []  # nothing is appended to this in the visible part of the script
# Two one-shot flags choose the rearrangement applied in each iteration:
# iteration 1 -> nni(), iteration 2 -> randomSpr(), every later one -> nni().
alt = True
alt2 = False
for i in range(10000):
    # Each iteration starts from a duplicate of `t`
    # (dupe() presumably returns an independent copy -- confirm).
    d = t.dupe()
    if alt == True:
        # First iteration only; `alt` stays False afterwards.
        d.nni()
        alt = False
    else:
        if alt2 == False:
            # Second iteration only; `alt2` stays True afterwards.
            d.randomSpr()
            alt2 = True
        else:
            d.nni()
import pandas as pd

# Usage banner.
# NOTE(review): the banner lists three files (test set, log, phylograms), but
# only sys.argv[1] and sys.argv[2] are read below, and sys.argv[2] is used as
# the phylograms file, not the log file -- confirm which one is intended.
print '''
The order of the arguments is:
- test set file
- log file
- phylograms file
'''


# Echo the raw argument list.
print sys.argv

test_set_file = sys.argv[1]
phylogs_file = sys.argv[2]

# Read the test set and use the first alignment p4 registered.
p4.read(test_set_file)
a = p4.var.alignments[0]

# Read the second file; the loop below walks everything in p4.var.trees.
p4.read(phylogs_file)

# For every tree: attach the test alignment, configure the model and compute
# the log-likelihood. The result is not stored or printed in the visible code.
for i in range(len(p4.var.trees)):
    print i
    t = p4.var.trees[i]
    t.data = p4.Data(a)
    # 'equal' composition, 'ones' rate matrix, a single gamma category and
    # pInvar at 0.0 (free=1 / free=0 presumably mark a parameter as
    # optimised / fixed -- confirm against the p4 docs).
    t.newComp(free = 1, spec = 'equal')
    t.newRMatrix(free = 0, spec = 'ones')
    t.setNGammaCat(partNum = 0, nGammaCat=1)
#    t.newGdasrv(partNum=0,free = 1, val = gamma.ix[i])
    t.setPInvar(free = 0, val = 0.0)
    t.calcLogLike()
    # Drop the model reference once the likelihood has been computed
    # (presumably to release memory -- confirm).
    t.model = None