Example #1
0
 def testTraversals(self):
     """Check the node count produced by every traversal order.

     For each tree in self.trees, the number of distinct items yielded by
     the pre-order, post-order and breadth-first traversals must equal the
     number of nodes in the underlying graph.
     """
     traversalNames = ("preOrderTraversal",
                       "postOrderTraversal",
                       "breadthFirstTraversal")
     for tree in self.trees:
         nxTree = NXTree(tree)
         for traversalName in traversalNames:
             # Look the traversal up lazily so each one runs (and is
             # checked) before the next one starts.
             visited = list(getattr(nxTree, traversalName)())
             assert len(set(visited)) == len(nxTree.nxDg.nodes())
Example #2
0
 def testTraversals(self):
     """Check the node count produced by every traversal order.

     For each tree in self.trees, the number of distinct items yielded by
     the pre-order, post-order and breadth-first traversals must equal the
     number of nodes in the underlying graph.
     """
     traversalNames = ("preOrderTraversal",
                       "postOrderTraversal",
                       "breadthFirstTraversal")
     for tree in self.trees:
         nxTree = NXTree(tree)
         for traversalName in traversalNames:
             # Look the traversal up lazily so each one runs (and is
             # checked) before the next one starts.
             visited = list(getattr(nxTree, traversalName)())
             assert len(set(visited)) == len(nxTree.nxDg.nodes())
Example #3
0
class SeqFile:
    """In-memory form of a sequence file: a tree plus a name -> path map.

    The file may contain one Newick tree line and any number of
    "name path" lines.  After parsing, the instance exposes:

    tree      -- the parsed tree, or a star tree built from the names when
                 the file contains no tree line
    pathMap   -- dict mapping each sequence name to its path string
    outgroups -- names that were prefixed with '*' in the file
    """

    # Weight given to every edge that has no branch length of its own.
    branchLen = 1

    def __init__(self, path=None):
        """Parse `path` immediately when one is given.

        When `path` is None nothing is parsed and the tree, pathMap and
        outgroups attributes are not created until parseFile is called.
        """
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Read the sequence file at `path`, then clean and validate the tree.

        Blank lines and lines starting with '#' are ignored.  The first line
        that is a single token or starts with '(' is parsed as the Newick
        tree.  A line with two or more tokens is a "name path" entry; every
        token after the name is joined with single spaces to form the path,
        and a leading '*' on the name marks it as an outgroup.  Anything
        else is reported on stderr and skipped.

        Raises RuntimeError if the file does not exist, the tree cannot be
        parsed, a name appears twice, or cleanTree/validate reject the
        result.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        # The context manager closes the handle even when a parse error
        # is raised part-way through the file.
        with open(path, "r") as seqFile:
            for rawLine in seqFile:
                line = rawLine.strip()
                if not line or line[0] == "#":
                    continue
                # `line` is non-empty and stripped, so tokens has >= 1 item.
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        raise RuntimeError("Failed to parse newick tree: %s" %
                                           line)
                elif tokens[0] == '*':
                    # A lone '*' carries no name to attach a path to.
                    sys.stderr.write("Skipping line %s\n" % rawLine)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        name = name[1:]
                        self.outgroups.append(name)
                    if name in self.pathMap:
                        raise RuntimeError("Duplicate name found: %s" % name)
                    # str.join works on both Python 2 and 3; string.join()
                    # was removed in Python 3.
                    self.pathMap[name] = " ".join(tokens[1:])
                else:
                    sys.stderr.write("Skipping line %s\n" % rawLine)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Build a star tree: a root (id 0) with one child per pathMap name.

        Children are labelled 1..N in pathMap iteration order and every edge
        gets weight SeqFile.branchLen.
        """
        self.tree = NXTree()
        rootId = 0
        self.tree.nxDg.add_node(rootId)
        self.tree.rootId = rootId
        for label, name in enumerate(self.pathMap, 1):
            self.tree.nxDg.add_edge(rootId, label)
            self.tree.setName(label, name)
            self.tree.setWeight(rootId, label, SeqFile.branchLen)

    def validate(self):
        """Check that the tree is big enough and every leaf has a path.

        Raises RuntimeError if the tree has two or fewer nodes, or if a
        leaf's name is missing from pathMap.
        """
        if len(list(self.tree.postOrderTraversal())) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                # NOTE: checking that the path exists on disk and running
                # sanityCheckSequence on it are currently disabled here.

    def sanityCheckSequence(self, path):
        """Warns the user about common problems with the input sequences.

        Runs cactus_analyseAssembly on `path` (or on the concatenation of
        every file inside it when `path` is a directory) and writes a
        warning to stderr when the reported masked fraction exceeds 0.70 or
        the reported N fraction exceeds 0.30.  Nothing is returned.
        """
        # Relies on cactus_analyseAssembly output staying in the
        # format it's currently in.
        cmdline = "cactus_analyseAssembly"
        if os.path.isdir(path):
            cmdline = "cat %s/* | %s -" % (path, cmdline)
        else:
            cmdline += " %s" % path
        output = popenCatch(cmdline)
        try:
            repeatMaskedFrac = float(
                re.search(r'Proportion-repeat-masked: ([0-9.]*)',
                          output).group(1))
            nFrac = float(
                re.search(r'ProportionNs: ([0-9.]*)', output).group(1))
        except ValueError:
            # This can happen if the genome has 0 length, making the fractions NaN.
            # We warn the user but return afterwards, as the rest of the checks are
            # dependent on the fraction values.
            sys.stderr.write(
                "WARNING: sequence path %s has 0 length. Consider "
                "removing it from your input file.\n\n" % path)
            return
        # These thresholds are pretty arbitrary, but should be good for
        # badly- to well-assembled vertebrate genomes.
        if repeatMaskedFrac > 0.70:
            sys.stderr.write(
                "WARNING: sequence path %s has an extremely high "
                "proportion of masked bases: %f. progressiveCactus"
                " expects a soft-masked genome, i.e. all lowercase"
                " characters are considered masked. The process "
                "will proceed normally, but make sure you haven't "
                "accidentally provided an all-lowercase genome, "
                "in which case nothing will be aligned to "
                "it!\n\n" % (path, repeatMaskedFrac))
        if nFrac > 0.30:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of 'N' bases: %f. The process will "
                             "proceed normally, but make sure your genome "
                             "isn't hard-masked! Alignments to hard-masked "
                             "genomes are much worse than to soft-masked "
                             "genomes. If the genome just has a lot of "
                             "poorly assembled regions, feel free to "
                             "ignore this message.\n\n" % (path, nFrac))

    def cleanTree(self):
        """Remove leaves without sequence data and fill in branch lengths.

        Leaves whose name is not in pathMap are removed (with a message on
        stderr).  Afterwards every edge whose weight is None is set to
        SeqFile.branchLen.

        Raises RuntimeError if the tree has fewer than two leaves, or if
        none of its leaves has a path.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                numLeaves += 1
                if self.tree.getName(node) not in self.pathMap:
                    removeList.append(node)
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError(
                "No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" %
                             (self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        # Second pass, after removal, so only surviving edges are touched.
        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" %
                        (self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    def toXMLElement(self):
        """Create the cactus_workflow_experiment XML element.

        The element serves as the root node of the experiment template file
        needed by cactus_createMultiCactusProject.  Note the element is
        incomplete until the cactus_disk child element has been added.

        The "sequences" attribute lists the path of every leaf in post-order,
        each followed by a single space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        leafPaths = [self.pathMap[self.tree.getName(node)]
                     for node in self.tree.postOrderTraversal()
                     if self.tree.isLeaf(node)]
        # Every path keeps its trailing space, matching the previous output.
        elem.attrib["sequences"] = "".join(
            leafPath + " " for leafPath in leafPaths)
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem
Example #4
0
class SeqFile:
    """In-memory form of a sequence file: a tree plus a name -> path map.

    The file may contain one Newick tree line and any number of
    "name path" lines.  After parsing, the instance exposes:

    tree      -- the parsed tree, or a star tree built from the names when
                 the file contains no tree line
    pathMap   -- dict mapping each sequence name to its path string
    outgroups -- names that were prefixed with '*' in the file
    """

    # Weight given to every edge that has no branch length of its own.
    branchLen = 1

    def __init__(self, path=None):
        """Parse `path` immediately when one is given.

        When `path` is None nothing is parsed and the tree, pathMap and
        outgroups attributes are not created until parseFile is called.
        """
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Read the sequence file at `path`, then clean and validate the tree.

        Blank lines and lines starting with '#' are ignored.  The first line
        that is a single token or starts with '(' is parsed as the Newick
        tree.  A line with two or more tokens is a "name path" entry; every
        token after the name is joined with single spaces to form the path,
        and a leading '*' on the name marks it as an outgroup.  Anything
        else is reported on stderr and skipped.

        Raises RuntimeError if the file does not exist, the tree cannot be
        parsed, a name appears twice, or cleanTree/validate reject the
        result.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        # The context manager closes the handle even when a parse error
        # is raised part-way through the file.
        with open(path, "r") as seqFile:
            for rawLine in seqFile:
                line = rawLine.strip()
                if not line or line[0] == "#":
                    continue
                # `line` is non-empty and stripped, so tokens has >= 1 item.
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        raise RuntimeError("Failed to parse newick tree: %s" %
                                           line)
                elif tokens[0] == '*':
                    # A lone '*' carries no name to attach a path to.
                    sys.stderr.write("Skipping line %s\n" % rawLine)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        name = name[1:]
                        self.outgroups.append(name)
                    if name in self.pathMap:
                        raise RuntimeError("Duplicate name found: %s" % name)
                    # str.join works on both Python 2 and 3; string.join()
                    # was removed in Python 3.
                    self.pathMap[name] = " ".join(tokens[1:])
                else:
                    sys.stderr.write("Skipping line %s\n" % rawLine)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Build a star tree: a root (id 0) with one child per pathMap name.

        Children are labelled 1..N in pathMap iteration order and every edge
        gets weight SeqFile.branchLen.
        """
        self.tree = NXTree()
        rootId = 0
        self.tree.nxDg.add_node(rootId)
        self.tree.rootId = rootId
        for label, name in enumerate(self.pathMap, 1):
            self.tree.nxDg.add_edge(rootId, label)
            self.tree.setName(label, name)
            self.tree.setWeight(rootId, label, SeqFile.branchLen)

    def validate(self):
        """Check the tree size and that every leaf has an existing path.

        Raises RuntimeError if the tree has two or fewer nodes, if a leaf's
        name is missing from pathMap, or if a leaf's path does not exist on
        the filesystem.
        """
        if len(list(self.tree.postOrderTraversal())) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                path = self.pathMap[name]
                # os.path.exists must be *called*: the bare function object
                # is always truthy, which silently disabled this check.
                if not os.path.exists(path):
                    raise RuntimeError("Sequence path not found: %s" % path)

    def cleanTree(self):
        """Remove leaves without sequence data and fill in branch lengths.

        Leaves whose name is not in pathMap are removed (with a message on
        stderr).  Afterwards every edge whose weight is None is set to
        SeqFile.branchLen.

        Raises RuntimeError if the tree has fewer than two leaves, or if
        none of its leaves has a path.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                numLeaves += 1
                if self.tree.getName(node) not in self.pathMap:
                    removeList.append(node)
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError("No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" % (
                self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        # Second pass, after removal, so only surviving edges are touched.
        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" % (
                            self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    def toXMLElement(self):
        """Create the cactus_workflow_experiment XML element.

        The element serves as the root node of the experiment template file
        needed by cactus_createMultiCactusProject.  Note the element is
        incomplete until the cactus_disk child element has been added.

        The "sequences" attribute lists absSymPath() of every leaf's path in
        post-order, each followed by a single space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        leafPaths = [self.pathMap[self.tree.getName(node)]
                     for node in self.tree.postOrderTraversal()
                     if self.tree.isLeaf(node)]
        # NOTE(review): an earlier `path.replace(" ", "\ ")` statement here
        # discarded its result, so spaces inside a path were never escaped.
        # That no-op is removed; output is unchanged.  Paths containing
        # spaces remain ambiguous in this space-separated attribute --
        # confirm what the consumer expects before adding real escaping.
        elem.attrib["sequences"] = "".join(
            absSymPath(leafPath) + " " for leafPath in leafPaths)
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem
Example #5
0
class SeqFile:
    """In-memory form of a sequence file: a tree plus a name -> path map.

    The file may contain one Newick tree line and any number of
    "name path" lines.  After parsing, the instance exposes:

    tree      -- the parsed tree, or a star tree built from the names when
                 the file contains no tree line
    pathMap   -- dict mapping each sequence name to its path string
    outgroups -- names that were prefixed with '*' in the file
    """

    # Weight given to every edge that has no branch length of its own.
    branchLen = 1

    def __init__(self, path=None):
        """Parse `path` immediately when one is given.

        When `path` is None nothing is parsed and the tree, pathMap and
        outgroups attributes are not created until parseFile is called.
        """
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Read the sequence file at `path`, then clean and validate the tree.

        Blank lines and lines starting with '#' are ignored.  The first line
        that is a single token or starts with '(' is parsed as the Newick
        tree.  A line with two or more tokens is a "name path" entry; every
        token after the name is joined with single spaces to form the path,
        and a leading '*' on the name marks it as an outgroup.  Anything
        else is reported on stderr and skipped.

        Raises RuntimeError if the file does not exist, the tree cannot be
        parsed, a name appears twice, or cleanTree/validate reject the
        result.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        # The context manager closes the handle even when a parse error
        # is raised part-way through the file.
        with open(path, "r") as seqFile:
            for rawLine in seqFile:
                line = rawLine.strip()
                if not line or line[0] == "#":
                    continue
                # `line` is non-empty and stripped, so tokens has >= 1 item.
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        raise RuntimeError("Failed to parse newick tree: %s" %
                                           line)
                elif tokens[0] == '*':
                    # A lone '*' carries no name to attach a path to.
                    sys.stderr.write("Skipping line %s\n" % rawLine)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        name = name[1:]
                        self.outgroups.append(name)
                    if name in self.pathMap:
                        raise RuntimeError("Duplicate name found: %s" % name)
                    # str.join works on both Python 2 and 3; string.join()
                    # was removed in Python 3.
                    self.pathMap[name] = " ".join(tokens[1:])
                else:
                    sys.stderr.write("Skipping line %s\n" % rawLine)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Build a star tree: a root (id 0) with one child per pathMap name.

        Children are labelled 1..N in pathMap iteration order and every edge
        gets weight SeqFile.branchLen.
        """
        self.tree = NXTree()
        rootId = 0
        self.tree.nxDg.add_node(rootId)
        self.tree.rootId = rootId
        for label, name in enumerate(self.pathMap, 1):
            self.tree.nxDg.add_edge(rootId, label)
            self.tree.setName(label, name)
            self.tree.setWeight(rootId, label, SeqFile.branchLen)

    def validate(self):
        """Check that the tree is big enough and every leaf has a path.

        Raises RuntimeError if the tree has two or fewer nodes, or if a
        leaf's name is missing from pathMap.
        """
        if len(list(self.tree.postOrderTraversal())) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                # NOTE: checking that the path exists on disk and running
                # sanityCheckSequence on it are currently disabled here.

    def sanityCheckSequence(self, path):
        """Warns the user about common problems with the input sequences.

        Runs cactus_analyseAssembly on `path` (or on the concatenation of
        every file inside it when `path` is a directory) and writes a
        warning to stderr when the reported masked fraction exceeds 0.70 or
        the reported N fraction exceeds 0.30.  Nothing is returned.
        """
        # Relies on cactus_analyseAssembly output staying in the
        # format it's currently in.
        cmdline = "cactus_analyseAssembly"
        if os.path.isdir(path):
            cmdline = "cat %s/* | %s -" % (path, cmdline)
        else:
            cmdline += " %s" % path
        output = popenCatch(cmdline)
        try:
            repeatMaskedFrac = float(
                re.search(r'Proportion-repeat-masked: ([0-9.]*)',
                          output).group(1))
            nFrac = float(
                re.search(r'ProportionNs: ([0-9.]*)', output).group(1))
        except ValueError:
            # This can happen if the genome has 0 length, making the fractions NaN.
            # We warn the user but return afterwards, as the rest of the checks are
            # dependent on the fraction values.
            sys.stderr.write("WARNING: sequence path %s has 0 length. Consider "
                             "removing it from your input file.\n\n" % path)
            return
        # These thresholds are pretty arbitrary, but should be good for
        # badly- to well-assembled vertebrate genomes.
        if repeatMaskedFrac > 0.70:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of masked bases: %f. progressiveCactus"
                             " expects a soft-masked genome, i.e. all lowercase"
                             " characters are considered masked. The process "
                             "will proceed normally, but make sure you haven't "
                             "accidentally provided an all-lowercase genome, "
                             "in which case nothing will be aligned to "
                             "it!\n\n" % (path, repeatMaskedFrac))
        if nFrac > 0.30:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of 'N' bases: %f. The process will "
                             "proceed normally, but make sure your genome "
                             "isn't hard-masked! Alignments to hard-masked "
                             "genomes are much worse than to soft-masked "
                             "genomes. If the genome just has a lot of "
                             "poorly assembled regions, feel free to "
                             "ignore this message.\n\n" % (path, nFrac))

    def cleanTree(self):
        """Remove leaves without sequence data and fill in branch lengths.

        Leaves whose name is not in pathMap are removed (with a message on
        stderr).  Afterwards every edge whose weight is None is set to
        SeqFile.branchLen.

        Raises RuntimeError if the tree has fewer than two leaves, or if
        none of its leaves has a path.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                numLeaves += 1
                if self.tree.getName(node) not in self.pathMap:
                    removeList.append(node)
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError("No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" % (
                self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        # Second pass, after removal, so only surviving edges are touched.
        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" % (
                            self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    def toXMLElement(self):
        """Create the cactus_workflow_experiment XML element.

        The element serves as the root node of the experiment template file
        needed by cactus_createMultiCactusProject.  Note the element is
        incomplete until the cactus_disk child element has been added.

        The "sequences" attribute lists the path of every leaf in post-order,
        each followed by a single space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        leafPaths = [self.pathMap[self.tree.getName(node)]
                     for node in self.tree.postOrderTraversal()
                     if self.tree.isLeaf(node)]
        # Every path keeps its trailing space, matching the previous output.
        elem.attrib["sequences"] = "".join(
            leafPath + " " for leafPath in leafPaths)
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem