def test_sequence_conversion(self):
    """Convert a populated PX.Sequence to a SeqRecord and back again.

    No attribute is asserted yet; the test only checks that both
    conversions run without raising.
    """
    accession = PX.Accession('P81431', source='UniProtKB')
    residues = PX.MolSeq(
        'TDATGKPIKCMAAIAWEAKKPLSIEEVEVAPPKSGEVRIKILHSGVCHTD')
    annotations = [
        PX.Annotation(ref='EC:1.1.1.1'),
        PX.Annotation(ref='GO:0004022'),
    ]
    # Each row is (value, start, end, confidence).
    domain_rows = (
        ('FOO', 0, 5, 7.0e-26),
        ('BAR', 8, 13, 7.2e-117),
        ('A-OK', 21, 34, 2.4e-06),
        ('WD40', 40, 50, 0.3),
    )
    architecture = PX.DomainArchitecture(
        length=50,
        domains=[PX.ProteinDomain(*row) for row in domain_rows],
    )
    # id_ref, id_source and location are deliberately left unset.
    pseq = PX.Sequence(
        type='protein',
        symbol='ADHX',
        accession=accession,
        name='Alcohol dehydrogenase class-3',
        mol_seq=residues,
        uri=None,
        annotations=annotations,
        domain_architecture=architecture,
    )
    srec = pseq.to_seqrecord()
    # TODO: check seqrec-specific traits (see args)
    #   Seq(letters, alphabet), id, name, description, features
    pseq2 = PX.Sequence.from_seqrecord(srec)
def test_sequence_conversion(self):
    """Round-trip a PX.Sequence through to_seqrecord/from_seqrecord.

    Nothing is asserted on the results; the test passes as long as
    neither conversion raises.
    """
    sequence_kwargs = {
        "type": "protein",
        # id_ref, id_source and location are intentionally omitted
        "symbol": "ADHX",
        "accession": PX.Accession("P81431", source="UniProtKB"),
        "name": "Alcohol dehydrogenase class-3",
        "mol_seq": PX.MolSeq(
            "TDATGKPIKCMAAIAWEAKKPLSIEEVEVAPPKSGEVRIKILHSGVCHTD"),
        "uri": None,
        "annotations": [
            PX.Annotation(ref="EC:1.1.1.1"),
            PX.Annotation(ref="GO:0004022"),
        ],
    }
    domains = []
    # Tuples are (value, start, end, confidence).
    for spec in (
        ("FOO", 0, 5, 7.0e-26),
        ("BAR", 8, 13, 7.2e-117),
        ("A-OK", 21, 34, 2.4e-06),
        ("WD40", 40, 50, 0.3),
    ):
        domains.append(PX.ProteinDomain(*spec))
    sequence_kwargs["domain_architecture"] = PX.DomainArchitecture(
        length=50, domains=domains)
    pseq = PX.Sequence(**sequence_kwargs)
    srec = pseq.to_seqrecord()
    # TODO: check seqrec-specific traits (see args)
    #   Seq(letters, alphabet), id, name, description, features
    pseq2 = PX.Sequence.from_seqrecord(srec)
def events(self, elem):
    """Create events object from the children of ``elem``."""
    event_type = _get_child_text(elem, 'type')
    # The three counters are converted with int.
    n_duplications = _get_child_text(elem, 'duplications', int)
    n_speciations = _get_child_text(elem, 'speciations', int)
    n_losses = _get_child_text(elem, 'losses', int)
    confidence = _get_child_as(elem, 'confidence', self.confidence)
    return PX.Events(
        type=event_type,
        duplications=n_duplications,
        speciations=n_speciations,
        losses=n_losses,
        confidence=confidence,
    )
def date(self, elem):
    """Create date object from ``elem``'s unit attribute and child elements."""
    unit = elem.get("unit")
    description = _collapse_wspace(_get_child_text(elem, "desc"))
    # value, minimum and maximum are converted with float.
    value = _get_child_text(elem, "value", float)
    lower = _get_child_text(elem, "minimum", float)
    upper = _get_child_text(elem, "maximum", float)
    return PX.Date(
        unit=unit,
        desc=description,
        value=value,
        minimum=lower,
        maximum=upper,
    )
def annotation(self, elem):
    """Create annotation object.

    Child elements supply desc, confidence, properties and uri; every
    XML attribute of ``elem`` is forwarded as an extra keyword argument.
    """
    description = _collapse_wspace(_get_child_text(elem, "desc"))
    confidence = _get_child_as(elem, "confidence", self.confidence)
    properties = _get_children_as(elem, "property", self.property)
    uri = _get_child_as(elem, "uri", self.uri)
    return PX.Annotation(
        desc=description,
        confidence=confidence,
        properties=properties,
        uri=uri,
        **elem.attrib
    )
def events(self, elem):
    """Create events object from the children of ``elem``."""
    event_type = _get_child_text(elem, "type")
    # Integer counters, read in the same order as the keyword arguments.
    counters = {}
    for counter in ("duplications", "speciations", "losses"):
        counters[counter] = _get_child_text(elem, counter, int)
    confidence = _get_child_as(elem, "confidence", self.confidence)
    return PX.Events(type=event_type, confidence=confidence, **counters)
def clade_relation(self, elem):
    """Create clade relationship object.

    The ``type``, ``id_ref_0``, ``id_ref_1`` and ``distance`` attributes
    of ``elem`` are read, together with its ``confidence`` child element.
    """
    return PX.CladeRelation(
        elem.get("type"),
        elem.get("id_ref_0"),
        elem.get("id_ref_1"),
        # Parse the distance numerically, exactly as sequence_relation
        # does, instead of handing the raw attribute string through.
        # NOTE(review): relies on _float tolerating a missing attribute,
        # as its identical use in sequence_relation suggests.
        distance=_float(elem.get("distance")),
        confidence=_get_child_as(elem, "confidence", self.confidence),
    )
def other(self, elem, namespace, localtag):
    """Create an Other object from ``elem``, recursing into its children."""
    raw_text = elem.text
    # Missing, empty and whitespace-only text all become None.
    stripped = raw_text.strip() if raw_text else None
    value = stripped or None
    children = [
        self.other(child, *_split_namespace(child.tag)) for child in elem
    ]
    return PX.Other(
        localtag, namespace, elem.attrib, value=value, children=children)
def point(self, elem):
    """Create point object, coordinates of a point."""
    datum = elem.get("geodetic_datum")
    # lat, long and alt are child elements converted with float.
    latitude = _get_child_text(elem, "lat", float)
    longitude = _get_child_text(elem, "long", float)
    altitude = _get_child_text(elem, "alt", float)
    altitude_unit = elem.get("alt_unit")
    return PX.Point(
        datum, latitude, longitude, alt=altitude, alt_unit=altitude_unit)
def date(self, elem):
    """Create date object from ``elem``'s unit attribute and child elements."""
    fields = {
        'unit': elem.get('unit'),
        'desc': _collapse_wspace(_get_child_text(elem, 'desc')),
    }
    # The numeric children are all converted with float.
    for numeric_tag in ('value', 'minimum', 'maximum'):
        fields[numeric_tag] = _get_child_text(elem, numeric_tag, float)
    return PX.Date(**fields)
def domain(self, elem):
    """Create protein domain object."""
    label = elem.text.strip()
    # 'from' is decremented by one while 'to' is used unchanged
    # (presumably 1-based inclusive -> 0-based half-open; confirm).
    start = int(elem.get("from")) - 1
    end = int(elem.get("to"))
    confidence = _float(elem.get("confidence"))
    return PX.ProteinDomain(
        label, start, end, confidence=confidence, id=elem.get("id"))
def sequence_relation(self, elem):
    """Create sequence relationship object, relationship between two sequences."""
    relation_type = elem.get("type")
    first_ref = elem.get("id_ref_0")
    second_ref = elem.get("id_ref_1")
    distance = _float(elem.get("distance"))
    confidence = _get_child_as(elem, "confidence", self.confidence)
    return PX.SequenceRelation(
        relation_type,
        first_ref,
        second_ref,
        distance=distance,
        confidence=confidence,
    )
def property(self, elem):
    """Create properties from external resources."""
    value = elem.text.strip()
    # The remaining fields all come straight from XML attributes.
    positional = [elem.get(key) for key in ("ref", "applies_to", "datatype")]
    unit = elem.get("unit")
    id_ref = elem.get("id_ref")
    return PX.Property(value, *positional, unit=unit, id_ref=id_ref)
def other(self, elem, namespace, localtag):
    """Create an Other object, a non-phyloXML element."""
    # Missing, empty or whitespace-only text is stored as None.
    value = None
    if elem.text:
        value = elem.text.strip() or None
    # Child elements are converted recursively, each with its own
    # namespace and local tag.
    children = []
    for child in elem:
        child_namespace, child_tag = _split_namespace(child.tag)
        children.append(self.other(child, child_namespace, child_tag))
    return PX.Other(
        localtag, namespace, elem.attrib, value=value, children=children)
def main():
    """Annotate every named clade of ``jpp2.xml`` and write ``jpp.xml``.

    Command-line arguments are parsed (help is printed and the process
    exits with status 1 when none are given), but the input and output
    file names used below are hard-coded.
    """
    parser = get_parser()

    # Print parser help if arguments are missing.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Parsed for validation only; the result is not used further down.
    Arguments = parser.parse_args()

    tree = Phylo.read("jpp2.xml", 'phyloxml')
    for clade in tree.find_clades(name=True):
        desc_node = PhyloXML.Other('desc', value='youpi')
        uri_node = PhyloXML.Other('uri', value='http://lol.com')
        annotation_node = PhyloXML.Other(
            tag="annotation", namespace="", children=[desc_node, uri_node])
        clade.other = [annotation_node]
        clade.annotation = PhyloXML.Annotation(
            desc="youpi", uri="http://lol.com")
    Phylo.write(tree, "jpp.xml", 'phyloxml')
def output_xml(coloured_trees, path, colour_branches):
    """Write the trees to ``path`` in phyloXML format.

    Every terminal clade gets a ``style:font_color`` property holding the
    hex form of its colour. When ``colour_branches`` is false, the
    ``color`` of every clade is cleared afterwards.
    """
    # Convert each tree to a PhyloXML phylogeny.
    phylogenies = [
        PhyloXML.Phylogeny.from_tree(tree) for tree in coloured_trees
    ]
    for phylogeny in phylogenies:
        for tip in phylogeny.get_terminals():
            hex_colour = BranchColor.to_hex(tip.color)
            font_property = PhyloXML.Property(
                hex_colour, "style:font_color", "node", "xsd:token")
            tip.properties = [font_property]
        if colour_branches:
            continue
        # The hex value has already been captured above, so the clade
        # colours can now be dropped.
        every_clade = phylogeny.get_nonterminals() + phylogeny.get_terminals()
        for clade in every_clade:
            clade.color = None
    Phylo.write(phylogenies, path, "phyloxml")
def prepare_species_tree(FILE_TREE_IN, FILE_TREE_OUT):
    """Rewrite a phyloXML species tree so named nodes carry a taxonomy.

    The first phylogeny of ``FILE_TREE_IN`` is marked as rooted. For each
    clade with a name, the name (minus a leading ``INT`` prefix, if any)
    is used as an ``ncbi_taxonomy`` id; the scientific name comes from
    ``find_tax_name`` and falls back to ``'(NA)'`` on ``KeyError``. The
    clade's name is then cleared. The result is written to
    ``FILE_TREE_OUT``.
    """
    # Close the input handle deterministically instead of leaking it.
    with open(FILE_TREE_IN, 'r') as handle:
        treexml = PhyloXMLIO.read(handle)
    tree = treexml[0]
    treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
    tree.rooted = True
    for node in tree.clade.find_clades():
        if not node.name:
            continue
        tax_id = node.name
        if tax_id.startswith('INT'):
            tax_id = tax_id[3:]
        taxon = PhyloXML.Taxonomy(
            id=PhyloXML.Id(tax_id, provider='ncbi_taxonomy'))
        try:
            taxon.scientific_name = find_tax_name(tax_id)
        except KeyError:
            taxon.scientific_name = '(NA)'
        node._set_taxonomy(taxon)
        node.name = None
    PhyloXMLIO.write(treexml, FILE_TREE_OUT)
def binary_characters(self, elem):
    """Create binary characters object from ``elem``."""
    def bc_texts(node):
        return _get_children_text(node, 'bc')

    states = ('gained', 'lost', 'present', 'absent')
    fields = {'type': elem.get('type')}
    # <state>_count attributes are parsed with _int.
    for state in states:
        count_key = state + '_count'
        fields[count_key] = _int(elem.get(count_key))
    # Flatten BinaryCharacterList sub-nodes into lists of strings
    for state in states:
        fields[state] = _get_child_as(elem, state, bc_texts)
    return PX.BinaryCharacters(**fields)
def binary_characters(self, elem):
    """Create binary characters object."""
    def bc_getter(node):
        """Get binary characters from subnodes."""
        return _get_children_text(node, "bc")

    character_type = elem.get("type")
    n_gained = _int(elem.get("gained_count"))
    n_lost = _int(elem.get("lost_count"))
    n_present = _int(elem.get("present_count"))
    n_absent = _int(elem.get("absent_count"))
    # Flatten BinaryCharacterList sub-nodes into lists of strings
    gained = _get_child_as(elem, "gained", bc_getter)
    lost = _get_child_as(elem, "lost", bc_getter)
    present = _get_child_as(elem, "present", bc_getter)
    absent = _get_child_as(elem, "absent", bc_getter)
    return PX.BinaryCharacters(
        type=character_type,
        gained_count=n_gained,
        lost_count=n_lost,
        present_count=n_present,
        absent_count=n_absent,
        gained=gained,
        lost=lost,
        present=present,
        absent=absent)
def _parse_phylogeny(self, parent):
    """Parse a single phylogeny within the phyloXML tree (PRIVATE).

    Consumes events from ``self.context`` until the closing phylogeny
    tag. The root clade is delegated to ``_parse_clade``; the remaining
    children are attached to the new ``PX.Phylogeny`` as they close.
    """
    bool_attribs = _dict_str2bool(parent.attrib, ["rooted", "rerootable"])
    phylogeny = PX.Phylogeny(**bool_attribs)
    # XML tag -> name of the list attribute collecting those children
    plural_attr = {
        "confidence": "confidences",
        "property": "properties",
        "clade_relation": "clade_relations",
        "sequence_relation": "sequence_relations",
    }
    for event, elem in self.context:
        namespace, tag = _split_namespace(elem.tag)
        if event == "start":
            if tag == "clade":
                if phylogeny.root is not None:
                    raise ValueError(
                        "Phylogeny object should only have 1 clade")
                phylogeny.root = self._parse_clade(elem)
            continue
        if event != "end":
            continue
        if tag == "phylogeny":
            parent.clear()
            break
        # Handle the other non-recursive children
        if tag in plural_attr:
            converted = getattr(self, tag)(elem)
            getattr(phylogeny, plural_attr[tag]).append(converted)
        elif tag in ("date", "id"):
            # Complex types
            setattr(phylogeny, tag, getattr(self, tag)(elem))
        elif tag in ("name", "description"):
            # Simple types
            setattr(phylogeny, tag, _collapse_wspace(elem.text))
        elif namespace != NAMESPACES["phy"]:
            # Unknown tags
            phylogeny.other.append(self.other(elem, namespace, tag))
            parent.clear()
        else:
            # NB: This shouldn't happen in valid files
            raise PhyloXMLError("Misidentified tag: " + tag)
    return phylogeny
def _parse_phylogeny(self, parent):
    """Parse a single phylogeny within the phyloXML tree (PRIVATE).

    Recursively builds a phylogenetic tree with help from parse_clade,
    then clears the XML event history for the phylogeny element and
    returns control to the top-level parsing function.

    Raises ValueError if a second top-level clade is encountered, and
    PhyloXMLError for a phyloXML-namespace tag that is not recognised.
    """
    phylogeny = PX.Phylogeny(**_dict_str2bool(parent.attrib,
                                              ['rooted', 'rerootable']))
    list_types = {
        # XML tag, plural attribute
        'confidence': 'confidences',
        'property': 'properties',
        'clade_relation': 'clade_relations',
        'sequence_relation': 'sequence_relations',
    }
    for event, elem in self.context:
        namespace, tag = _split_namespace(elem.tag)
        if event == 'start' and tag == 'clade':
            # An explicit raise rather than an assert: asserts are
            # stripped under ``python -O``, which would let a second
            # root clade silently replace the first.
            if phylogeny.root is not None:
                raise ValueError(
                    "Phylogeny object should only have 1 clade")
            phylogeny.root = self._parse_clade(elem)
            continue
        if event == 'end':
            if tag == 'phylogeny':
                parent.clear()
                break
            # Handle the other non-recursive children
            if tag in list_types:
                getattr(phylogeny, list_types[tag]).append(
                    getattr(self, tag)(elem))
            # Complex types
            elif tag in ('date', 'id'):
                setattr(phylogeny, tag, getattr(self, tag)(elem))
            # Simple types
            elif tag in ('name', 'description'):
                setattr(phylogeny, tag, _collapse_wspace(elem.text))
            # Unknown tags
            elif namespace != NAMESPACES['phy']:
                phylogeny.other.append(self.other(elem, namespace, tag))
                parent.clear()
            else:
                # NB: This shouldn't happen in valid files
                raise PhyloXMLError('Misidentified tag: ' + tag)
    return phylogeny
def binary_characters(self, elem):
    """Create binary characters object."""
    # This comment stops black style adding a blank line here, which causes flake8 D202.
    def bc_getter(node):
        """Get binary characters from subnodes."""
        return _get_children_text(node, "bc")

    def count_of(attribute):
        """Read an integer count attribute of elem via _int."""
        return _int(elem.get(attribute))

    def flattened(child_tag):
        """Flatten a BinaryCharacterList sub-node into a list of strings."""
        return _get_child_as(elem, child_tag, bc_getter)

    return PX.BinaryCharacters(
        type=elem.get("type"),
        gained_count=count_of("gained_count"),
        lost_count=count_of("lost_count"),
        present_count=count_of("present_count"),
        absent_count=count_of("absent_count"),
        gained=flattened("gained"),
        lost=flattened("lost"),
        present=flattened("present"),
        absent=flattened("absent"),
    )
def _parse_sequence(self, parent):
    """Parse a molecular sequence (PRIVATE).

    Consumes events from ``self.context`` until the closing sequence tag
    and fills a ``PX.Sequence`` created from ``parent``'s attributes.
    """
    sequence = PX.Sequence(**parent.attrib)
    converted_tags = ("accession", "mol_seq", "uri", "domain_architecture")
    raw_text_tags = ("symbol", "location")
    for event, elem in self.context:
        namespace, tag = _split_namespace(elem.tag)
        # Children are only handled once they are complete.
        if event != "end":
            continue
        if tag == "sequence":
            parent.clear()
            break
        if tag in converted_tags:
            # Converted by the parser method of the same name.
            converter = getattr(self, tag)
            setattr(sequence, tag, converter(elem))
        elif tag == "annotation":
            sequence.annotations.append(self.annotation(elem))
        elif tag == "name":
            sequence.name = _collapse_wspace(elem.text)
        elif tag in raw_text_tags:
            setattr(sequence, tag, elem.text)
        elif namespace != NAMESPACES["phy"]:
            # Elements from outside the phyloXML namespace
            sequence.other.append(self.other(elem, namespace, tag))
            parent.clear()
    return sequence
def _parse_taxonomy(self, parent):
    """Parse the children of a taxonomy element into a PX.Taxonomy.

    Consumes events from ``self.context`` until the closing taxonomy tag.
    """
    taxonomy = PX.Taxonomy(**parent.attrib)
    for event, elem in self.context:
        namespace, tag = _split_namespace(elem.tag)
        # Children are only handled once they are complete.
        if event != 'end':
            continue
        if tag == 'taxonomy':
            parent.clear()
            break
        if tag in ('id', 'uri'):
            # Converted by the parser method of the same name.
            converter = getattr(self, tag)
            setattr(taxonomy, tag, converter(elem))
        elif tag == 'common_name':
            taxonomy.common_names.append(_collapse_wspace(elem.text))
        elif tag == 'synonym':
            taxonomy.synonyms.append(elem.text)
        elif tag in ('code', 'scientific_name', 'authority', 'rank'):
            # ENH: check_str on rank
            setattr(taxonomy, tag, elem.text)
        elif namespace != NAMESPACES['phy']:
            # Elements from outside the phyloXML namespace
            taxonomy.other.append(self.other(elem, namespace, tag))
            parent.clear()
    return taxonomy
def _parse_sequence(self, parent):
    """Parse the children of a sequence element into a PX.Sequence.

    Consumes events from ``self.context`` until the closing sequence tag.
    """
    sequence = PX.Sequence(**parent.attrib)
    for event, elem in self.context:
        namespace, tag = _split_namespace(elem.tag)
        if event != 'end':
            # Only completed elements are of interest here.
            continue
        if tag == 'sequence':
            parent.clear()
            break
        if tag in ('accession', 'mol_seq', 'uri', 'domain_architecture'):
            parsed = getattr(self, tag)(elem)
            setattr(sequence, tag, parsed)
        elif tag == 'annotation':
            parsed = self.annotation(elem)
            sequence.annotations.append(parsed)
        elif tag == 'name':
            sequence.name = _collapse_wspace(elem.text)
        elif tag in ('symbol', 'location'):
            # Stored as raw element text.
            setattr(sequence, tag, elem.text)
        elif namespace != NAMESPACES['phy']:
            foreign = self.other(elem, namespace, tag)
            sequence.other.append(foreign)
            parent.clear()
    return sequence
def read(self):
    """Parse the phyloXML file and create a single Phyloxml object."""
    root_attributes = {}
    for key, val in self.root.items():
        root_attributes[_local(key)] = val
    phyloxml = PX.Phyloxml(root_attributes)
    # Nesting level inside elements that are not in the phyloXML namespace
    foreign_depth = 0
    for event, elem in self.context:
        namespace, localtag = _split_namespace(elem.tag)
        if event == "start":
            if namespace != NAMESPACES["phy"]:
                foreign_depth += 1
            elif localtag == "phylogeny":
                parsed = self._parse_phylogeny(elem)
                phyloxml.phylogenies.append(parsed)
            continue
        if event != "end" or namespace == NAMESPACES["phy"]:
            continue
        # Deal with items not specified by phyloXML
        foreign_depth -= 1
        if foreign_depth == 0:
            # We're directly under the root node -- evaluate
            phyloxml.other.append(self.other(elem, namespace, localtag))
            self.root.clear()
    return phyloxml
def _parse_taxonomy(self, parent):
    """Parse taxonomic information for a clade (PRIVATE).

    Consumes events from ``self.context`` until the closing taxonomy tag.
    """
    taxonomy = PX.Taxonomy(**parent.attrib)
    converted_tags = ("id", "uri")
    raw_text_tags = ("code", "scientific_name", "authority", "rank")
    for event, elem in self.context:
        namespace, tag = _split_namespace(elem.tag)
        if event != "end":
            # Only completed elements are of interest here.
            continue
        if tag == "taxonomy":
            parent.clear()
            break
        if tag in converted_tags:
            parsed = getattr(self, tag)(elem)
            setattr(taxonomy, tag, parsed)
        elif tag == "common_name":
            common_name = _collapse_wspace(elem.text)
            taxonomy.common_names.append(common_name)
        elif tag == "synonym":
            taxonomy.synonyms.append(elem.text)
        elif tag in raw_text_tags:
            # ENH: check_str on rank
            setattr(taxonomy, tag, elem.text)
        elif namespace != NAMESPACES["phy"]:
            foreign = self.other(elem, namespace, tag)
            taxonomy.other.append(foreign)
            parent.clear()
    return taxonomy
def write(obj, file, encoding=DEFAULT_ENCODING, indent=True):
    """Write a phyloXML file.

    :Parameters:
        obj
            an instance of ``Phyloxml``, ``Phylogeny`` or ``BaseTree.Tree``,
            or an iterable of either of the latter two. The object will be
            converted to a Phyloxml object before serialization.
        file
            either an open handle or a file name.
        encoding, indent
            passed through unchanged to ``Writer.write``.

    Returns whatever ``Writer.write`` returns. Raises ValueError when
    ``obj`` (or an item of the iterable) is of an unsupported type.
    """
    def as_phylogeny(tree):
        # The subclass checks come first so that an object which is already
        # a PhyloXML type is not rebuilt from scratch.
        if isinstance(tree, PX.Phylogeny):
            return tree
        if isinstance(tree, PX.Clade):
            return tree.to_phylogeny()
        if isinstance(tree, PX.BaseTree.Tree):
            return PX.Phylogeny.from_tree(tree)
        if not isinstance(tree, PX.BaseTree.Clade):
            raise ValueError("iterable must contain Tree or Clade types")
        return PX.Phylogeny.from_tree(PX.BaseTree.Tree(root=tree))

    if not isinstance(obj, PX.Phyloxml):
        if isinstance(obj, (PX.BaseTree.Tree, PX.BaseTree.Clade)):
            obj = as_phylogeny(obj).to_phyloxml()
        elif hasattr(obj, "__iter__"):
            # Items are converted lazily, as the generator is consumed.
            obj = PX.Phyloxml(
                {}, phylogenies=(as_phylogeny(item) for item in obj))
        else:
            raise ValueError(
                "First argument must be a Phyloxml, Phylogeny, "
                "Tree, or iterable of Trees or Phylogenies."
            )
    return Writer(obj).write(file, encoding=encoding, indent=indent)
def _parse_clade(self, parent):
    """Parse a Clade node and its children, recursively.

    Consumes events from ``self.context`` until the closing clade tag.
    Nested clade, taxonomy, sequence and reconciliation-event elements
    are delegated to their own parsers as soon as they open; the other
    tracked children are handled when they close.
    """
    clade = BPrecPhyloXML.Clade(**parent.attrib)
    if clade.branch_length is not None:
        clade.branch_length = float(clade.branch_length)
    # NB: Only evaluate nodes at the current level
    open_tags = []
    for event, elem in self.context:
        namespace, tag = PhyloXMLIO._split_namespace(elem.tag)
        if event == 'start':
            if tag == 'clade':
                clade.clades.append(self._parse_clade(elem))
            elif tag == 'taxonomy':
                clade.taxonomies.append(self._parse_taxonomy(elem))
            elif tag == 'sequence':
                clade.sequences.append(self._parse_sequence(elem))
            elif tag == EVENTSRECTAG:
                # list of reconciliation events
                clade.eventsRec = self._parse_eventsRec(elem)
            elif tag in self._clade_tracked_tags:
                open_tags.append(tag)
            continue
        if event != 'end':
            continue
        if tag == 'clade':
            elem.clear()
            break
        # Skip anything that does not close the most recently opened
        # tracked tag.
        if tag != open_tags[-1]:
            continue
        open_tags.pop()
        # Handle the other non-recursive children
        if tag in self._clade_list_types:
            target_list = getattr(clade, self._clade_list_types[tag])
            target_list.append(getattr(self, tag)(elem))
        elif tag in self._clade_complex_types:
            setattr(clade, tag, getattr(self, tag)(elem))
        elif tag == 'branch_length':
            # NB: possible collision with the attribute
            if clade.branch_length is not None:
                raise PhyloXMLIO.PhyloXMLError(
                    'Attribute branch_length was already set '
                    'for this Clade.')
            clade.branch_length = PhyloXMLIO._float(elem.text)
        elif tag == 'width':
            clade.width = PhyloXMLIO._float(elem.text)
        elif tag == 'name':
            clade.name = PhyloXMLIO._collapse_wspace(elem.text)
        elif tag == 'node_id':
            provider = elem.attrib.get('provider')
            clade.node_id = PX.Id(elem.text.strip(), provider)
        elif namespace != PhyloXMLIO.NAMESPACES['phy']:
            clade.other.append(self.other(elem, namespace, tag))
            elem.clear()
        elif tag in self._clade_recPhyloXML_list_type:
            # Nothing is stored for these tags at this point.
            continue
        else:
            raise PhyloXMLIO.PhyloXMLError('Misidentified tag: ' + tag)
    return clade
def uri(self, elem):
    """Create uri object from the text and attributes of ``elem``."""
    address = elem.text.strip()
    description = _collapse_wspace(elem.get('desc'))
    uri_type = elem.get('type')
    return PX.Uri(address, desc=description, type=uri_type)