def _parse_clade(self, parent):
    """Parse a Clade node and its children, recursively (PRIVATE).

    Builds a ``PX.Clade`` from the attributes of ``parent`` and then consumes
    ``(event, elem)`` pairs from ``self.context`` until the "end" event of
    the enclosing ``clade`` element is reached.

    Nested ``clade``, ``taxonomy`` and ``sequence`` elements are handed to
    ``_parse_clade``, ``_parse_taxonomy`` and ``_parse_sequence`` on their
    "start" event.  Every other child is handled on its "end" event, but
    only if its tag was recorded on the "start" event (i.e. it is listed in
    ``self._clade_tracked_tags``); end events of other elements are skipped.

    Raises PhyloXMLError if ``branch_length`` is supplied both as an
    attribute and as a child element, or if a tracked tag in the
    ``NAMESPACES["phy"]`` namespace matches none of the handlers below.
    """
    clade = PX.Clade(**parent.attrib)
    if clade.branch_length is not None:
        clade.branch_length = float(clade.branch_length)
    # NB: Only evaluate nodes at the current level
    tag_stack = []
    for event, elem in self.context:
        namespace, tag = _split_namespace(elem.tag)
        if event == "start":
            if tag == "clade":
                clade.clades.append(self._parse_clade(elem))
                continue
            if tag == "taxonomy":
                clade.taxonomies.append(self._parse_taxonomy(elem))
                continue
            if tag == "sequence":
                clade.sequences.append(self._parse_sequence(elem))
                continue
            if tag in self._clade_tracked_tags:
                tag_stack.append(tag)
        if event == "end":
            if tag == "clade":
                elem.clear()
                break
            # Only the innermost tracked element is handled here.  The stack
            # is empty when the element that just ended was never pushed on
            # its "start" event; indexing it would raise IndexError, so such
            # elements are skipped like any other non-matching end tag.
            if not tag_stack or tag != tag_stack[-1]:
                continue
            tag_stack.pop()
            # Handle the other non-recursive children
            if tag in self._clade_list_types:
                getattr(clade, self._clade_list_types[tag]).append(
                    getattr(self, tag)(elem)
                )
            elif tag in self._clade_complex_types:
                setattr(clade, tag, getattr(self, tag)(elem))
            elif tag == "branch_length":
                # NB: possible collision with the attribute
                if clade.branch_length is not None:
                    raise PhyloXMLError(
                        "Attribute branch_length was already set for this Clade."
                    )
                clade.branch_length = _float(elem.text)
            elif tag == "width":
                clade.width = _float(elem.text)
            elif tag == "name":
                clade.name = _collapse_wspace(elem.text)
            elif tag == "node_id":
                clade.node_id = PX.Id(
                    elem.text.strip(), elem.attrib.get("provider")
                )
            elif namespace != NAMESPACES["phy"]:
                clade.other.append(self.other(elem, namespace, tag))
                elem.clear()
            else:
                raise PhyloXMLError("Misidentified tag: " + tag)
    return clade
def prepare_species_tree(FILE_TREE_IN, FILE_TREE_OUT):
    """Replace clade names in a phyloXML species tree by taxonomy records.

    Reads FILE_TREE_IN with ``PhyloXMLIO.read``, marks the first phylogeny
    as rooted and drops the ``schemaLocation`` attribute.  For every clade
    that has a name, the name (without a leading ``'INT'``, if present) is
    used as the id of a new ``PhyloXML.Taxonomy`` with provider
    ``'ncbi_taxonomy'``; the clade's name is then cleared.  The scientific
    name comes from ``find_tax_name`` and falls back to ``'(NA)'`` when that
    lookup raises KeyError.  The result is written to FILE_TREE_OUT.

    :param FILE_TREE_IN: path of the phyloXML file to read.
    :param FILE_TREE_OUT: destination handed to ``PhyloXMLIO.write``.
    """
    # Close the input handle deterministically once parsing is finished.
    with open(FILE_TREE_IN, 'r') as handle:
        treexml = PhyloXMLIO.read(handle)
    tree = treexml[0]
    treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
    tree.rooted = True
    for node in tree.clade.find_clades():
        if not node.name:
            continue
        tax_id = node.name
        if tax_id.startswith('INT'):
            tax_id = tax_id[3:]
        taxon = PhyloXML.Taxonomy(
            id=PhyloXML.Id(tax_id, provider='ncbi_taxonomy'))
        try:
            taxon.scientific_name = find_tax_name(tax_id)
        except KeyError:
            taxon.scientific_name = '(NA)'
        node._set_taxonomy(taxon)
        node.name = None
    PhyloXMLIO.write(treexml, FILE_TREE_OUT)
def _parse_clade(self, parent):
    """Parse a Clade node and its children, recursively.

    Builds a ``BPrecPhyloXML.Clade`` from the attributes of ``parent`` and
    then consumes ``(event, elem)`` pairs from ``self.context`` until the
    "end" event of the enclosing ``clade`` element is reached.

    Nested ``clade``, ``taxonomy`` and ``sequence`` elements, and the element
    named by ``EVENTSRECTAG``, are handed to their dedicated ``_parse_*``
    methods on the "start" event.  Every other child is handled on its "end"
    event, but only if its tag was recorded on the "start" event (i.e. it is
    listed in ``self._clade_tracked_tags``); end events of other elements
    are skipped.

    Raises PhyloXMLIO.PhyloXMLError if ``branch_length`` is supplied both as
    an attribute and as a child element, or if a tracked tag matches none of
    the handlers below.
    """
    clade = BPrecPhyloXML.Clade(**parent.attrib)
    if clade.branch_length is not None:
        clade.branch_length = float(clade.branch_length)
    # NB: Only evaluate nodes at the current level
    tag_stack = []
    for event, elem in self.context:
        namespace, tag = PhyloXMLIO._split_namespace(elem.tag)
        if event == 'start':
            if tag == 'clade':
                clade.clades.append(self._parse_clade(elem))
                continue
            if tag == 'taxonomy':
                clade.taxonomies.append(self._parse_taxonomy(elem))
                continue
            if tag == 'sequence':
                clade.sequences.append(self._parse_sequence(elem))
                continue
            if tag == EVENTSRECTAG:
                # list of reconciliation events
                clade.eventsRec = self._parse_eventsRec(elem)
                continue
            if tag in self._clade_tracked_tags:
                tag_stack.append(tag)
        if event == 'end':
            if tag == 'clade':
                elem.clear()
                break
            # Only the innermost tracked element is handled here.  The stack
            # is empty when the element that just ended was never pushed on
            # its "start" event; indexing it would raise IndexError, so such
            # elements are skipped like any other non-matching end tag.
            if not tag_stack or tag != tag_stack[-1]:
                continue
            tag_stack.pop()
            # Handle the other non-recursive children
            if tag in self._clade_list_types:
                getattr(clade, self._clade_list_types[tag]).append(
                    getattr(self, tag)(elem))
            elif tag in self._clade_complex_types:
                setattr(clade, tag, getattr(self, tag)(elem))
            elif tag == 'branch_length':
                # NB: possible collision with the attribute
                if clade.branch_length is not None:
                    raise PhyloXMLIO.PhyloXMLError(
                        'Attribute branch_length was already set '
                        'for this Clade.')
                clade.branch_length = PhyloXMLIO._float(elem.text)
            elif tag == 'width':
                clade.width = PhyloXMLIO._float(elem.text)
            elif tag == 'name':
                clade.name = PhyloXMLIO._collapse_wspace(elem.text)
            elif tag == 'node_id':
                clade.node_id = PX.Id(elem.text.strip(),
                                      elem.attrib.get('provider'))
            elif namespace != PhyloXMLIO.NAMESPACES['phy']:
                clade.other.append(self.other(elem, namespace, tag))
                elem.clear()
            elif tag in self._clade_recPhyloXML_list_type:
                # These tags are currently skipped: nothing is stored on the
                # clade and the element is not cleared.
                continue
            else:
                raise PhyloXMLIO.PhyloXMLError('Misidentified tag: ' + tag)
    return clade
def id(self, elem):
    """Build a ``PX.Id`` from an element's stripped text.

    The ``provider`` attribute is used when it is present and non-empty;
    otherwise the ``type`` attribute is used instead.
    """
    source = elem.get('provider')
    if not source:
        source = elem.get('type')
    identifier = elem.text.strip()
    return PX.Id(identifier, source)
def id(self, elem):
    """Create identifier object.

    The identifier value is the element's text with surrounding whitespace
    removed.  Its provider is the ``provider`` attribute, falling back to
    the ``type`` attribute when ``provider`` is absent or empty.
    """
    declared = elem.get("provider")
    origin = declared if declared else elem.get("type")
    value = elem.text.strip()
    return PX.Id(value, origin)
def reconcile_tree(gene_tree_file, reconciled_file, rec_tag, pfam_id, db):
    """Reconcile the gene tree of one family against the species tree.

    Steps, as far as this function is concerned:

    1. If ``<rec_tag>ids.pickle`` exists and ``pplacer_flag == 1``, return
       early when every gene id in ``pplacer_queries[pfam_id]`` is already
       listed under ``'existing_genes'`` in that pickle.
    2. If the pickle or ``<reconciled_file>.gz`` is missing, or
       ``pplacer_flag == 1``: unpack ``<tree_folder>/<pfam_id>.nw.gz``,
       resolve polytomies with ete2, convert the tree to phyloXML, attach a
       taxonomy record to every leaf, write ``<rec_tag>txid.xml`` and
       refresh the pickle.
    3. Run forester's ``gsdi`` on the taxid file, gzip the reconciled tree
       if one was produced and remove the intermediate files.

    :param gene_tree_file: prefix for the temporary unpacked Newick file.
    :param reconciled_file: path of the reconciled tree written by gsdi.
    :param rec_tag: prefix shared by the intermediate and bookkeeping files.
    :param pfam_id: family identifier, used for file names and messages.
    :param db: handle passed through to the taxonomy lookup helpers.
    """
    ids_pickle = rec_tag + 'ids.pickle'

    if os.path.isfile(ids_pickle) and pplacer_flag == 1:
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # acceptable while the file is produced by this function itself.
        with open(ids_pickle, 'rb') as handle:
            id_information = pickle.load(handle)
        existing_genes = id_information['existing_genes']
        new_genes = set(w['id'] for w in pplacer_queries[pfam_id])
        if not (new_genes - set(existing_genes)):
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print("All %s Genes for family %s have already been placed in the reconciled tree." % (
                len(new_genes), pfam_id))
            print("Skip Reconciliation for %s" % pfam_id)
            return

    txid_file = rec_tag + 'txid.xml'
    if (not os.path.isfile(ids_pickle)
            or not os.path.isfile(reconciled_file + '.gz')
            or pplacer_flag == 1):
        print("Running Reconciliation for: %s" % pfam_id)

        # Unpack the gene tree under a randomised name.
        rand_id = random.randint(1000000, 9999999)
        unpacked_tree = '%s.%d' % (gene_tree_file, rand_id)
        subprocess.check_call("gunzip -c %s/%s.nw.gz > %s.%d" %
                              (tree_folder, pfam_id, gene_tree_file, rand_id),
                              shell=True)
        tree = ete2.PhyloTree(unpacked_tree, format=0)
        tree.resolve_polytomy()
        tree.write(format=0, outfile=txid_file + '.tmp.nw')
        if os.path.exists(unpacked_tree):
            subprocess.check_call("rm %s.%d" % (gene_tree_file, rand_id),
                                  shell=True)

        Phylo.convert(txid_file + '.tmp.nw', 'newick',
                      txid_file + '.tmp.xml', 'phyloxml')
        with open(txid_file + '.tmp.xml', 'r') as handle:
            treexml = PhyloXMLIO.read(handle)
        tree = treexml[0]
        treexml.attributes.pop('schemaLocation', None)  # not supported by Forester
        tree.rooted = True

        my_ids = set()
        my_query_by_taxid = {}
        for leaf in tree.clade.find_clades(terminal=True):
            up_name = leaf.name.split('/')[0]
            tax_id, tax_name = find_tax_id_unip(up_name, db)
            # merged_taxid / best_taxid_map are presumably dicts, so plain
            # membership replaces the per-lookup ``.keys()`` list.
            if tax_id not in all_species_txids:
                if tax_id in merged_taxid:
                    tax_id = merged_taxid[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                if tax_id in best_taxid_map:
                    tax_id = best_taxid_map[tax_id]
                    tax_name = find_tax_name(tax_id, db)
                else:
                    tax_id0 = tax_id
                    tax_id, tax_name = find_best_taxid(tax_id, db)
                    if tax_id > 0:
                        # Remember successful mappings for later leaves.
                        best_taxid_map[tax_id0] = tax_id
            # NOTE(review): assumed to run for every leaf, not only for
            # those missing from all_species_txids -- confirm.
            if tax_id < 0:
                if (-tax_id) in merged_taxid:
                    tax_id = merged_taxid[-tax_id]
                    tax_name = find_tax_name(tax_id, db)
            my_query_by_taxid.setdefault(tax_id, []).append(up_name)
            my_ids.add(tax_id)
            my_tax_id = PhyloXML.Id(tax_id, provider='ncbi_taxonomy')
            taxon = PhyloXML.Taxonomy(id=my_tax_id)
            taxon.scientific_name = tax_name
            leaf._set_taxonomy(taxon)

        # Close (and thereby flush) the taxid file before the external
        # reconciliation program reads it.
        with open(txid_file, 'w') as handle:
            PhyloXMLIO.write(treexml, handle)
        os.system('rm ' + txid_file + '.tmp.nw')
        os.system('rm ' + txid_file + '.tmp.xml')
        print("Taxid file done for: %s" % pfam_id)

        known_ids = set(my_ids) & set(all_species_txids)
        existing_ids = list(known_ids)
        existing_genes = [
            gene
            for txid, genes in my_query_by_taxid.items()
            if txid in known_ids
            for gene in genes
        ]
        with open(ids_pickle, 'wb') as handle:
            pickle.dump(
                {
                    'pfam_id': pfam_id,
                    'existing_ids': existing_ids,
                    'existing_genes': existing_genes
                },
                handle)
        print("Pickle file done for: %s" % pfam_id)

    # Remove any stale output so that its existence afterwards shows that
    # this run produced it.
    if os.path.exists(reconciled_file):
        os.system('rm ' + reconciled_file)
    os.system(
        "java -Xmx4g -cp %s/forester_1038.jar org.forester.application.gsdi -g %s %s/ncbi_2_fixed.xml %s"
        % (lib_path, txid_file, species_tree_data_path, reconciled_file))
    if os.path.exists(reconciled_file):
        if os.path.exists(reconciled_file + '.gz'):
            subprocess.check_call("rm %s.gz" % (reconciled_file), shell=True)
        subprocess.check_call("gzip %s" % (reconciled_file), shell=True)
    os.system('rm ' + rec_tag + 'reconciled_species_tree_used.xml')
    os.system('rm ' + rec_tag + 'reconciled_gsdi_log.txt')
    os.system('rm ' + txid_file)
    print("Reconciliation file done for: %s" % pfam_id)