def _parse_tree_statement(self, taxon_set=None):
    """
    Processes a TREE command. Assumes that the file reader is
    positioned right after the "TREE" token in a TREE command.
    Calls on the NewickStatementParser of the trees module.
    """
    tokenizer = self.stream_tokenizer
    name_token = tokenizer.read_next_token()
    if name_token == '*':
        # A leading '*' marks the default tree; the actual name follows it.
        name_token = tokenizer.read_next_token()
    tree_name = name_token
    eq_token = tokenizer.read_next_token()
    if eq_token != '=':
        raise self.data_format_error("Expecting '=' in definition of Tree '%s' but found '%s'" % (tree_name, eq_token))
    # Snapshot comments gathered so far, before the statement body is parsed.
    pending_comments = tokenizer.comments
    tree = nexustokenizer.tree_from_token_stream(
            stream_tokenizer=tokenizer,
            taxon_set=taxon_set,
            translate_dict=self.tree_translate_dict,
            encode_splits=self.encode_splits,
            rooting_interpreter=self.rooting_interpreter,
            finish_node_func=self.finish_node_func,
            extract_comment_metadata=self.extract_comment_metadata,
            store_tree_weights=self.store_tree_weights,
            preserve_underscores=self.preserve_underscores,
            suppress_internal_node_taxa=self.suppress_internal_node_taxa,
            edge_len_type=self.edge_len_type,
            case_sensitive_taxon_labels=self.case_sensitive_taxon_labels)
    tree.label = tree_name
    if pending_comments:
        tree.comments.extend(pending_comments)
    # Make sure the reader is positioned past the statement terminator.
    if tokenizer.current_token != ';':
        tokenizer.skip_to_semicolon()
    return tree
def _parse_tree_statement(self, taxon_set=None):
    """
    Processes a TREE command. Assumes that the file reader is
    positioned right after the "TREE" token in a TREE command.
    Calls on the NewickStatementParser of the trees module.
    Returns the parsed tree with its label and statement comments attached.
    """
    token = self.stream_tokenizer.read_next_token()
    if token == '*':
        # '*' flags the default tree in a TREES block; the name follows it.
        token = self.stream_tokenizer.read_next_token()
    tree_name = token
    token = self.stream_tokenizer.read_next_token()
    if token != '=':
        raise self.data_format_error(
            "Expecting '=' in definition of Tree '%s' but found '%s'" % (tree_name, token))
    # Capture comments seen so far, before the statement body consumes more.
    tree_comments = self.stream_tokenizer.comments
    tree = nexustokenizer.tree_from_token_stream(
        stream_tokenizer=self.stream_tokenizer,
        taxon_set=taxon_set,
        translate_dict=self.tree_translate_dict,
        encode_splits=self.encode_splits,
        rooting_interpreter=self.rooting_interpreter,
        finish_node_func=self.finish_node_func,
        extract_comment_metadata=self.extract_comment_metadata,
        store_tree_weights=self.store_tree_weights,
        preserve_underscores=self.preserve_underscores,
        suppress_internal_node_taxa=self.suppress_internal_node_taxa,
        edge_len_type=self.edge_len_type,
        case_sensitive_taxon_labels=self.case_sensitive_taxon_labels)
    tree.label = tree_name
    if tree_comments is not None and len(tree_comments) > 0:
        tree.comments.extend(tree_comments)
    # Advance past any trailing tokens up to the statement terminator.
    if self.stream_tokenizer.current_token != ';':
        self.stream_tokenizer.skip_to_semicolon()
    return tree
def _ncl_tree_tokens_to_native_tree(self, ncl_tb, taxa_block, tree_tokens, rooted_flag=None):
    """
    Builds and returns a native tree from a sequence of NCL tree tokens.

    `ncl_tb` is the NCL taxa-block handle the tokens refer to; `taxa_block`
    is the corresponding native taxa block (resolved from `ncl_tb` when
    None); `rooted_flag` is forwarded to the token iterator as its rooting
    state. Returns None when `tree_tokens` is empty.
    """
    if not tree_tokens:
        return None
    if taxa_block is None:
        taxa_block = self._ncl_taxa_block_to_native(ncl_tb)
    lti = ListOfTokenIterator(tree_tokens)
    lti.tree_rooted = rooted_flag
    # BUG FIX: the guard previously tested `iid` (the instance-identifier
    # string, `ncl_tb.GetInstanceIdentifierString()`) while the cache is
    # keyed by `ncl_tb` itself, so the translate dict was needlessly rebuilt
    # on every call. Key the guard consistently with the cache.
    if ncl_tb not in self.tree_translate_dicts:
        translate = {}
        self.tree_translate_dicts[ncl_tb] = translate
        for n, t in enumerate(taxa_block):
            # NEXUS translate tables are 1-based.
            translate[str(n + 1)] = t
            if self.encode_splits:
                t.clade_mask = (1 << n)
    return nexustokenizer.tree_from_token_stream(lti,
            taxon_set=taxa_block,
            translate_dict=self.tree_translate_dicts[ncl_tb],
            encode_splits=self.encode_splits,
            rooting_interpreter=self.rooting_interpreter,
            finish_node_func=self.finish_node_func)
def _ncl_tree_tokens_to_native_tree(self, ncl_tb, taxa_block, tree_tokens, rooted_flag=None):
    """
    Converts NCL tree tokens into a native tree object.

    Resolves `taxa_block` from the NCL taxa-block handle `ncl_tb` when not
    supplied, builds (once per `ncl_tb`) a 1-based label -> Taxon translate
    table, and delegates parsing to `nexustokenizer.tree_from_token_stream`.
    Returns None when there are no tokens to parse.
    """
    if not tree_tokens:
        return None
    if taxa_block is None:
        taxa_block = self._ncl_taxa_block_to_native(ncl_tb)
    token_iter = ListOfTokenIterator(tree_tokens)
    token_iter.tree_rooted = rooted_flag
    # BUG FIX: the cache-miss check previously used the instance-identifier
    # string (`iid`) as the key while entries are stored under `ncl_tb`, so
    # the guard never hit and the table was rebuilt for every tree. Use the
    # same key for the check and the store.
    if ncl_tb not in self.tree_translate_dicts:
        table = self.tree_translate_dicts[ncl_tb] = {}
        for idx, taxon in enumerate(taxa_block):
            table[str(idx + 1)] = taxon  # NEXUS translate indices are 1-based
            if self.encode_splits:
                taxon.clade_mask = (1 << idx)
    return nexustokenizer.tree_from_token_stream(token_iter,
            taxon_set=taxa_block,
            translate_dict=self.tree_translate_dicts[ncl_tb],
            encode_splits=self.encode_splits,
            rooting_interpreter=self.rooting_interpreter,
            finish_node_func=self.finish_node_func)
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like
    object `stream`.

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are
    found on a tree, there is a strong possibility that all split bitmasks
    get invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        `taxon_set`
            TaxonSet object to use when reading data.
        `as_rooted=True` (or `as_unrooted=False`)
            Unconditionally interprets all trees as rooted.
        `as_unrooted=True` (or `as_rooted=False`)
            Unconditionally interprets all trees as unrooted.
        `default_as_rooted=True` (or `default_as_unrooted=False`)
            Interprets all trees as rooted if rooting not given by `[&R]`
            or `[&U]` comments.
        `default_as_unrooted=True` (or `default_as_rooted=False`)
            Interprets all trees as unrooted if rooting not given by `[&R]`
            or `[&U]` comments.
        `edge_len_type`
            Specifies the type of the edge lengths (int or float).
        `extract_comment_metadata`
            If True, any comments that begin with '&' or '&&' associated
            with items will be processed and stored as part of the
            annotation set of the object (`annotations`). If False, this
            will be skipped. Defaults to False.
        `store_tree_weights`
            If True, process the tree weight ("[&W 1/2]") comment
            associated with each tree, if any.
        `encode_splits`
            Specifies whether or not split bitmasks will be calculated and
            attached to the edges.
        `finish_node_func`
            Is a function that will be applied to each node after it has
            been constructed.
        `case_sensitive_taxon_labels`
            If True, then taxon labels are case sensitive (different cases
            = different taxa); defaults to False.
        `allow_duplicate_taxon_labels`
            If True, then multiple identical taxon labels will be allowed.
            Defaults to False: treat multiple identical taxon labels as an
            error.
        `preserve_underscores`
            If True, unquoted underscores in labels will *not* be converted
            to spaces. Defaults to False: all underscores not protected by
            quotes will be converted to spaces.
        `suppress_internal_node_taxa`
            If False, internal node labels will be instantiated into Taxon
            objects. Defaults to True: internal node labels will *not* be
            treated as taxa.
        `hyphens_as_tokens`
            If True, hyphens will be treated as special punctuation
            characters. Defaults to False: hyphens not treated as special
            punctuation characters.
    """
    taxon_set = kwargs.pop("taxon_set", None)
    # BUG FIX: raise only when split-encoding is actually requested (truthy);
    # the previous `"encode_splits" in kwargs` test also rejected an
    # explicit, harmless `encode_splits=False`.
    if kwargs.get("encode_splits") and taxon_set is None:
        raise Exception('When encoding splits on trees, a pre-populated TaxonSet instance '
                + "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "
                + "changing as new Taxon objects are added to the set.")
    preserve_underscores = kwargs.get('preserve_underscores', False)
    hyphens_as_tokens = kwargs.get('hyphens_as_tokens', nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(stream,
            preserve_underscores=preserve_underscores,
            hyphens_as_tokens=hyphens_as_tokens,
            extract_comment_metadata=extract_comment_metadata,
            case_sensitive_taxon_labels=kwargs.get('case_sensitive_taxon_labels', False))
    while not newick_stream.eof:
        t = nexustokenizer.tree_from_token_stream(newick_stream,
                taxon_set=taxon_set,
                **kwargs)
        if t is not None:
            yield t
        else:
            # BUG FIX (PEP 479): raising StopIteration inside a generator
            # body becomes a RuntimeError on Python 3.7+; a bare return ends
            # the generator cleanly.
            return
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like
    object `stream`.

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are
    found on a tree, there is a strong possibility that all split bitmasks
    get invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        `taxon_set`
            TaxonSet object to use when reading data.
        `as_rooted=True` (or `as_unrooted=False`)
            Unconditionally interprets all trees as rooted.
        `as_unrooted=True` (or `as_rooted=False`)
            Unconditionally interprets all trees as unrooted.
        `default_as_rooted=True` (or `default_as_unrooted=False`)
            Interprets all trees as rooted if rooting not given by `[&R]`
            or `[&U]` comments.
        `default_as_unrooted=True` (or `default_as_rooted=False`)
            Interprets all trees as unrooted if rooting not given by `[&R]`
            or `[&U]` comments.
        `edge_len_type`
            Specifies the type of the edge lengths (int or float).
        `extract_comment_metadata`
            If True, any comments that begin with '&' or '&&' associated
            with items will be processed and stored as part of the
            annotation set of the object (`annotations`). If False, this
            will be skipped. Defaults to False.
        `store_tree_weights`
            If True, process the tree weight ("[&W 1/2]") comment
            associated with each tree, if any.
        `encode_splits`
            Specifies whether or not split bitmasks will be calculated and
            attached to the edges.
        `finish_node_func`
            Is a function that will be applied to each node after it has
            been constructed.
        `case_sensitive_taxon_labels`
            If True, then taxon labels are case sensitive (different cases
            = different taxa); defaults to False.
        `allow_duplicate_taxon_labels`
            If True, then multiple identical taxon labels will be allowed.
            Defaults to False: treat multiple identical taxon labels as an
            error.
        `preserve_underscores`
            If True, unquoted underscores in labels will *not* be converted
            to spaces. Defaults to False: all underscores not protected by
            quotes will be converted to spaces.
        `suppress_internal_node_taxa`
            If False, internal node labels will be instantiated into Taxon
            objects. Defaults to True: internal node labels will *not* be
            treated as taxa.
        `hyphens_as_tokens`
            If True, hyphens will be treated as special punctuation
            characters. Defaults to False: hyphens not treated as special
            punctuation characters.
    """
    taxon_set = kwargs.pop("taxon_set", None)
    # BUG FIX: only complain when split-encoding is actually requested; the
    # old `"encode_splits" in kwargs` membership test raised even for an
    # explicit `encode_splits=False`.
    if kwargs.get("encode_splits") and taxon_set is None:
        raise Exception('When encoding splits on trees, a pre-populated TaxonSet instance '
                + "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "
                + "changing as new Taxon objects are added to the set.")
    preserve_underscores = kwargs.get('preserve_underscores', False)
    hyphens_as_tokens = kwargs.get('hyphens_as_tokens', nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(
        stream,
        preserve_underscores=preserve_underscores,
        hyphens_as_tokens=hyphens_as_tokens,
        extract_comment_metadata=extract_comment_metadata,
        case_sensitive_taxon_labels=kwargs.get('case_sensitive_taxon_labels', False))
    while not newick_stream.eof:
        tree = nexustokenizer.tree_from_token_stream(newick_stream,
                taxon_set=taxon_set,
                **kwargs)
        if tree is None:
            # BUG FIX (PEP 479): `raise StopIteration()` inside a generator
            # is converted to RuntimeError on Python 3.7+; return instead.
            return
        yield tree
def tree_source_iter(stream, **kwargs):
    """
    Iterates over a NEWICK-formatted source of trees given by file-like
    object `stream`.

    Note that if `encode_splits` is True, then a `taxon_set` has to be given.
    This is because adding Taxon objects to a taxon set may invalidate split
    bitmasks. Because NEWICK tree taxa are added to a TaxonSet as they are
    found on a tree, there is a strong possibility that all split bitmasks
    get invalidated in the middle of parsing a tree. To avoid this, and, more
    importantly to avoid errors downstream in client code due to this, we
    force specification of a `taxon_set` if `encode_splits` is requested.

    The following optional keyword arguments are also recognized:

        - `taxon_set`: TaxonSet object to use when reading data
        - `as_rooted=True` (or `as_unrooted=False`): interprets trees as
          rooted
        - `as_unrooted=True` (or `as_rooted=False`): interprets trees as
          unrooted
        - `default_as_rooted=True` (or `default_as_unrooted=False`):
          interprets all trees as rooted if rooting not given by `[&R]` or
          `[&U]` comments
        - `default_as_unrooted=True` (or `default_as_rooted=False`):
          interprets all trees as unrooted if rooting not given by `[&R]`
          or `[&U]` comments
        - `edge_len_type`: specifies the type of the edge lengths (int or
          float)
        - `encode_splits`: specifies whether or not split bitmasks will be
          calculated and attached to the edges
        - `extract_comment_metadata`: if True, any 'hot comments' (i.e.,
          comments that begin with '&') or NHX comments associated with
          items will be processed and stored as a dictionary attribute of
          the object: "comment_metadata"
        - `store_tree_weights`: if True, process the tree weight
          ("[&W 1/2]") comment associated with each tree, if any
        - `finish_node_func`: is a function that will be applied to each
          node after it has been constructed
        - `case_insensitive_taxon_labels`: If False, then taxon labels are
          case sensitive (different cases = different taxa); defaults to
          True
    """
    taxon_set = kwargs.pop("taxon_set", None)
    # BUG FIX: test the requested value, not mere key presence — the old
    # `"encode_splits" in kwargs` check raised for `encode_splits=False`.
    if kwargs.get("encode_splits") and taxon_set is None:
        raise Exception(
            "When encoding splits on trees, a pre-populated TaxonSet instance "
            + "must be provided using the 'taxon_set' keyword to avoid taxon/split bitmask values "
            + "changing as new Taxon objects are added to the set."
        )
    preserve_underscores = kwargs.get("preserve_underscores", False)
    hyphens_as_tokens = kwargs.get("hyphens_as_tokens", nexustokenizer.DEFAULT_HYPHENS_AS_TOKENS)
    extract_comment_metadata = kwargs.get("extract_comment_metadata", False)
    newick_stream = nexustokenizer.NexusTokenizer(
        stream,
        preserve_underscores=preserve_underscores,
        hyphens_as_tokens=hyphens_as_tokens,
        extract_comment_metadata=extract_comment_metadata,
        case_insensitive_taxon_labels=kwargs.get("case_insensitive_taxon_labels", True),
    )
    while not newick_stream.eof:
        t = nexustokenizer.tree_from_token_stream(newick_stream, taxon_set=taxon_set, **kwargs)
        if t is not None:
            yield t
        else:
            # BUG FIX (PEP 479): raising StopIteration inside a generator
            # becomes a RuntimeError on Python 3.7+; returning ends
            # iteration cleanly.
            return