def read(self, stream):
    """
    Instantiates and returns a `DataSet` object based on the
    NEWICK-formatted contents read from the file-like object source
    `stream`.
    """
    # Honor the exclusion flag up front: no parsing work is performed.
    if self.exclude_trees:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxa = self.get_default_taxon_set()
    trees = self.dataset.new_tree_list(taxon_set=taxa)
    # Gather the reader configuration in one place before streaming.
    source_kwargs = dict(
        stream=stream,
        taxon_set=taxa,
        rooting_interpreter=self.rooting_interpreter,
        hyphens_as_tokens=self.hyphens_as_tokens,
        extract_comment_metadata=self.extract_comment_metadata,
        store_tree_weights=self.store_tree_weights,
        encode_splits=self.encode_splits,
        preserve_underscores=self.preserve_underscores,
        suppress_internal_node_taxa=self.suppress_internal_node_taxa,
        edge_len_type=self.edge_len_type,
        case_sensitive_taxon_labels=self.case_sensitive_taxon_labels,
    )
    # Trees were parsed against `taxa` already, so appending does not
    # need to re-map taxon references.
    for tree in tree_source_iter(**source_kwargs):
        trees.append(tree, reindex_taxa=False)
    return self.dataset
def read(self, stream):
    """
    Instantiates and returns a DataSet object based on the
    NEXUS-formatted contents given in the file-like object `stream`.
    """
    # Discard any parser state left over from a previous read.
    self.reset()
    # Lazily create the destination dataset on first use so repeated
    # reads accumulate into the same object.
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    self._prepare_to_read_from_stream(stream)
    self._parse_nexus_file()
    return self.dataset
def read(self, stream):
    """
    Parses PHYLIP-formatted data from the file-like object `stream` into
    a new character matrix in `self.dataset`, and returns the dataset.

    The first line of the source must be a description line giving the
    number of taxa and the number of characters; the remaining lines are
    dispatched to the interleaved or sequential parser depending on
    `self.interleaved`.
    """
    # Honor the exclusion flag up front: no parsing work is performed.
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type,
            taxon_set=taxon_set)
    # A standard-character matrix with no alphabet gets a default
    # numeric (0-9) alphabet.
    if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
        self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]
    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError("Mixed state-alphabet matrices not supported")
    else:
        # NOTE(review): this assigns the alphabet object itself, while
        # the first branch assigns the result of symbol_state_map() --
        # looks inconsistent; confirm downstream lookups tolerate this.
        self.symbol_state_map = self.char_matrix.state_alphabets[0]
    self.stream = stream
    lines = fileutils.get_lines(self.stream)
    if len(lines) == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)
    elif len(lines) <= 2:
        # NOTE(review): `<= 2` rejects a source of exactly 2 lines even
        # though the message says "at least 2 lines" -- verify intent.
        raise error.DataParseError("Expecting at least 2 lines in PHYLIP format data source", stream=self.stream)
    desc_line = lines[0]
    lines = lines[1:]
    # Raw string for the regex: '\s'/'\d' are invalid escape sequences
    # in a plain string literal.
    m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
    if m is None:
        raise self._data_parse_error("Invalid data description line: '%s'" % desc_line)
    self.ntax = int(m.groups()[0])
    self.nchar = int(m.groups()[1])
    if self.ntax == 0 or self.nchar == 0:
        raise error.DataSourceError("No data in source", stream=self.stream)
    if self.interleaved:
        self._parse_interleaved(lines)
    else:
        self._parse_sequential(lines)
    # Release the stream reference once parsing is complete.
    self.stream = None
    return self.dataset
def read(self, stream):
    """
    Main file parsing driver.

    Reads FASTA-formatted records from the file-like object `stream`
    into a new character matrix in `self.dataset`, and returns the
    dataset. In `simple_rows` mode sequences are stored as joined
    strings; otherwise as `CharacterDataVector` objects of
    `CharacterDataCell`s.

    Raises DataParseError on repeated sequence names, empty records,
    sequence data before any '>' header, or unrecognized symbols.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type,
            taxon_set=taxon_set)
    # A standard-character matrix with no alphabet gets a default
    # numeric (0-9) alphabet.
    if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
            and len(self.char_matrix.state_alphabets) == 0:
        self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
        self.char_matrix.default_state_alphabet = self.char_matrix.state_alphabets[0]
    if self.char_matrix.default_state_alphabet is not None:
        self.symbol_state_map = self.char_matrix.default_state_alphabet.symbol_state_map()
    elif len(self.char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(self.char_matrix.state_alphabets) > 1:
        raise NotImplementedError("Mixed state-alphabet matrices not supported")
    else:
        self.symbol_state_map = self.char_matrix.state_alphabets[0]
    curr_vec = None        # sequence being accumulated for curr_taxon
    curr_taxon = None      # taxon of the record currently being read
    if self.simple_rows:
        legal_chars = self.char_matrix.default_state_alphabet.get_legal_symbols_as_str()
    for line_index, line in enumerate(stream):
        s = line.strip()
        if not s:
            continue
        if s.startswith('>'):
            # Flush the previous record before starting a new one
            # (simple-rows sequences are only stored on flush).
            if self.simple_rows and curr_taxon and curr_vec:
                self.char_matrix[curr_taxon] = "".join(curr_vec)
            name = s[1:].strip()
            curr_taxon = taxon_set.require_taxon(label=name)
            if curr_taxon in self.char_matrix:
                raise DataParseError(message="Fasta error: Repeated sequence name (%s) found" % name, row=line_index + 1, stream=stream)
            if curr_vec is not None and len(curr_vec) == 0:
                raise DataParseError(message="Fasta error: Expected sequence, but found another sequence name (%s)" % name, row=line_index + 1, stream=stream)
            if self.simple_rows:
                curr_vec = []
            else:
                curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                self.char_matrix[curr_taxon] = curr_vec
        elif curr_vec is None:
            raise DataParseError(message="Fasta error: Expecting a lines starting with > before sequences", row=line_index + 1, stream=stream)
        else:
            if self.simple_rows:
                for col_ind, c in enumerate(s):
                    c = c.strip()
                    if not c:
                        continue
                    if c not in legal_chars:
                        # BUG FIX: the error was previously constructed
                        # but never raised, silently accepting illegal
                        # symbols.
                        raise DataParseError(message='Unrecognized sequence symbol "%s"' % c, row=line_index + 1, column=col_ind + 1, stream=stream)
                    curr_vec.append(c)
            else:
                for col_ind, c in enumerate(s):
                    c = c.strip()
                    if not c:
                        continue
                    try:
                        state = self.symbol_state_map[c]
                        curr_vec.append(dataobject.CharacterDataCell(value=state))
                    except KeyError:
                        # Narrowed from a bare except: only a missing
                        # symbol should be reported as a parse error.
                        raise DataParseError(message='Unrecognized sequence symbol "%s"' % c, row=line_index + 1, column=col_ind + 1, stream=stream)
    # Flush the final simple-rows record (there is no trailing '>' to
    # trigger it inside the loop).
    if self.simple_rows and curr_taxon and curr_vec:
        self.char_matrix[curr_taxon] = "".join(curr_vec)
    return self.dataset
def tree_source_iter(self, stream):
    """
    Iterates over a NEXUS-formatted source of trees.
    Only trees will be returned, and any and all character data will
    be skipped. The iterator will span over multiple tree blocks,
    but, because our NEXUS data model implementation currently does
    not recognize multiple taxon collection definnitions, taxa in
    those tree blocks will be aggregated into the same `TaxonSet` (a
    new one created, or the one passed to this method via the
    `taxon_set` argument). This behavior is similar to how multiple
    tree blocks are handled by a full NEXUS data file read.
    """
    # Discard any parser state left over from a previous read.
    self.reset()
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    self.stream_tokenizer = nexustokenizer.NexusTokenizer(stream,
            preserve_underscores=self.preserve_underscores,
            hyphens_as_tokens=self.hyphens_as_tokens,
            extract_comment_metadata=self.extract_comment_metadata)
    # A NEXUS source must open with the "#NEXUS" sentinel token.
    token = self.stream_tokenizer.read_next_token_ucase()
    if token.upper() != "#NEXUS":
        raise self.data_format_error("Expecting '#NEXUS', but found '%s'" % token)
    # Outer loop: walk block-by-block through the file until EOF.
    while not self.stream_tokenizer.eof:
        token = self.stream_tokenizer.read_next_token_ucase()
        # Scan forward to the next 'BEGIN' keyword.
        while token != None and token != 'BEGIN' and not self.stream_tokenizer.eof:
            token = self.stream_tokenizer.read_next_token_ucase()
        # The token after 'BEGIN' names the block type.
        token = self.stream_tokenizer.read_next_token_ucase()
        if token == 'TAXA':
            self._parse_taxa_block()
        elif token == 'TREES':
            self.stream_tokenizer.skip_to_semicolon() # move past BEGIN command
            link_title = None
            taxon_set = None
            # Translation table is per-TREES-block; clear carryover.
            self.tree_translate_dict.clear()
            # Consume statements until the block's END/ENDBLOCK.
            while not (token == 'END' or token == 'ENDBLOCK') \
                    and not self.stream_tokenizer.eof \
                    and not token==None:
                token = self.stream_tokenizer.read_next_token_ucase()
                if token == 'LINK':
                    link_title = self._parse_link_statement().get('taxa')
                if token == 'TRANSLATE':
                    # Resolve the taxon set lazily, on the first
                    # statement that needs it.
                    if not taxon_set:
                        taxon_set = self._get_taxon_set(link_title)
                        self._prepopulate_translate_dict(taxon_set)
                    self._parse_translate_statement(taxon_set)
                if token == 'TREE':
                    if not taxon_set:
                        taxon_set = self._get_taxon_set(link_title)
                        self._prepopulate_translate_dict(taxon_set)
                    tree = self._parse_tree_statement(taxon_set)
                    # Yield each tree as soon as it is parsed: no more
                    # than one tree is held in memory at a time.
                    yield tree
            self.stream_tokenizer.skip_to_semicolon() # move past END command
        else:
            # unknown block: skip statement-by-statement to its end.
            while not (token == 'END' or token == 'ENDBLOCK') \
                    and not self.stream_tokenizer.eof \
                    and not token==None:
                self.stream_tokenizer.skip_to_semicolon()
                token = self.stream_tokenizer.read_next_token_ucase()
    # Release tokenizer/parse state once the source is exhausted.
    self.reset()
def tree_source_iter(self, stream):
    """
    Generator to iterate over trees in data file.
    Primary goal is to be memory efficient, storing no more than one
    tree at a time. Speed might have to be sacrificed for this!
    """
    n, use_ncl = self._get_fp(stream)
    if not use_ncl:
        # Fall back to the pure-Python NEXUS reader, forwarding the
        # same configuration this reader was constructed with.
        pure_python_reader = nexusreader_py.NexusReader(
            encode_splits=self.encode_splits,
            rooting_interpreter=self.rooting_interpreter,
            finish_node_func=self.finish_node_func,
            allow_duplicate_taxon_labels=self.allow_duplicate_taxon_labels,
            preserve_underscores=self.preserve_underscores,
            suppress_internal_node_taxa=self.suppress_internal_node_taxa,
            taxon_set=self.attached_taxon_set,
            dataset=self.dataset)
        for tree in pure_python_reader.tree_source_iter(stream):
            yield tree
        return
    # NCL path: a producer thread streams tree tokens; this generator
    # consumes them one tree at a time via two hand-shake events.
    need_tree_event = Event()
    tree_ready_event = Event()
    die_event = Event()
    ntst = NCLTreeStreamThread(n,
            need_tree_event=need_tree_event,
            ready_event=tree_ready_event,
            die_event=die_event,
            format=self.format)
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    # if self.attached_taxon_set is not None and len(self.attached_taxon_set) == 0:
    #     self._taxa_to_fill = self.attached_taxon_set
    # else:
    #     self._taxa_to_fill = None
    # if self.attached_taxon_set is not None:
    #     self._register_taxa_context(ntst.reader, [self.attached_taxon_set])
    ncl_streamer = ntst.nts
    ntst.start()
    try:
        # Ask the producer thread for the first tree.
        need_tree_event.set()
        self.curr_tree_tokens = None
        self.curr_tree = None
        while True:
            if ntst.done:
                break
            # Block until the producer has a token list ready.
            tree_ready_event.wait()
            tree_ready_event.clear()
            ncl_taxa_block = ncl_streamer.ncl_taxa_block
            self.curr_tree_tokens = ncl_streamer.tree_tokens
            # No tokens means the producer has run out of trees.
            if self.curr_tree_tokens is None:
                break
            rooted_flag = ncl_streamer.rooted_flag
            # Hand the slot back and request the next tree BEFORE
            # converting, so production overlaps conversion.
            ncl_streamer.tree_tokens = None
            need_tree_event.set()
            self.curr_tree = self._ncl_tree_tokens_to_native_tree(ncl_taxa_block,
                    self.attached_taxon_set,
                    self.curr_tree_tokens,
                    rooted_flag=rooted_flag)
            if self.curr_tree:
                yield self.curr_tree
        # Drop references so at most one tree is retained.
        del self.curr_tree_tokens
        del self.curr_tree
    except Exception, v:
        _LOG.debug("%s" % str(v))
        # Tell the producer thread to shut down, and release it if it
        # is blocked waiting for a tree request, before propagating.
        die_event.set()
        need_tree_event.set()
        raise
def read_filepath_into_dataset(self, file_path):
    """
    Reads the file at `file_path` via the NCL MultiFormatReader and
    converts every taxa block -- together with its character blocks
    and trees blocks -- into native objects added to `self.dataset`,
    which is returned.
    """
    _LOG.debug("Creating MultiFormatReader")
    ncl_nxs_reader_handle = nclwrapper.MultiFormatReader()
    _LOG.debug("Setting MultiFormatReader's WarningOutput Level")
    ncl_nxs_reader_handle.SetWarningOutputLevel(DENDROPY_NCL_WARNING_LEVEL)
    _LOG.debug("Calling MultiFormatReader.cullIdenticalTaxaBlocks(True)")
    ncl_nxs_reader_handle.cullIdenticalTaxaBlocks(True)
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    # Only an attached-but-empty taxon set is used as the fill target;
    # a populated one is left untouched.
    if self.attached_taxon_set is not None and len(self.attached_taxon_set) == 0:
        self._taxa_to_fill = self.attached_taxon_set
    else:
        self._taxa_to_fill = None
    if self.attached_taxon_set is not None:
        self._register_taxa_context(ncl_nxs_reader_handle, [self.attached_taxon_set])
    _LOG.debug("Calling MultiFormatReader.ReadFilepath(%s, %s)" % (file_path, self.format))
    ncl_nxs_reader_handle.ReadFilepath(file_path, self.format)
    _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
    num_taxa_blocks = ncl_nxs_reader_handle.GetNumTaxaBlocks()
    for i in xrange(num_taxa_blocks):
        _LOG.debug("Calling MultiFormatReader.GetTaxaBlock(%d)" % i)
        ncl_tb = ncl_nxs_reader_handle.GetTaxaBlock(i)
        taxa_block = self._ncl_taxa_block_to_native(ncl_tb)
        self.dataset.add(taxa_block)
        #nab = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_tb)
        #for k in xrange(nab):
        #    a = ncl_nxs_reader_handle.GetAssumptionsBlock(ncl_tb, k)
        #    cs = a.GetTaxSetNames()
        #    print "TaxSets have the names " , str(cs)
        # Characters blocks associated with this taxa block.
        _LOG.debug("Calling MultiFormatReader.GetNumCharactersBlocks()")
        num_char_blocks = ncl_nxs_reader_handle.GetNumCharactersBlocks(ncl_tb)
        for j in xrange(num_char_blocks):
            _LOG.debug("Calling MultiFormatReader.GetCharactersBlock(taxablock, %d)" % j)
            ncl_cb = ncl_nxs_reader_handle.GetCharactersBlock(ncl_tb, j)
            char_block = self._ncl_characters_block_to_native(taxa_block, ncl_cb, ncl_nxs_reader_handle)
            # Conversion may decline (return a falsy value); only add
            # blocks that were actually produced.
            if char_block:
                self.dataset.add(char_block)
        # Trees blocks associated with this taxa block.
        _LOG.debug("Calling MultiFormatReader.GetNumTreesBlocks()")
        ntrb = ncl_nxs_reader_handle.GetNumTreesBlocks(ncl_tb)
        for j in xrange(ntrb):
            trees_block = dataobject.TreeList()
            trees_block.taxon_set = taxa_block
            _LOG.debug("Calling MultiFormatReader.GetTreesBlock(%d)" % j)
            ncl_trb = ncl_nxs_reader_handle.GetTreesBlock(ncl_tb, j)
            for k in xrange(ncl_trb.GetNumTrees()):
                ftd = ncl_trb.GetFullTreeDescription(k)
                tokens = ftd.GetTreeTokens()
                rooted_flag = ftd.IsRooted()
                t = self._ncl_tree_tokens_to_native_tree(ncl_tb, taxa_block, tokens, rooted_flag=rooted_flag)
                if t:
                    trees_block.append(t)
            self.dataset.add(trees_block)
    return self.dataset