Example #1
 def read(self, stream):
     """
     Instantiates and returns a `DataSet` object based on the
     NEWICK-formatted contents read from the file-like object source
     `stream`.
     """
     if self.exclude_trees:
         return self.dataset
     if self.dataset is None:
         self.dataset = dataobject.DataSet()
     taxon_set = self.get_default_taxon_set()
     tree_list = self.dataset.new_tree_list(taxon_set=taxon_set)
     for t in tree_source_iter(
             stream=stream,
             taxon_set=taxon_set,
             rooting_interpreter=self.rooting_interpreter,
             hyphens_as_tokens=self.hyphens_as_tokens,
             extract_comment_metadata=self.extract_comment_metadata,
             store_tree_weights=self.store_tree_weights,
             encode_splits=self.encode_splits,
             preserve_underscores=self.preserve_underscores,
             suppress_internal_node_taxa=self.suppress_internal_node_taxa,
             edge_len_type=self.edge_len_type,
             case_sensitive_taxon_labels=self.case_sensitive_taxon_labels):
         tree_list.append(t, reindex_taxa=False)
     return self.dataset
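A minimal usage sketch for the NEWICK read() method above, assuming the enclosing class is exposed as `NewickReader` with a no-argument constructor (the class name is an assumption, not shown in the snippet):

    # Hypothetical usage; `NewickReader` is an assumed name for the class
    # defining the read() method above.
    from cStringIO import StringIO
    reader = NewickReader()
    dataset = reader.read(StringIO("(A,(B,C));"))
    tree_list = dataset.tree_lists[0]   # the list populated by read()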
Example #2
 def read(self, stream):
     """
     Instantiates and returns a `DataSet` object based on the
     NEXUS-formatted contents given in the file-like object `stream`.
     """
     self.reset()
     if self.dataset is None:
         self.dataset = dataobject.DataSet()
     self._prepare_to_read_from_stream(stream)
     self._parse_nexus_file()
     return self.dataset
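The same lazy-initialization pattern (create `self.dataset` only when the caller has not supplied one) recurs in every reader in this section. A hedged usage sketch, assuming the enclosing class is exposed as `NexusReader`:

    # Hypothetical usage; `NexusReader` is an assumed class name.
    reader = NexusReader()
    f = open("data.nex", "rU")
    try:
        dataset = reader.read(f)
    finally:
        f.close()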
Example #3
    def read(self, stream):
        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
                and len(self.char_matrix.state_alphabets) == 0:
            self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = \
                self.char_matrix.state_alphabets[0]
        if self.char_matrix.default_state_alphabet is not None:
            self.symbol_state_map = \
                self.char_matrix.default_state_alphabet.symbol_state_map()
        elif len(self.char_matrix.state_alphabets) == 0:
            raise ValueError("No state alphabets defined")
        elif len(self.char_matrix.state_alphabets) > 1:
            raise NotImplementedError(
                "Mixed state-alphabet matrices not supported")
        else:
            # the map itself, not the alphabet, so lookups below work
            self.symbol_state_map = \
                self.char_matrix.state_alphabets[0].symbol_state_map()

        self.stream = stream
        lines = fileutils.get_lines(self.stream)
        if len(lines) == 0:
            raise error.DataSourceError("No data in source",
                                        stream=self.stream)
        elif len(lines) < 2:
            raise error.DataParseError(
                "Expecting at least 2 lines in PHYLIP format data source",
                stream=self.stream)

        desc_line = lines[0]
        lines = lines[1:]
        m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
        if m is None:
            raise self._data_parse_error(
                "Invalid data description line: '%s'" % desc_line)
        self.ntax = int(m.group(1))
        self.nchar = int(m.group(2))
        if self.ntax == 0 or self.nchar == 0:
            raise error.DataSourceError("No data in source",
                                        stream=self.stream)
        if self.interleaved:
            self._parse_interleaved(lines)
        else:
            self._parse_sequential(lines)
        self.stream = None
        return self.dataset
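The description-line parse in the middle of this method is self-contained and easy to check in isolation: a PHYLIP header is a taxon count and a character count separated by whitespace. A stand-alone illustration (the sample header string is invented):

    # Stand-alone check of the PHYLIP header parse used above.
    import re
    desc_line = "  5   20"
    m = re.match(r'\s*(\d+)\s+(\d+)\s*$', desc_line)
    assert m is not None
    ntax, nchar = int(m.group(1)), int(m.group(2))
    assert (ntax, nchar) == (5, 20)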
Example #4
    def read(self, stream):
        """
        Main file parsing driver.
        """

        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
                and len(self.char_matrix.state_alphabets) == 0:
            self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = \
                self.char_matrix.state_alphabets[0]
        if self.char_matrix.default_state_alphabet is not None:
            self.symbol_state_map = \
                self.char_matrix.default_state_alphabet.symbol_state_map()
        elif len(self.char_matrix.state_alphabets) == 0:
            raise ValueError("No state alphabets defined")
        elif len(self.char_matrix.state_alphabets) > 1:
            raise NotImplementedError(
                "Mixed state-alphabet matrices not supported")
        else:
            # the map itself, not the alphabet, so lookups below work
            self.symbol_state_map = \
                self.char_matrix.state_alphabets[0].symbol_state_map()

        curr_vec = None
        curr_taxon = None

        if self.simple_rows:
            legal_chars = \
                self.char_matrix.default_state_alphabet.get_legal_symbols_as_str()

        for line_index, line in enumerate(stream):
            s = line.strip()
            if not s:
                continue
            if s.startswith('>'):
                if self.simple_rows and curr_taxon and curr_vec:
                    self.char_matrix[curr_taxon] = "".join(curr_vec)
                name = s[1:].strip()
                curr_taxon = taxon_set.require_taxon(label=name)
                if curr_taxon in self.char_matrix:
                    raise DataParseError(
                        message="Fasta error: Repeated sequence name (%s) found" % name,
                        row=line_index + 1,
                        stream=stream)
                if curr_vec is not None and len(curr_vec) == 0:
                    raise DataParseError(
                        message="Fasta error: Expected sequence, but found another sequence name (%s)" % name,
                        row=line_index + 1,
                        stream=stream)
                if self.simple_rows:
                    curr_vec = []
                else:
                    curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                    self.char_matrix[curr_taxon] = curr_vec
            elif curr_vec is None:
                raise DataParseError(
                    message="Fasta error: Expecting a line starting with '>' before sequences",
                    row=line_index + 1,
                    stream=stream)
            else:
                if self.simple_rows:
                    for col_ind, c in enumerate(s):
                        c = c.strip()
                        if not c:
                            continue
                        if c not in legal_chars:
                            raise DataParseError(
                                message='Unrecognized sequence symbol "%s"' % c,
                                row=line_index + 1,
                                column=col_ind + 1,
                                stream=stream)
                        curr_vec.append(c)
                else:
                    for col_ind, c in enumerate(s):
                        c = c.strip()
                        if not c:
                            continue
                        try:
                            state = self.symbol_state_map[c]
                            curr_vec.append(
                                dataobject.CharacterDataCell(value=state))
                        except KeyError:
                            raise DataParseError(
                                message='Unrecognized sequence symbol "%s"' % c,
                                row=line_index + 1,
                                column=col_ind + 1,
                                stream=stream)
        if self.simple_rows and curr_taxon and curr_vec:
            self.char_matrix[curr_taxon] = "".join(curr_vec)
        return self.dataset
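A hedged usage sketch for the FASTA reader above; `FastaReader`, its `char_matrix_type` constructor argument, and `DnaCharacterMatrix` are assumptions inferred from the attributes the method references:

    # Hypothetical usage; class and constructor names are assumptions.
    from cStringIO import StringIO
    fasta = ">taxon1\nACGTACGT\n>taxon2\nACGTACGA\n"
    reader = FastaReader(char_matrix_type=dataobject.DnaCharacterMatrix)
    dataset = reader.read(StringIO(fasta))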
Example #5
 def tree_source_iter(self, stream):
     """
     Iterates over a NEXUS-formatted source of trees.
     Only trees will be returned, and any and all character data will
     be skipped. The iterator will span over multiple tree blocks,
     but, because our NEXUS data model implementation currently does
     not recognize multiple taxon collection definitions, taxa in
     those tree blocks will be aggregated into the same `TaxonSet` (a
     new one created, or the one passed to this method via the
     `taxon_set` argument). This behavior is similar to how multiple
     tree blocks are handled by a full NEXUS data file read.
     """
     self.reset()
     if self.dataset is None:
         self.dataset = dataobject.DataSet()
     self.stream_tokenizer = nexustokenizer.NexusTokenizer(
         stream,
         preserve_underscores=self.preserve_underscores,
         hyphens_as_tokens=self.hyphens_as_tokens,
         extract_comment_metadata=self.extract_comment_metadata)
     token = self.stream_tokenizer.read_next_token_ucase()
     if token != "#NEXUS":
         raise self.data_format_error(
             "Expecting '#NEXUS', but found '%s'" % token)
     while not self.stream_tokenizer.eof:
         token = self.stream_tokenizer.read_next_token_ucase()
         while token is not None and token != 'BEGIN' and not self.stream_tokenizer.eof:
             token = self.stream_tokenizer.read_next_token_ucase()
         token = self.stream_tokenizer.read_next_token_ucase()
         if token == 'TAXA':
             self._parse_taxa_block()
         elif token == 'TREES':
             self.stream_tokenizer.skip_to_semicolon()  # move past BEGIN command
             link_title = None
             taxon_set = None
             self.tree_translate_dict.clear()
             while not (token == 'END' or token == 'ENDBLOCK') \
                     and not self.stream_tokenizer.eof \
                     and token is not None:
                 token = self.stream_tokenizer.read_next_token_ucase()
                 if token == 'LINK':
                     link_title = self._parse_link_statement().get('taxa')
                 if token == 'TRANSLATE':
                     if not taxon_set:
                         taxon_set = self._get_taxon_set(link_title)
                         self._prepopulate_translate_dict(taxon_set)
                     self._parse_translate_statement(taxon_set)
                 if token == 'TREE':
                     if not taxon_set:
                         taxon_set = self._get_taxon_set(link_title)
                         self._prepopulate_translate_dict(taxon_set)
                     tree = self._parse_tree_statement(taxon_set)
                     yield tree
             self.stream_tokenizer.skip_to_semicolon()  # move past END command
         else:
             # unknown block
             while not (token == 'END' or token == 'ENDBLOCK') \
                     and not self.stream_tokenizer.eof \
                     and token is not None:
                 self.stream_tokenizer.skip_to_semicolon()
                 token = self.stream_tokenizer.read_next_token_ucase()
     self.reset()
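Because this method is a generator, callers can scan arbitrarily large NEXUS sources while holding only one tree in memory at a time. A hedged usage sketch, assuming the enclosing class is exposed as `NexusReader`:

    # Hypothetical usage; `NexusReader` is an assumed class name and
    # `process` stands in for whatever the caller does with each tree.
    reader = NexusReader()
    src = open("trees.nex", "rU")
    try:
        for tree in reader.tree_source_iter(src):
            process(tree)
    finally:
        src.close()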
Example #6
        def tree_source_iter(self, stream):
            """
            Generator to iterate over trees in data file.
            Primary goal is to be memory efficient, storing no more than one tree
            at a time. Speed might have to be sacrificed for this!
            """

            n, use_ncl = self._get_fp(stream)
            if not use_ncl:
                pure_python_reader = nexusreader_py.NexusReader(
                    encode_splits=self.encode_splits,
                    rooting_interpreter=self.rooting_interpreter,
                    finish_node_func=self.finish_node_func,
                    allow_duplicate_taxon_labels=self.allow_duplicate_taxon_labels,
                    preserve_underscores=self.preserve_underscores,
                    suppress_internal_node_taxa=self.suppress_internal_node_taxa,
                    taxon_set=self.attached_taxon_set,
                    dataset=self.dataset)
                for tree in pure_python_reader.tree_source_iter(stream):
                    yield tree
                return

            need_tree_event = Event()
            tree_ready_event = Event()
            die_event = Event()
            ntst = NCLTreeStreamThread(n,
                                       need_tree_event=need_tree_event,
                                       ready_event=tree_ready_event,
                                       die_event=die_event,
                                       format=self.format)

            if self.dataset is None:
                self.dataset = dataobject.DataSet()
#            if self.attached_taxon_set is not None and len(self.attached_taxon_set) == 0:
#                self._taxa_to_fill = self.attached_taxon_set
#            else:
#                self._taxa_to_fill = None
#            if self.attached_taxon_set is not None:
#                self._register_taxa_context(ntst.reader, [self.attached_taxon_set])

            ncl_streamer = ntst.nts
            ntst.start()
            try:
                need_tree_event.set()
                self.curr_tree_tokens = None
                self.curr_tree = None
                while True:
                    if ntst.done:
                        break
                    tree_ready_event.wait()
                    tree_ready_event.clear()
                    ncl_taxa_block = ncl_streamer.ncl_taxa_block

                    self.curr_tree_tokens = ncl_streamer.tree_tokens
                    if self.curr_tree_tokens is None:
                        break
                    rooted_flag = ncl_streamer.rooted_flag
                    ncl_streamer.tree_tokens = None
                    need_tree_event.set()
                    self.curr_tree = self._ncl_tree_tokens_to_native_tree(
                        ncl_taxa_block,
                        self.attached_taxon_set,
                        self.curr_tree_tokens,
                        rooted_flag=rooted_flag)
                    if self.curr_tree:
                        yield self.curr_tree
                del self.curr_tree_tokens
                del self.curr_tree
            except Exception as v:
                _LOG.debug("%s" % str(v))
                die_event.set()
                need_tree_event.set()
                raise
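The control flow above is a two-Event handshake between this generator and the `NCLTreeStreamThread` producer: the consumer signals `need_tree_event` to request the next tree, and the producer publishes its tokens and signals `tree_ready_event`. A minimal self-contained sketch of the same handshake pattern (all names here are invented for illustration, not the NCL API):

    # Two-Event producer/consumer handshake; `None` marks the end of input.
    from threading import Thread, Event

    need_item = Event()
    item_ready = Event()
    slot = []

    def producer():
        # Publish items one at a time, only when the consumer asks.
        for item in ["tree1", "tree2", None]:
            need_item.wait()
            need_item.clear()
            slot.append(item)
            item_ready.set()

    t = Thread(target=producer)
    t.start()
    while True:
        need_item.set()            # request the next item
        item_ready.wait()          # wait until the producer publishes it
        item_ready.clear()
        item = slot.pop()
        if item is None:
            break
    t.join()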
Example #7
        def read_filepath_into_dataset(self, file_path):

            _LOG.debug("Creating MultiFormatReader")
            ncl_nxs_reader_handle = nclwrapper.MultiFormatReader()
            _LOG.debug("Setting MultiFormatReader's WarningOutput Level")
            ncl_nxs_reader_handle.SetWarningOutputLevel(
                DENDROPY_NCL_WARNING_LEVEL)
            _LOG.debug(
                "Calling MultiFormatReader.cullIdenticalTaxaBlocks(True)")
            ncl_nxs_reader_handle.cullIdenticalTaxaBlocks(True)

            if self.dataset is None:
                self.dataset = dataobject.DataSet()

            if self.attached_taxon_set is not None \
                    and len(self.attached_taxon_set) == 0:
                self._taxa_to_fill = self.attached_taxon_set
            else:
                self._taxa_to_fill = None
            if self.attached_taxon_set is not None:
                self._register_taxa_context(ncl_nxs_reader_handle,
                                            [self.attached_taxon_set])

            _LOG.debug("Calling MultiFormatReader.ReadFilepath(%s, %s)" %
                       (file_path, self.format))
            ncl_nxs_reader_handle.ReadFilepath(file_path, self.format)

            _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
            num_taxa_blocks = ncl_nxs_reader_handle.GetNumTaxaBlocks()
            for i in xrange(num_taxa_blocks):
                _LOG.debug("Calling MultiFormatReader.GetTaxaBlock(%d)" % i)
                ncl_tb = ncl_nxs_reader_handle.GetTaxaBlock(i)
                taxa_block = self._ncl_taxa_block_to_native(ncl_tb)
                self.dataset.add(taxa_block)

                #nab = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_tb)
                #for k in xrange(nab):
                #    a = ncl_nxs_reader_handle.GetAssumptionsBlock(ncl_tb, k)
                #    cs = a.GetTaxSetNames()
                #    print "TaxSets have the names " , str(cs)

                _LOG.debug("Calling MultiFormatReader.GetNumCharactersBlocks()")
                num_char_blocks = ncl_nxs_reader_handle.GetNumCharactersBlocks(ncl_tb)
                for j in xrange(num_char_blocks):
                    _LOG.debug("Calling MultiFormatReader.GetCharactersBlock(taxablock, %d)" % j)
                    ncl_cb = ncl_nxs_reader_handle.GetCharactersBlock(ncl_tb, j)
                    char_block = self._ncl_characters_block_to_native(
                        taxa_block, ncl_cb, ncl_nxs_reader_handle)
                    if char_block:
                        self.dataset.add(char_block)
                _LOG.debug("Calling MultiFormatReader.GetNumTreesBlocks()")
                ntrb = ncl_nxs_reader_handle.GetNumTreesBlocks(ncl_tb)
                for j in xrange(ntrb):
                    trees_block = dataobject.TreeList()
                    trees_block.taxon_set = taxa_block
                    _LOG.debug("Calling MultiFormatReader.GetTreesBlock(%d)" %
                               j)
                    ncl_trb = ncl_nxs_reader_handle.GetTreesBlock(ncl_tb, j)
                    for k in xrange(ncl_trb.GetNumTrees()):
                        ftd = ncl_trb.GetFullTreeDescription(k)
                        tokens = ftd.GetTreeTokens()
                        rooted_flag = ftd.IsRooted()
                        t = self._ncl_tree_tokens_to_native_tree(
                            ncl_tb,
                            taxa_block,
                            tokens,
                            rooted_flag=rooted_flag)
                        if t:
                            trees_block.append(t)
                    self.dataset.add(trees_block)
            return self.dataset
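A hedged usage sketch; the enclosing class name (`NCLBasedReader` below) and its `format` constructor argument are assumptions inferred from the `self.format` attribute used above:

    # Hypothetical usage; names are assumptions, not the confirmed API.
    reader = NCLBasedReader(format="nexus")
    dataset = reader.read_filepath_into_dataset("data.nex")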