Esempio n. 1
0
 def __init__(self, row, column, taxon_set, max_taxa, label):
     """
     Records the offending taxon set and label on the instance, then
     initializes the DataParseError base with a message naming the label
     that could not be added, the declared number of taxa, and the labels
     of the taxa already in ``taxon_set``.

     ``row`` and ``column`` are passed through to the base class unchanged.
     """
     self.taxon_set = taxon_set
     self.label = label
     existing_labels = [("%s" % taxon.label) for taxon in taxon_set]
     template = "Cannot add '%s': Declared number of taxa (%d) already defined: %s"
     message = template % (label, max_taxa, str(existing_labels))
     DataParseError.__init__(self, message=message, row=row, column=column)
 def __init__(self, row, column, taxon_set, max_taxa, label):
     """
     Builds the "too many taxa" parse error.

     Stores ``taxon_set`` and ``label`` as attributes, formats a message
     from ``label``, ``max_taxa`` and the labels of every taxon in
     ``taxon_set``, and delegates to DataParseError with that message plus
     the given ``row`` and ``column``.
     """
     self.taxon_set = taxon_set
     self.label = label
     defined = str([("%s" % member.label) for member in taxon_set])
     details = (label, max_taxa, defined)
     message = "Cannot add '%s': Declared number of taxa (%d) already defined: %s" % details
     DataParseError.__init__(self, message=message, row=row, column=column)
Esempio n. 3
0
 def data_format_error(self, message):
     """
     Builds (but does not raise) a DataParseError carrying ``message``
     together with this object's current line and column numbers, so the
     caller can decide when to raise it.
     """
     row = self.current_line_number
     column = self.current_col_number
     return DataParseError(message=message, row=row, column=column)
Esempio n. 4
0
    def read(self, stream):
        """
        Main file parsing driver.

        Reads FASTA-formatted lines from ``stream`` into a new character
        matrix created on ``self.dataset`` and returns the dataset.

        If ``self.exclude_chars`` is set, nothing is parsed and
        ``self.dataset`` is returned as-is. Otherwise a dataset is created
        if none exists yet, a character matrix of ``self.char_matrix_type``
        is added to it, and every ``>name`` line starts a new sequence for
        the taxon with that label. Blank lines and whitespace inside
        sequence lines are ignored.

        When ``self.simple_rows`` is true each sequence is stored as a plain
        string of symbols; otherwise it is stored as a CharacterDataVector
        of CharacterDataCell objects.

        Parameters:
            stream -- iterable yielding the lines of the FASTA source.

        Raises:
            ValueError -- the matrix has no default state alphabet and no
                state alphabets at all.
            NotImplementedError -- the matrix has no default state alphabet
                and more than one state alphabet.
            DataParseError -- a sequence name is repeated, a name line
                directly follows a name line that had no sequence data,
                sequence data appears before any name line, or a sequence
                symbol is not recognized.
        """

        if self.exclude_chars:
            return self.dataset
        if self.dataset is None:
            self.dataset = dataobject.DataSet()
        taxon_set = self.get_default_taxon_set()
        self.char_matrix = self.dataset.new_char_matrix(
            char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
        if isinstance(self.char_matrix, dataobject.StandardCharacterMatrix) \
                and len(self.char_matrix.state_alphabets) == 0:
            # Standard matrices with no alphabet fall back to the digits.
            self.char_matrix.state_alphabets.append(
                dataobject.get_state_alphabet_from_symbols("0123456789"))
            self.char_matrix.default_state_alphabet = \
                self.char_matrix.state_alphabets[0]

        # Pick the single alphabet used for parsing: the default one if
        # set, otherwise the matrix's only alphabet.
        alphabet = self.char_matrix.default_state_alphabet
        if alphabet is None:
            if len(self.char_matrix.state_alphabets) == 0:
                raise ValueError("No state alphabets defined")
            if len(self.char_matrix.state_alphabets) > 1:
                raise NotImplementedError(
                    "Mixed state-alphabet matrices not supported")
            alphabet = self.char_matrix.state_alphabets[0]
        # Always store the symbol -> state mapping (not the alphabet object
        # itself), so both ways of choosing the alphabet yield the same
        # kind of lookup table.
        self.symbol_state_map = alphabet.symbol_state_map()
        symbol_state_map = self.symbol_state_map

        curr_vec = None
        curr_taxon = None

        legal_chars = None
        if self.simple_rows:
            # Use the alphabet selected above so this also works when the
            # matrix has no default alphabet.
            legal_chars = alphabet.get_legal_symbols_as_str()

        for line_index, line in enumerate(stream):
            s = line.strip()
            if not s:
                continue
            row = line_index + 1
            if s.startswith('>'):
                # In simple-rows mode the previous sequence is only
                # committed to the matrix once its last line has been seen.
                if self.simple_rows and curr_taxon and curr_vec:
                    self.char_matrix[curr_taxon] = "".join(curr_vec)
                name = s[1:].strip()
                curr_taxon = taxon_set.require_taxon(label=name)
                if curr_taxon in self.char_matrix:
                    raise DataParseError(
                        message="Fasta error: Repeated sequence name (%s) found"
                        % name,
                        row=row,
                        stream=stream)
                if curr_vec is not None and len(curr_vec) == 0:
                    raise DataParseError(
                        message=
                        "Fasta error: Expected sequence, but found another sequence name (%s)"
                        % name,
                        row=row,
                        stream=stream)
                if self.simple_rows:
                    curr_vec = []
                else:
                    curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                    self.char_matrix[curr_taxon] = curr_vec
            elif curr_vec is None:
                raise DataParseError(
                    message=
                    "Fasta error: Expecting a lines starting with > before sequences",
                    row=row,
                    stream=stream)
            elif self.simple_rows:
                for col_ind, c in enumerate(s):
                    c = c.strip()
                    if not c:
                        continue
                    if c not in legal_chars:
                        # The error used to be constructed and discarded;
                        # it must be raised for the check to take effect.
                        raise DataParseError(
                            message='Unrecognized sequence symbol "%s"' %
                            c,
                            row=row,
                            column=col_ind + 1,
                            stream=stream)
                    curr_vec.append(c)
            else:
                for col_ind, c in enumerate(s):
                    c = c.strip()
                    if not c:
                        continue
                    # Only a failed symbol lookup is reported as a parse
                    # error; unrelated failures propagate unchanged.
                    try:
                        state = symbol_state_map[c]
                    except KeyError:
                        raise DataParseError(
                            message='Unrecognized sequence symbol "%s"' %
                            c,
                            row=row,
                            column=col_ind + 1,
                            stream=stream)
                    curr_vec.append(
                        dataobject.CharacterDataCell(value=state))
        # Commit the final sequence in simple-rows mode.
        if self.simple_rows and curr_taxon and curr_vec:
            self.char_matrix[curr_taxon] = "".join(curr_vec)
        return self.dataset
Esempio n. 5
0
 def _read(self,
           stream,
           taxon_namespace_factory=None,
           tree_list_factory=None,
           char_matrix_factory=None,
           state_alphabet_factory=None,
           global_annotations_target=None):
     """
     Parses FASTA-formatted lines from ``stream`` into a single character
     matrix and returns it wrapped in ``self.Product``.

     A taxon namespace is obtained from ``taxon_namespace_factory`` and a
     character matrix of ``self.data_type`` from ``char_matrix_factory``;
     ``self.default_state_alphabet`` is forwarded to the matrix factory
     only for "standard" data when it is not None. Each ``>name`` line
     selects the matrix row of the taxon with that label, and the symbols
     on the following lines are translated through the matrix's default
     state alphabet and appended to that row. Blank lines and whitespace
     within sequence lines are skipped.

     ``tree_list_factory``, ``state_alphabet_factory`` and
     ``global_annotations_target`` are accepted but not used here.

     Raises:
         TypeError -- ``self.data_type`` is None.
         DataParseError -- a sequence name is repeated, a name line
             follows a name whose row is still empty, sequence data
             precedes any name line, or a symbol is not in the state map.
     """
     taxon_namespace = taxon_namespace_factory(label=None)
     if self.data_type is None:
         raise TypeError("Data type must be specified for this schema")

     matrix_kwargs = dict(label=None, taxon_namespace=taxon_namespace)
     if self.data_type == "standard" and self.default_state_alphabet is not None:
         matrix_kwargs["default_state_alphabet"] = self.default_state_alphabet
     char_matrix = char_matrix_factory(self.data_type, **matrix_kwargs)

     symbol_lookup = char_matrix.default_state_alphabet.full_symbol_state_map
     active_seq = None
     # Line and column numbers are reported 1-based.
     for line_num, raw_line in enumerate(stream, 1):
         text = raw_line.strip()
         if not text:
             continue

         if text.startswith('>'):
             name = text[1:].strip()
             taxon = taxon_namespace.require_taxon(label=name)
             if taxon in char_matrix:
                 raise DataParseError(
                     message="FASTA error: Repeated sequence name ('{}') found".format(name),
                     line_num=line_num,
                     stream=stream)
             if active_seq is not None and len(active_seq) == 0:
                 raise DataParseError(
                     message="FASTA error: Expected sequence, but found another sequence name ('{}')".format(name),
                     line_num=line_num,
                     stream=stream)
             active_seq = char_matrix[taxon]
             continue

         if active_seq is None:
             raise DataParseError(
                 message="FASTA error: Expecting a lines starting with > before sequences",
                 line_num=line_num,
                 stream=stream)

         # Translate the whole line first so a bad symbol leaves the
         # current row untouched for this line.
         parsed = []
         for col_num, symbol in enumerate(text, 1):
             symbol = symbol.strip()
             if not symbol:
                 continue
             try:
                 state = symbol_lookup[symbol]
             except KeyError:
                 raise DataParseError(
                     message="Unrecognized sequence symbol '{}'".format(symbol),
                     line_num=line_num,
                     col_num=col_num,
                     stream=stream)
             else:
                 parsed.append(state)
         active_seq.extend(parsed)

     return self.Product(taxon_namespaces=None,
                         tree_lists=None,
                         char_matrices=[char_matrix])
 def __init__(self, *args, **kwargs):
     """
     Pass-through constructor: forwards every positional and keyword
     argument unchanged to ``DataParseError.__init__``.
     """
     DataParseError.__init__(self, *args, **kwargs)
Esempio n. 7
0
 def __init__(self, *args, **kwargs):
     """
     Pass-through constructor: forwards every positional and keyword
     argument unchanged to ``DataParseError.__init__``.
     """
     DataParseError.__init__(self, *args, **kwargs)