def _process_chars(self, char_group, char_block, symbol_state_map, taxon):
    """
    Resolves the symbols in ``char_group`` to states and appends one data
    cell per symbol to the row stored for ``taxon`` in ``char_block``.

    Nothing is done when ``self.exclude_chars`` is set or when
    ``char_group`` is empty. Otherwise the group is first split by
    ``self._parse_nexus_multistate`` and every resulting item is resolved:

        - an item of length 1 is looked up, upper-cased, in
          ``symbol_state_map``; if it is not found but equals
          ``self.match_char`` (case-insensitively), the value of the cell
          at the same position in ``char_block[0]`` is used instead;
        - a longer item must carry an ``open_tag`` attribute and is
          resolved by ``self._get_state_for_multistate_char`` against the
          default state alphabet of ``char_block``.

    The exception produced by ``self.data_format_error`` is raised for an
    unknown single symbol, for a longer item without ``open_tag``, and for
    any item that resolves to ``None``.
    """
    if self.exclude_chars or not char_group:
        return
    items = self._parse_nexus_multistate(char_group)
    for item in items:
        if len(item) != 1:
            # Multi-symbol items are only legal as marked-up multistates.
            if not hasattr(item, "open_tag"):
                raise self.data_format_error(
                    "Multiple character state without multi-state mark-up: '%s'"
                    % item)
            state = self._get_state_for_multistate_char(
                item, char_block.default_state_alphabet)
        else:
            key = item.upper()
            try:
                state = symbol_state_map[key]
            except KeyError:
                match_char = self.match_char
                if match_char is None or key != match_char.upper():
                    raise self.data_format_error(
                        "Unrecognized (single) state encountered in '%s': '%s' is not defined in %s"
                        % ("".join(items), item, symbol_state_map.keys()))
                # Match character: copy the value found at the position
                # about to be filled, taken from the row at index 0.
                state = char_block[0][len(char_block[taxon])].value
        if state is None:
            raise self.data_format_error(
                "Unrecognized state encountered: '%s'" % item)
        char_block[taxon].append(dataobject.CharacterDataCell(value=state))
def _parse_sequence_from_line(self, current_taxon, line, line_index):
    """
    Appends one data cell to the row of ``current_taxon`` in
    ``self.char_matrix`` for every state symbol found in ``line``.

    Spaces and tabs are skipped. Every other symbol is upper-cased and
    looked up in ``self.symbol_state_map``. A symbol that is not in the map
    is dropped when ``self.ignore_invalid_chars`` is true; otherwise the
    exception built by ``self._data_parse_error`` (given ``line_index``) is
    raised.
    """
    blanks = (' ', '\t')
    for symbol in line:
        if symbol in blanks:
            continue
        try:
            state = self.symbol_state_map[symbol.upper()]
        except KeyError:
            if self.ignore_invalid_chars:
                # Invalid symbols are tolerated: skip without storing a cell.
                continue
            raise self._data_parse_error(
                "Invalid state symbol for taxon '%s': '%s'"
                % (current_taxon.label, symbol),
                line_index=line_index)
        self.char_matrix[current_taxon].append(
            dataobject.CharacterDataCell(value=state))
def _process_continuous_matrix_data(self, char_block):
    """
    Reads rows of continuous (floating-point) character values from
    ``self.stream_tokenizer`` into ``char_block``.

    Tokens are consumed until a ``;`` token or end of stream. Each leading
    token is resolved to a taxon with ``self._get_taxon``; a row is created
    for the taxon if ``char_block`` does not hold one yet. Value tokens are
    then read (with ``-`` and ``+`` not treated as punctuation), converted
    with ``float`` and appended until the row holds
    ``self.file_specified_nchar`` cells.

    Raises ``NotImplementedError`` when ``self.interleave`` is set, and the
    exception built by ``self.data_format_error`` when the stream ends
    before a row is complete.
    """
    taxon_set = char_block.taxon_set
    tokenizer = self.stream_tokenizer
    token = tokenizer.read_next_token()
    while token != ';' and not tokenizer.eof:
        taxon = self._get_taxon(taxon_set=taxon_set, label=token)
        if taxon not in char_block:
            char_block[taxon] = dataobject.CharacterDataVector(taxon=taxon)
        if self.interleave:
            raise NotImplementedError(
                "Continuous characters in NEXUS schema not yet supported")
        row = char_block[taxon]
        expected = self.file_specified_nchar
        while len(row) < expected and not tokenizer.eof:
            value_token = tokenizer.read_next_token(ignore_punctuation="-+")
            row.append(dataobject.CharacterDataCell(value=float(value_token)))
        if len(row) < expected:
            raise self.data_format_error(
                "Insufficient characters given for taxon '%s': expecting %d but only found %d ('%s')"
                % (taxon.label, expected, len(row), row.symbols_as_string()))
        # The next token is either another taxon label or the closing ';'.
        token = tokenizer.read_next_token()
def read(self, stream):
    """
    Main file parsing driver.

    Parses FASTA-style lines from ``stream`` into a new character matrix
    created on ``self.dataset`` (a new data set is created if
    ``self.dataset`` is ``None``) and returns the data set. The matrix is
    also stored as ``self.char_matrix``.

    A line starting with ``>`` begins a new sequence named by the rest of
    the line; the following non-blank lines supply its symbols. Whitespace
    inside sequence lines is skipped. When ``self.simple_rows`` is set each
    sequence is stored as a plain string of symbols; otherwise it is stored
    as a vector of data cells whose values come from the state alphabet's
    symbol-to-state map.

    If ``self.exclude_chars`` is set, nothing is read and ``self.dataset``
    is returned unchanged.

    Raises:
        ValueError: the matrix has no state alphabet.
        NotImplementedError: the matrix has several state alphabets and no
            default one.
        DataParseError: repeated sequence name, a name directly followed by
            another name, sequence data before the first name, or a symbol
            that the state alphabet does not define.
    """
    if self.exclude_chars:
        return self.dataset
    if self.dataset is None:
        self.dataset = dataobject.DataSet()
    taxon_set = self.get_default_taxon_set()
    self.char_matrix = self.dataset.new_char_matrix(
        char_matrix_type=self.char_matrix_type, taxon_set=taxon_set)
    char_matrix = self.char_matrix

    # Standard matrices with no alphabet yet get a default one of digits.
    if isinstance(char_matrix, dataobject.StandardCharacterMatrix) \
            and len(char_matrix.state_alphabets) == 0:
        char_matrix.state_alphabets.append(
            dataobject.get_state_alphabet_from_symbols("0123456789"))
        char_matrix.default_state_alphabet = char_matrix.state_alphabets[0]

    if char_matrix.default_state_alphabet is not None:
        alphabet = char_matrix.default_state_alphabet
    elif len(char_matrix.state_alphabets) == 0:
        raise ValueError("No state alphabets defined")
    elif len(char_matrix.state_alphabets) > 1:
        raise NotImplementedError(
            "Mixed state-alphabet matrices not supported")
    else:
        alphabet = char_matrix.state_alphabets[0]
    # Always store the symbol -> state mapping (not the alphabet object
    # itself), so the lookups below work whichever branch was taken.
    self.symbol_state_map = alphabet.symbol_state_map()

    legal_chars = None
    if self.simple_rows:
        legal_chars = alphabet.get_legal_symbols_as_str()

    curr_vec = None
    curr_taxon = None
    for line_index, line in enumerate(stream):
        text = line.strip()
        if not text:
            continue
        row = line_index + 1

        if text.startswith('>'):
            # In simple-rows mode the previous sequence is only stored
            # once it is known to be complete, i.e. at the next header.
            if self.simple_rows and curr_taxon and curr_vec:
                char_matrix[curr_taxon] = "".join(curr_vec)
            name = text[1:].strip()
            curr_taxon = taxon_set.require_taxon(label=name)
            if curr_taxon in char_matrix:
                raise DataParseError(
                    message="Fasta error: Repeated sequence name (%s) found"
                    % name,
                    row=row,
                    stream=stream)
            if curr_vec is not None and len(curr_vec) == 0:
                raise DataParseError(
                    message="Fasta error: Expected sequence, but found another sequence name (%s)"
                    % name,
                    row=row,
                    stream=stream)
            if self.simple_rows:
                curr_vec = []
            else:
                curr_vec = dataobject.CharacterDataVector(taxon=curr_taxon)
                char_matrix[curr_taxon] = curr_vec
            continue

        if curr_vec is None:
            raise DataParseError(
                message="Fasta error: Expecting a lines starting with > before sequences",
                row=row,
                stream=stream)

        for col_ind, c in enumerate(text):
            c = c.strip()
            if not c:
                continue
            if self.simple_rows:
                if c not in legal_chars:
                    # Previously the error was constructed but never
                    # raised, so illegal symbols were silently accepted.
                    raise DataParseError(
                        message='Unrecognized sequence symbol "%s"' % c,
                        row=row,
                        column=col_ind + 1,
                        stream=stream)
                curr_vec.append(c)
            else:
                # Only a failed lookup means "unrecognized symbol"; other
                # errors propagate unchanged instead of being relabelled.
                try:
                    state = self.symbol_state_map[c]
                except KeyError:
                    raise DataParseError(
                        message='Unrecognized sequence symbol "%s"' % c,
                        row=row,
                        column=col_ind + 1,
                        stream=stream)
                curr_vec.append(dataobject.CharacterDataCell(value=state))

    # Store the final sequence of a simple-rows parse.
    if self.simple_rows and curr_taxon and curr_vec:
        char_matrix[curr_taxon] = "".join(curr_vec)
    return self.dataset
def _ncl_characters_block_to_native(self, taxa_block, ncl_cb, ncl_nxs_reader_handle):
    """
    Builds a native character matrix from the NCL characters block
    ``ncl_cb`` and returns it.

    Returns ``None`` (after logging a warning) when ``ncl_cb`` reports a
    mixed datatype. Otherwise a matrix of the type selected by
    ``_ncl_datatype_enum_to_dendropy`` is created with ``taxa_block`` as
    its taxon set, and a table translating NCL numeric state codes to
    native states is assembled in this order: the fundamental symbols, the
    ambiguity codes (each matched to the native state with the same set of
    member states), then the gap (``-``) and missing (``?``) states. One
    row per taxon is added; the rows are left empty when
    ``self.exclude_chars`` is set.

    Raises ``ValueError`` when no native state matches an NCL ambiguity
    code.
    """
    raw_matrix = ncl_cb.GetRawDiscreteMatrixRef()
    if ncl_cb.IsMixedType():
        _LOG.warn(
            "Mixed datatype character blocks are not supported in Dendropy. Skipping..."
        )
        return None

    matrix_factory = _ncl_datatype_enum_to_dendropy(ncl_cb.GetDataType())
    datatype_mapper = ncl_cb.GetDatatypeMapperForCharRef(0)
    symbols = datatype_mapper.GetSymbols()
    state_codes_mapping = datatype_mapper.GetPythonicStateVectors()

    char_block = matrix_factory()
    char_block.taxon_set = taxa_block
    if isinstance(char_block, dataobject.StandardCharacterMatrix):
        alphabet = dataobject.get_state_alphabet_from_symbols(
            symbols=symbols, gap_symbol='-', missing_symbol='?')
        char_block.state_alphabets = [alphabet]
        char_block.default_state_alphabet = char_block.state_alphabets[0]
    symbol_state_map = char_block.default_state_alphabet.symbol_state_map()

    # Position in this list == NCL numeric state code.
    code_to_state = [symbol_state_map[symbol] for symbol in symbols]

    # Entries after the fundamental symbols, except the last two, are
    # resolved by comparing member-state sets.
    for member_codes in state_codes_mapping[len(symbols):-2]:
        wanted = set(code_to_state[code] for code in member_codes)
        for _symbol, candidate in symbol_state_map.iteritems():
            members = candidate.member_states
            if members and set(members) == wanted:
                code_to_state.append(candidate)
                break
        else:
            raise ValueError(
                "NCL datatype cannot be coerced into datatype because ambiguity code for %s is missing "
                % str(wanted))

    # The final two codes are taken to be gap and missing, in that order.
    code_to_state.append(symbol_state_map['-'])
    code_to_state.append(symbol_state_map['?'])

    assert (len(raw_matrix) == len(taxa_block))
    for row_index, taxon in enumerate(taxa_block):
        vector = dataobject.CharacterDataVector(taxon=taxon)
        raw_row = raw_matrix[row_index]
        char_block[taxon] = vector
        if self.exclude_chars:
            continue
        for code in raw_row:
            vector.append(
                dataobject.CharacterDataCell(value=code_to_state[code]))

    #dataset.characters_blocks.append(char_block)

    # Both switches are hard-wired off, so the reporting below never runs.
    supporting_exsets = False
    supporting_charset_exsets = False

    if supporting_exsets:
        excluded = ncl_cb.GetExcludedIndexSet()
        # Single-argument print: same output as the two-item print statement.
        print(" ".join((
            "Excluded chars =",
            str(nclwrapper.NxsSetReader.GetSetAsVector(excluded)))))
    if supporting_charset_exsets:
        _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
        num_assumptions_blocks = ncl_nxs_reader_handle.GetNumAssumptionsBlocks(ncl_cb)
        for block_index in xrange(num_assumptions_blocks):
            _LOG.debug("Calling MultiFormatReader.GetNumTaxaBlocks()")
            assumptions = ncl_nxs_reader_handle.GetAssumptionsBlock(
                ncl_cb, block_index)
            charset_names = assumptions.GetCharSetNames()
            print(" ".join(("CharSets have the names ", str(charset_names))))
    return char_block