def array2charmatrix(matrix, taxa_metadata=None):
    """Convert a 2-D array into a DendroPy character matrix.

    Each row of ``matrix`` becomes the sequence of one taxon; the row used
    for a taxon is the index that ``taxa_metadata.items()`` pairs it with.

    Args:
        matrix: Array exposing ``.shape`` and ``matrix[ix, :]`` row access.
        taxa_metadata: Taxon bookkeeping object. When omitted,
            ``TaxaMetadata.default(matrix.shape[0])`` is used.

    Returns:
        A DendroPy character matrix. If ``taxa_metadata.alphabet`` is
        ``None`` a ``StandardCharacterMatrix`` is built whose alphabet is
        derived from the unique values of ``matrix``; otherwise the matrix
        class registered for that alphabet is used, falling back to
        ``StandardCharacterMatrix`` for unregistered alphabets.

    Raises:
        ValueError: If ``taxa_metadata`` does not have one entry per row.
    """
    if taxa_metadata is None:
        taxa_metadata = TaxaMetadata.default(matrix.shape[0])
    elif len(taxa_metadata) != matrix.shape[0]:
        raise ValueError(
            f"Size of TaxaMetadata ({len(taxa_metadata)}) does not match size of matrix ({matrix.shape[0]})."
        )

    alphabet = taxa_metadata.alphabet

    if alphabet is None:
        # No alphabet supplied: the matrix values themselves are the states.
        alphabet = dendropy.new_standard_state_alphabet(
            val for val in np.unique(matrix))
        result = dendropy.StandardCharacterMatrix(
            default_state_alphabet=alphabet,
            taxon_namespace=taxa_metadata.taxon_namespace)
        for taxon, row in taxa_metadata.items():
            symbols = [str(value) for value in matrix[row, :]]
            result.new_sequence(taxon, symbols)
        return result

    # An alphabet was supplied: matrix values are indices into it.
    specialised_classes = {
        dendropy.DNA_STATE_ALPHABET: dendropy.DnaCharacterMatrix,
        dendropy.RNA_STATE_ALPHABET: dendropy.RnaCharacterMatrix,
        dendropy.NUCLEOTIDE_STATE_ALPHABET: dendropy.NucleotideCharacterMatrix,
        dendropy.PROTEIN_STATE_ALPHABET: dendropy.ProteinCharacterMatrix,
        dendropy.BINARY_STATE_ALPHABET: dendropy.RestrictionSitesCharacterMatrix,
    }
    matrix_class = specialised_classes.get(alphabet)
    if matrix_class is None:
        result = dendropy.StandardCharacterMatrix(
            default_state_alphabet=alphabet,
            taxon_namespace=taxa_metadata.taxon_namespace)
    else:
        result = matrix_class(taxon_namespace=taxa_metadata.taxon_namespace)

    for taxon, row in taxa_metadata.items():
        # .item() turns numpy integers into plain ints, which is what
        # dendropy expects when indexing an alphabet.
        states = [alphabet[value.item()] for value in matrix[row, :]]
        result.new_sequence(taxon, states)
    return result
def testFromStandardCharMatrix(self):
    # Copy-construct a StandardCharacterMatrix from the fixture matrix and
    # hand both to the comparison helper, requesting distinct state
    # alphabets but a shared set of taxa.
    original = self.char_matrix1
    duplicate = dendropy.StandardCharacterMatrix(original)
    self.assertDistinctButEqual(
        original,
        duplicate,
        char_type=dendropy.StandardCharacterMatrix,
        distinct_state_alphabets=True,
        distinct_taxa=False,
    )
def testIndexedRead(self):
    # Read the NEXUS file at self.data_path with matrix_offset=1 into a
    # fresh StandardCharacterMatrix and compare it with the second
    # character matrix of the reference dataset.
    c = dendropy.StandardCharacterMatrix()
    # The file is opened in a ``with`` block so the handle is always closed,
    # and with mode "r" because the "U" flag was removed in Python 3.11
    # (universal newlines are already the default for text mode in Python 3).
    with open(self.data_path, "r") as src:
        c.read_from_stream(src, "nexus", matrix_offset=1)
    self.assertDistinctButEqual(
        self.reference_dataset.char_matrices[1],
        c,
        char_type=dendropy.StandardCharacterMatrix,
        distinct_state_alphabets=None,
        distinct_taxa=True,
    )
def write_summary_stats(
        self,
        dest=None,
        results_store=None,
        is_write_header=True,
        ):
    """Compute joint-SFS summary statistics and write them as one row.

    For every lineage pair in ``self.model.lineage_pairs`` the alignments
    named by its locus definitions are read as FASTA "standard" data and
    passed to ``self._process_sequences``, which fills the result mapping.
    With ``self.is_concatenate_loci`` set, all loci of a pair are
    concatenated into one matrix and processed once; otherwise each locus
    is processed separately.

    Args:
        dest: Object with a ``write`` method that receives the delimited
            header (optional) and value rows. If ``None``, nothing is
            written and the results are only returned.
        results_store: Currently unused; kept for interface compatibility.
        is_write_header: Whether to write the field-name row before the
            value row.

    Returns:
        ``collections.OrderedDict`` mapping field names to values, starting
        with the entries of ``self.supplemental_labels`` (if any).

    Raises:
        ValueError: When concatenating loci whose per-deme sample counts
            differ across the loci of a lineage pair.
    """
    results_d = collections.OrderedDict()
    if self.supplemental_labels:
        for key in self.supplemental_labels:
            results_d[key] = self.supplemental_labels[key]

    for lineage_pair in self.model.lineage_pairs:
        if self.is_concatenate_loci:
            if self.concatenated_locus_label:
                concatenated_locus_label = self.concatenated_locus_label
            else:
                concatenated_locus_label = model.compose_concatenated_locus_label(
                    lineage_pair)
            field_name_prefix = "{}.{}.{}.joint.sfs".format(
                self.stat_label_prefix,
                lineage_pair.label,
                concatenated_locus_label,
            )

            # Check sample-size consistency up front, before any alignment
            # is read, so inconsistent input fails fast. Both demes are
            # compared (the second comparison used to repeat deme 0).
            locus_definitions = list(lineage_pair.locus_definitions)
            num_genes_deme0 = None
            num_genes_deme1 = None
            for locus_definition in locus_definitions:
                if num_genes_deme0 is None:
                    num_genes_deme0 = locus_definition.num_genes_deme0
                    num_genes_deme1 = locus_definition.num_genes_deme1
                elif (num_genes_deme0 != locus_definition.num_genes_deme0
                        or num_genes_deme1 != locus_definition.num_genes_deme1):
                    raise ValueError(
                        "Cannot concatenate loci if number of samples per deme vary across loci"
                    )

            nsites = 0
            master_data = dendropy.StandardCharacterMatrix(
                default_state_alphabet=self.default_state_alphabet)
            for locus_definition in locus_definitions:
                # Reading into the master namespace keeps taxa shared
                # across loci so sequences can be extended per taxon.
                data = self.read_data(
                    filepath=locus_definition.alignment_filepath,
                    datatype="standard",
                    schema="fasta",
                    taxon_namespace=master_data.taxon_namespace)
                nsites += locus_definition.num_sites
                master_data.extend_sequences(data, is_add_new_sequences=True)
            sequences = master_data.sequences()
            self._process_sequences(
                results_d,
                field_name_prefix,
                sequences=sequences,
                num_genes_deme0=num_genes_deme0,
                num_genes_deme1=num_genes_deme1,
                nsites=nsites,
            )
        else:
            for locus_definition in lineage_pair.locus_definitions:
                field_name_prefix = "{}.{}.{}.joint.sfs".format(
                    self.stat_label_prefix,
                    lineage_pair.label,
                    locus_definition.locus_label)
                data = self.read_data(
                    filepath=locus_definition.alignment_filepath,
                    datatype="standard",
                    schema="fasta")
                sequences = data.sequences()
                self._process_sequences(
                    results_d,
                    field_name_prefix,
                    sequences=sequences,
                    num_genes_deme0=locus_definition.num_genes_deme0,
                    num_genes_deme1=locus_definition.num_genes_deme1,
                    nsites=locus_definition.num_sites,
                )

    # ``dest`` defaults to None; in that case only return the results
    # instead of failing on ``None.write``.
    if dest is not None:
        if is_write_header:
            dest.write(self.field_delimiter.join(results_d.keys()))
            dest.write("\n")
        dest.write(
            self.field_delimiter.join("{}".format(v) for v in results_d.values()))
        dest.write("\n")
    return results_d
def derived_state_matrix(
        char_matrix,
        ancestral_sequence=None,
        derived_state_alphabet=None,
        ignore_uncertain=True,
        ):
    """
    Recode a character matrix as ancestral ('0') versus derived ('1') states.

    Every sequence in ``char_matrix`` is compared, column by column, with
    ``ancestral_sequence`` (by default the first sequence of the matrix,
    ``char_matrix[0]``). The result is a new ``StandardCharacterMatrix``
    sharing the taxon namespace of ``char_matrix`` in which each cell is
    '0' if the state matches the ancestral state at that column and '1'
    otherwise.

    With ``ignore_uncertain`` true, a column whose ancestral state is the
    gap or no-data state of the matrix's default alphabet, or a cell that
    holds one of those states, is recorded as '?'. States are then compared
    through their ``fundamental_indexes_with_gaps_as_missing`` attribute;
    with ``ignore_uncertain`` false, ``fundamental_indexes`` is compared
    and nothing is skipped.

    ``derived_state_alphabet`` is the alphabet of the returned matrix; it
    must provide the symbols '0', '1' and '?'. A suitable one is created
    when it is omitted.

    Illustration (assuming plain, unambiguous states). Given:

        GGCTAATCTGA
        GCTTTTTCTGA
        GCTCTCTCTTC

    with ancestral sequence:

        GGTTAATCTGA

    the rows of the returned matrix read:

        00100000000
        01001100000
        01011100011
    """
    if derived_state_alphabet is None:
        derived_state_alphabet = dendropy.StateAlphabet(
            fundamental_states="01",
            polymorphic_states=None,
            ambiguous_states=None,
            no_data_symbol="?",
            gap_symbol="-")
    derived_matrix = dendropy.StandardCharacterMatrix(
        taxon_namespace=char_matrix.taxon_namespace,
        default_state_alphabet=derived_state_alphabet)

    # Choose which state attribute is compared and which states are skipped.
    if ignore_uncertain:
        attr = "fundamental_indexes_with_gaps_as_missing"
        states_to_ignore = {
            char_matrix.default_state_alphabet.gap_state,
            char_matrix.default_state_alphabet.no_data_state,
        }
    else:
        attr = "fundamental_indexes"
        states_to_ignore = set()

    if ancestral_sequence is None:
        ancestral_sequence = char_matrix[0]

    # One entry per column: the ancestral state's comparison key, or None
    # when the ancestral state itself is to be ignored.
    ancestral_fundamental_ids = [
        None if state in states_to_ignore else getattr(state, attr)
        for state in ancestral_sequence
    ]

    for taxon in char_matrix:
        for column, observed in enumerate(char_matrix[taxon]):
            reference = ancestral_fundamental_ids[column]
            if reference is None or observed in states_to_ignore:
                symbol = "?"
            elif getattr(observed, attr) == reference:
                symbol = "0"
            else:
                symbol = "1"
            derived_matrix[taxon].append(
                derived_matrix.default_state_alphabet[symbol])
    return derived_matrix
def testCopyConstruction(self):
    # Copy-construct a StandardCharacterMatrix from the dataset's first
    # character matrix and pass both to the discrete-matrix comparison
    # helper, with the taxa expected to be shared rather than distinct.
    source = self.dataset.char_matrices[0]
    clone = dendropy.StandardCharacterMatrix(source)
    self.assertDistinctButEqualDiscreteCharMatrix(
        source,
        clone,
        distinct_taxa=False,
    )