Esempio n. 1
0
def array2charmatrix(matrix, taxa_metadata=None):
    """Convert a 2-D array of character states into a dendropy character matrix.

    Args:
        matrix: Array exposing ``shape`` and ``matrix[row, :]`` indexing. Each
            row holds the states of one taxon.
        taxa_metadata: Optional TaxaMetadata describing the rows. When omitted,
            ``TaxaMetadata.default(matrix.shape[0])`` is used.

    Returns:
        A dendropy character matrix with one sequence per taxon in
        ``taxa_metadata``. Its class is chosen from the metadata's alphabet;
        ``StandardCharacterMatrix`` is the fallback.

    Raises:
        ValueError: If ``len(taxa_metadata)`` differs from ``matrix.shape[0]``.
    """
    n_rows = matrix.shape[0]
    if taxa_metadata is None:
        taxa_metadata = TaxaMetadata.default(n_rows)
    elif len(taxa_metadata) != n_rows:
        raise ValueError(
            f"Size of TaxaMetadata ({len(taxa_metadata)}) does not match size of matrix ({n_rows})."
        )

    alphabet = taxa_metadata.alphabet

    if alphabet is None:
        # No alphabet supplied: derive a standard alphabet from the distinct
        # values found in the matrix and store the values themselves (as
        # strings) in each sequence.
        # TODO: add support for DNACharacterMatrix and others
        alphabet = dendropy.new_standard_state_alphabet(
            symbol for symbol in np.unique(matrix))
        result = dendropy.StandardCharacterMatrix(
            default_state_alphabet=alphabet,
            taxon_namespace=taxa_metadata.taxon_namespace)
        for taxon, row in taxa_metadata.items():
            result.new_sequence(taxon, list(map(str, matrix[row, :])))
        return result

    # An alphabet is available: matrix entries are assumed to be indices into
    # it, and built-in alphabets map onto their dedicated matrix classes.
    builtin_matrix_classes = {
        dendropy.DNA_STATE_ALPHABET: dendropy.DnaCharacterMatrix,
        dendropy.RNA_STATE_ALPHABET: dendropy.RnaCharacterMatrix,
        dendropy.NUCLEOTIDE_STATE_ALPHABET: dendropy.NucleotideCharacterMatrix,
        dendropy.PROTEIN_STATE_ALPHABET: dendropy.ProteinCharacterMatrix,
        dendropy.BINARY_STATE_ALPHABET: dendropy.RestrictionSitesCharacterMatrix,
    }
    matrix_class = builtin_matrix_classes.get(alphabet)
    if matrix_class is None:
        result = dendropy.StandardCharacterMatrix(
            default_state_alphabet=alphabet,
            taxon_namespace=taxa_metadata.taxon_namespace)
    else:
        result = matrix_class(taxon_namespace=taxa_metadata.taxon_namespace)

    for taxon, row in taxa_metadata.items():
        # .item() turns numpy.int64 into a plain int; dendropy expects only int.
        states = [alphabet[cell.item()] for cell in matrix[row, :]]
        result.new_sequence(taxon, states)

    return result
 def testFromStandardCharMatrix(self):
     """Copy-construct a StandardCharacterMatrix from ``self.char_matrix1``.

     The copy is handed to ``assertDistinctButEqual`` together with the
     original, requesting distinct state alphabets and shared taxa.
     """
     clone = dendropy.StandardCharacterMatrix(self.char_matrix1)
     expectations = {
         "char_type": dendropy.StandardCharacterMatrix,
         "distinct_state_alphabets": True,
         "distinct_taxa": False,
     }
     self.assertDistinctButEqual(self.char_matrix1, clone, **expectations)
 def testIndexedRead(self):
     """Read the matrix at offset 1 from the NEXUS file and compare it to the reference.

     The file at ``self.data_path`` is parsed with ``matrix_offset=1`` into a
     fresh StandardCharacterMatrix, which is then compared against
     ``self.reference_dataset.char_matrices[1]``.

     NOTE(review): the ``"rU"`` mode is kept unchanged for compatibility with
     the interpreter this test was written for. It was removed in
     Python 3.11; switch to ``"r"`` once older interpreters are not needed.
     """
     c = dendropy.StandardCharacterMatrix()
     # Context manager guarantees the handle is closed even if parsing raises;
     # previously the file object was created inline and never closed.
     with open(self.data_path, "rU") as src:
         c.read_from_stream(src,
                            "nexus",
                            matrix_offset=1)
     self.assertDistinctButEqual(self.reference_dataset.char_matrices[1],
                                 c,
                                 char_type=dendropy.StandardCharacterMatrix,
                                 distinct_state_alphabets=None,
                                 distinct_taxa=True)
Esempio n. 4
0
 def write_summary_stats(
     self,
     dest=None,
     results_store=None,
     is_write_header=True,
 ):
     """Compute joint-SFS summary statistics for every lineage pair and write one row.

     Supplemental labels (if any) are copied into the result first. Then, for
     each lineage pair in ``self.model.lineage_pairs``, the alignments of its
     loci are read as FASTA "standard" data and passed to
     ``self._process_sequences``, which fills in the result fields. When
     ``self.is_concatenate_loci`` is set, all loci of a pair are concatenated
     into one matrix and processed once; otherwise each locus is processed
     on its own.

     Args:
         dest: File-like object with a ``write`` method. When ``None``,
             nothing is written and the results are only returned.
         results_store: Accepted for interface compatibility; not used by
             this method.
         is_write_header: When true, a header row of field names is written
             before the value row.

     Returns:
         collections.OrderedDict mapping field name to value.

     Raises:
         ValueError: When concatenating loci whose per-deme sample counts
             differ across loci of the same lineage pair.
     """
     results_d = collections.OrderedDict()
     if self.supplemental_labels:
         for key in self.supplemental_labels:
             results_d[key] = self.supplemental_labels[key]
     for lineage_pair in self.model.lineage_pairs:
         if self.is_concatenate_loci:
             if self.concatenated_locus_label:
                 concatenated_locus_label = self.concatenated_locus_label
             else:
                 # NOTE(review): this resolves the bare name `model`, not
                 # `self.model` -- confirm that a module of that name is
                 # imported at file level.
                 concatenated_locus_label = model.compose_concatenated_locus_label(
                     lineage_pair)
             field_name_prefix = "{}.{}.{}.joint.sfs".format(
                 self.stat_label_prefix,
                 lineage_pair.label,
                 concatenated_locus_label,
             )
             num_genes_deme0 = None
             num_genes_deme1 = None
             nsites = 0
             master_data = dendropy.StandardCharacterMatrix(
                 default_state_alphabet=self.default_state_alphabet)
             for locus_definition in lineage_pair.locus_definitions:
                 if num_genes_deme0 is None:
                     # First locus fixes the expected sample counts.
                     num_genes_deme0 = locus_definition.num_genes_deme0
                     num_genes_deme1 = locus_definition.num_genes_deme1
                 elif (num_genes_deme0 != locus_definition.num_genes_deme0
                       or num_genes_deme1 != locus_definition.num_genes_deme1):
                     # Both demes must match; the second comparison used to
                     # repeat the deme-0 check, so deme-1 mismatches slipped
                     # through.
                     raise ValueError(
                         "Cannot concatenate loci if number of samples per deme vary across loci"
                     )
                 data = self.read_data(
                     filepath=locus_definition.alignment_filepath,
                     datatype="standard",
                     schema="fasta",
                     taxon_namespace=master_data.taxon_namespace)
                 nsites += locus_definition.num_sites
                 master_data.extend_sequences(data,
                                              is_add_new_sequences=True)
             sequences = master_data.sequences()
             self._process_sequences(
                 results_d,
                 field_name_prefix,
                 sequences=sequences,
                 num_genes_deme0=num_genes_deme0,
                 num_genes_deme1=num_genes_deme1,
                 nsites=nsites,
             )
         else:
             for locus_definition in lineage_pair.locus_definitions:
                 field_name_prefix = "{}.{}.{}.joint.sfs".format(
                     self.stat_label_prefix, lineage_pair.label,
                     locus_definition.locus_label)
                 data = self.read_data(
                     filepath=locus_definition.alignment_filepath,
                     datatype="standard",
                     schema="fasta")
                 sequences = data.sequences()
                 self._process_sequences(
                     results_d,
                     field_name_prefix,
                     sequences=sequences,
                     num_genes_deme0=locus_definition.num_genes_deme0,
                     num_genes_deme1=locus_definition.num_genes_deme1,
                     nsites=locus_definition.num_sites,
                 )
     # `dest` defaults to None, so only write when a destination was given.
     if dest is not None:
         if is_write_header:
             dest.write(self.field_delimiter.join(results_d.keys()))
             dest.write("\n")
         dest.write(
             self.field_delimiter.join("{}".format(v)
                                       for v in results_d.values()))
         dest.write("\n")
     return results_d
Esempio n. 5
0
def derived_state_matrix(
        char_matrix,
        ancestral_sequence=None,
        derived_state_alphabet=None,
        ignore_uncertain=True,
        ):
    """
    Recode a character matrix as ancestral ('0') versus derived ('1') states.

    Every position of every sequence in ``char_matrix`` is compared with the
    same position of ``ancestral_sequence``. The result is a new
    StandardCharacterMatrix over the same taxon namespace in which '0' marks
    a state matching the ancestral one, '1' a differing state, and '?' a
    position that was skipped.

    Parameters:
        char_matrix: source character matrix.
        ancestral_sequence: reference sequence; defaults to ``char_matrix[0]``.
        derived_state_alphabet: alphabet of the result; defaults to a new
            StateAlphabet with fundamental states "01", no-data symbol "?"
            and gap symbol "-".
        ignore_uncertain: when true, gap and no-data states of the source
            alphabet (in either the ancestral or the compared sequence) are
            recoded as '?', and states are compared through
            ``fundamental_indexes_with_gaps_as_missing``; otherwise nothing is
            skipped and ``fundamental_indexes`` is used.

    e.g.

        Given:
                GGCTAATCTGA
                GCTTTTTCTGA
                GCTCTCTCTTC

        with ancestral sequence:
                GGTTAATCTGA

        the derived matrix holds:
                00100000000
                01001100000
                01011100011
    """
    if derived_state_alphabet is None:
        derived_state_alphabet = dendropy.StateAlphabet(
            fundamental_states="01",
            polymorphic_states=None,
            ambiguous_states=None,
            no_data_symbol="?",
            gap_symbol="-")
    derived_matrix = dendropy.StandardCharacterMatrix(
        taxon_namespace=char_matrix.taxon_namespace,
        default_state_alphabet=derived_state_alphabet)

    if ignore_uncertain:
        index_attr = "fundamental_indexes_with_gaps_as_missing"
        source_alphabet = char_matrix.default_state_alphabet
        skipped_states = {source_alphabet.gap_state, source_alphabet.no_data_state}
    else:
        index_attr = "fundamental_indexes"
        skipped_states = set()

    if ancestral_sequence is None:
        ancestral_sequence = char_matrix[0]

    # Per-position reference ids; None marks a position whose ancestral state
    # is skipped, so every taxon gets '?' there.
    reference_ids = [
        None if state in skipped_states else getattr(state, index_attr)
        for state in ancestral_sequence
    ]

    for taxon in char_matrix:
        for pos, state in enumerate(char_matrix[taxon]):
            if reference_ids[pos] is None or state in skipped_states:
                symbol = "?"
            elif getattr(state, index_attr) == reference_ids[pos]:
                symbol = "0"
            else:
                symbol = "1"
            derived_matrix[taxon].append(
                derived_matrix.default_state_alphabet[symbol])
    return derived_matrix
 def testCopyConstruction(self):
     """Copy-construct the dataset's first character matrix and compare the two.

     The original and the copy are passed to
     ``assertDistinctButEqualDiscreteCharMatrix`` with ``distinct_taxa=False``.
     """
     original = self.dataset.char_matrices[0]
     duplicate = dendropy.StandardCharacterMatrix(original)
     self.assertDistinctButEqualDiscreteCharMatrix(
         original,
         duplicate,
         distinct_taxa=False)