Exemple #1
0
    def test_index(self):
        # Minimal stand-in for an object exposing ``metadata``: the dict
        # itself is returned as its own metadata mapping. (A plain class
        # attribute of the same name would be shadowed by the property, so
        # only the property is defined.)
        class MetadataHaver(dict):
            @property
            def metadata(self):
                return self

        # A non-callable key is expected to resolve to the value stored
        # under that key.
        obj = MetadataHaver({'foo': 123})
        self.assertEqual(resolve_key(obj, 'foo'), 123)

        obj = MetadataHaver({'foo': 123, 'bar': 'baz'})
        self.assertEqual(resolve_key(obj, 'bar'), 'baz')
Exemple #2
0
    def test_index(self):
        # Dict subclass whose ``metadata`` property hands back the dict
        # itself. The redundant ``metadata = {}`` class attribute was dropped
        # because the property definition replaced it immediately.
        class MetadataHaver(dict):
            @property
            def metadata(self):
                return self

        # (contents, key to resolve, expected resolved value)
        cases = (
            ({'foo': 123}, 'foo', 123),
            ({'foo': 123, 'bar': 'baz'}, 'bar', 'baz'),
        )
        for contents, key, expected in cases:
            obj = MetadataHaver(contents)
            self.assertEqual(resolve_key(obj, key), expected)
Exemple #3
0
    def from_iterable(cls, iterable, metric, key=None, keys=None):
        """Build a DistanceMatrix by applying a metric to pairs of objects.

        Parameters
        ----------
        iterable : iterable
            Objects for which pairwise distances are computed.
        metric : callable
            Two-argument function returning the distance (a float) between
            its arguments.
        key : callable or metadata key, optional
            Either a one-argument function producing the id of an element,
            or a key into the `metadata` property of each element. When
            None, default ids are used.
        keys : iterable, optional
            Ids to use, one per element of `iterable`, in the same order.

        Returns
        -------
        DistanceMatrix
            Matrix holding `metric` evaluated on the pairs of `iterable`.

        Raises
        ------
        ValueError
            If both `key` and `keys` are given.

        Notes
        -----
        `metric` is assumed to be symmetric and hollow: it is evaluated only
        for the strictly lower triangle, each result is mirrored into the
        upper triangle, and the diagonal is left at zero.

        """
        items = list(iterable)
        if not (key is None or keys is None):
            raise ValueError(
                "Cannot use both `key` and `keys` at the same time.")

        if key is not None:
            ids = [resolve_key(item, key) for item in items]
        else:
            # Caller-supplied ids, or None to request default ids.
            ids = keys

        size = len(items)
        distances = np.zeros((size, size))
        for row in range(1, size):
            for col in range(row):
                value = metric(items[row], items[col])
                distances[row, col] = value
                distances[col, row] = value

        return cls(distances, ids)
Exemple #4
0
    def from_iterable(cls, iterable, metric, key=None, keys=None):
        """Construct a DistanceMatrix from pairwise metric evaluations.

        Parameters
        ----------
        iterable : iterable
            Collection of objects to compare pairwise.
        metric : callable
            Function of two arguments that returns their distance as a
            float.
        key : callable or metadata key, optional
            One-argument function returning the id of an element, or
            alternatively a key looked up in each element's `metadata`
            property. Default ids are used if None.
        keys : iterable, optional
            Iterable as long as `iterable` whose elements are used as the
            respective ids.

        Returns
        -------
        DistanceMatrix
            Result of applying `metric` to the pairs in `iterable`.

        Raises
        ------
        ValueError
            If `key` and `keys` are supplied together.

        Notes
        -----
        Only the strictly lower triangle is computed; every value is copied
        to its mirrored position and the diagonal stays zero. `metric` is
        therefore assumed to be symmetric and hollow.

        """
        elements = list(iterable)

        if key is not None:
            if keys is not None:
                raise ValueError("Cannot use both `key` and `keys` at the "
                                 "same time.")
            labels = [resolve_key(element, key) for element in elements]
        else:
            # May be None, in which case default ids are requested.
            labels = keys

        n = len(elements)
        matrix = np.zeros((n, n))
        for i, current in enumerate(elements):
            for j in range(i):
                matrix[i, j] = matrix[j, i] = metric(current, elements[j])

        return cls(matrix, labels)
Exemple #5
0
    def from_iterable(cls, iterable, metric, key=None, keys=None):
        """Build a DissimilarityMatrix by applying a metric to all pairs.

        Parameters
        ----------
        iterable : iterable
            Objects for which pairwise dissimilarities are computed.
        metric : callable
            Two-argument function returning the dissimilarity (a float)
            between its arguments.
        key : callable or metadata key, optional
            Either a one-argument function producing the id of an element,
            or a key into the `metadata` property of each element. When
            None, default ids are used.
        keys : iterable, optional
            Ids to use, one per element of `iterable`, in the same order.

        Returns
        -------
        DissimilarityMatrix
            Matrix holding `metric` evaluated on every ordered pair of
            elements, including each element paired with itself.

        Raises
        ------
        ValueError
            If both `key` and `keys` are given.

        """
        objs = list(iterable)
        if not (key is None or keys is None):
            raise ValueError(
                "Cannot use both `key` and `keys` at the same time.")

        if key is not None:
            ids = [resolve_key(obj, key) for obj in objs]
        else:
            # Caller-supplied ids, or None to request default ids.
            ids = keys

        size = len(objs)
        # Every cell is written below, so uninitialised storage is fine.
        values = np.empty((size, size))
        for row in range(size):
            for col in range(size):
                values[row, col] = metric(objs[row], objs[col])

        return cls(values, ids)
    def from_iterable(cls, iterable, metric, key=None, keys=None):
        """Construct a DissimilarityMatrix from pairwise metric evaluations.

        Parameters
        ----------
        iterable : iterable
            Collection of objects to compare pairwise.
        metric : callable
            Function of two arguments that returns their dissimilarity as a
            float.
        key : callable or metadata key, optional
            One-argument function returning the id of an element, or
            alternatively a key looked up in each element's `metadata`
            property. Default ids are used if None.
        keys : iterable, optional
            Iterable as long as `iterable` whose elements are used as the
            respective ids.

        Returns
        -------
        DissimilarityMatrix
            Result of applying `metric` to every ordered pair of elements
            (both triangles and the diagonal are computed).

        Raises
        ------
        ValueError
            If `key` and `keys` are supplied together.

        """
        elements = list(iterable)

        if key is not None:
            if keys is not None:
                raise ValueError("Cannot use both `key` and `keys` at the "
                                 "same time.")
            labels = [resolve_key(element, key) for element in elements]
        else:
            # May be None, in which case default ids are requested.
            labels = keys

        n = len(elements)
        matrix = np.empty((n, n))
        for i, left in enumerate(elements):
            for j, right in enumerate(elements):
                matrix[i, j] = metric(left, right)

        return cls(matrix, labels)
    def from_iterable(cls, iterable, metric, key=None, keys=None,
                      validate=True):
        """Build a DistanceMatrix by applying a metric to pairs of objects.

        Parameters
        ----------
        iterable : iterable
            Objects for which pairwise distances are computed.
        metric : callable
            Two-argument function returning the distance (a float) between
            its arguments.
        key : callable or metadata key, optional
            Either a one-argument function producing the id of an element,
            or a key into the `metadata` property of each element. When
            None, default ids are used.
        keys : iterable, optional
            Ids to use, one per element of `iterable`, in the same order.
        validate : boolean, optional
            When ``True``, construction is delegated to the parent class
            implementation, which evaluates `metric` for every pair (both
            triangles and the diagonal) so that the resulting matrix can be
            validated for symmetry and hollowness. When ``False``, `metric`
            is trusted to be symmetric and hollow: only the lower triangle
            (without the diagonal) is evaluated and mirrored. Use
            ``validate=False`` for better performance when `metric` is known
            to be symmetric and hollow.

        Returns
        -------
        DistanceMatrix
            Matrix holding `metric` evaluated on pairs of `iterable`.

        Raises
        ------
        ValueError
            If both `key` and `keys` are given.

        """
        if validate:
            parent = super(DistanceMatrix, cls)
            return parent.from_iterable(iterable, metric, key, keys)

        items = list(iterable)
        if not (key is None or keys is None):
            raise ValueError(
                "Cannot use both `key` and `keys` at the same time.")

        if key is not None:
            ids = [resolve_key(item, key) for item in items]
        else:
            # Caller-supplied ids, or None to request default ids.
            ids = keys

        size = len(items)
        distances = np.zeros((size, size))
        for row in range(1, size):
            for col in range(row):
                value = metric(items[row], items[col])
                distances[row, col] = value
                distances[col, row] = value

        return cls(distances, ids)
Exemple #8
0
    def sort(self, key=None, reverse=False):
        """Stably sort the sequences of this MSA in-place.

        Parameters
        ----------
        key : callable or metadata key, optional
            Sort criterion for each sequence: either a callable taking one
            argument (the sequence) or a key into the ``metadata`` attribute
            of each sequence. When omitted, the keys already present on the
            ``TabularMSA`` are used as the sort criterion.
        reverse: bool, optional
            Sort in reverse order when ``True``.

        Raises
        ------
        OperationError
            If `key` is not provided and keys do not exist on the MSA.

        See Also
        --------
        keys
        has_keys
        reindex

        Notes
        -----
        The API mirrors Python's built-in sorting tools (e.g.,
        ``list.sort()``, ``sorted()``). See [1]_ for an excellent tutorial
        on sorting in Python.

        References
        ----------
        .. [1] https://docs.python.org/3/howto/sorting.html

        Examples
        --------
        Create a ``TabularMSA`` object without keys:

        >>> from skbio import DNA, TabularMSA
        >>> seqs = [DNA('ACG', metadata={'id': 'c'}),
        ...         DNA('AC-', metadata={'id': 'b'}),
        ...         DNA('AC-', metadata={'id': 'a'})]
        >>> msa = TabularMSA(seqs)

        Sort the sequences in alphabetical order by sequence identifier:

        >>> msa.sort(key='id')
        >>> msa == TabularMSA([DNA('AC-', metadata={'id': 'a'}),
        ...                    DNA('AC-', metadata={'id': 'b'}),
        ...                    DNA('ACG', metadata={'id': 'c'})])
        True

        Note that since the sort is in-place, the ``TabularMSA`` object is
        modified (a new object is **not** returned).

        Create a ``TabularMSA`` object with keys:

        >>> seqs = [DNA('ACG'), DNA('AC-'), DNA('AC-')]
        >>> msa = TabularMSA(seqs, keys=['c', 'b', 'a'])

        Sort the sequences using the MSA's existing keys:

        >>> msa.sort()
        >>> msa == TabularMSA([DNA('AC-'), DNA('AC-'), DNA('ACG')],
        ...                   keys=['a', 'b', 'c'])
        True

        """
        if key is not None:
            criteria = [resolve_key(seq, key) for seq in self._seqs]
        else:
            criteria = self.keys.tolist()

        # Nothing to reorder in an empty MSA.
        if len(self) == 0:
            return

        keyed = self.has_keys()
        # Parallel lists that are reordered together by the first one.
        columns = [criteria, self._seqs]
        if keyed:
            columns.append(self.keys.tolist())

        ordered = self._sort_by_first_element(columns, reverse)
        if keyed:
            _, sorted_seqs, sorted_keys = ordered
            self.keys = sorted_keys
        else:
            _, sorted_seqs = ordered
        self._seqs = list(sorted_seqs)
Exemple #9
0
    def reindex(self, key=None, keys=None):
        """Reassign keys to sequences in the MSA.

        Parameters
        ----------
        key : callable or metadata key, optional
            If provided, defines a unique, hashable key for each sequence in
            the MSA. Can either be a callable accepting a single argument (each
            sequence) or a key into each sequence's ``metadata`` attribute.
        keys : iterable, optional
            An iterable of the same length as the number of sequences in the
            MSA. `keys` must contain unique, hashable elements. Each element
            will be used as the respective key for the sequences in the MSA.

        Raises
        ------
        ValueError
            If `key` and `keys` are both provided.
        ValueError
            If `keys` is not the same length as the number of sequences in the
            MSA.
        UniqueError
            If keys are not unique.

        See Also
        --------
        keys
        has_keys

        Notes
        -----
        If `key` or `keys` are not provided, keys will not be set and certain
        operations requiring keys will raise an ``OperationError``.

        Keys are stored in a read-only, one-dimensional object array with
        exactly one element per key, so keys that are themselves sequences
        (e.g. tuples) are kept intact as single keys.

        Examples
        --------
        Create a ``TabularMSA`` object without keys:

        >>> from skbio import DNA, TabularMSA
        >>> seqs = [DNA('ACG', metadata={'id': 'a'}),
        ...         DNA('AC-', metadata={'id': 'b'})]
        >>> msa = TabularMSA(seqs)
        >>> msa.has_keys()
        False

        Set keys on the MSA, using each sequence's ID:

        >>> msa.reindex(key='id')
        >>> msa.has_keys()
        True
        >>> msa.keys
        array(['a', 'b'], dtype=object)

        Remove keys from the MSA:

        >>> msa.reindex()
        >>> msa.has_keys()
        False

        Alternatively, an iterable of keys may be passed via `keys`:

        >>> msa.reindex(keys=['a', 'b'])
        >>> msa.keys
        array(['a', 'b'], dtype=object)

        """
        if key is not None and keys is not None:
            raise ValueError(
                "Cannot use both `key` and `keys` at the same time.")

        keys_ = None
        if key is not None:
            keys_ = [resolve_key(seq, key) for seq in self._seqs]
        elif keys is not None:
            # Materialize so that one-shot iterables can be measured and
            # then reused below.
            keys = list(keys)
            if len(keys) != len(self):
                raise ValueError(
                    "Number of elements in `keys` must match number of "
                    "sequences: %d != %d" % (len(keys), len(self)))
            keys_ = keys

        if keys_ is not None:
            # Hashability of keys is implicitly checked here.
            duplicates = find_duplicates(keys_)
            if duplicates:
                raise UniqueError("Keys must be unique. Duplicate keys: %r" %
                                  duplicates)

            # Create an immutable ndarray to ensure key invariants are
            # preserved. Use object dtype to preserve original key types. This
            # is important, for example, because np.array(['a', 42]) will
            # upcast to ['a', '42'].
            #
            # The array is filled element by element rather than built with
            # np.array(keys_, dtype=object): the latter infers extra
            # dimensions from sequence-like keys, so equal-length tuple keys
            # would produce a 2-D array instead of one key per sequence.
            keys_array = np.empty(len(keys_), dtype=object)
            for position, single_key in enumerate(keys_):
                keys_array[position] = single_key
            keys_array.flags.writeable = False
            keys_ = keys_array

        self._keys = keys_
Exemple #10
0
 def test_wrong_type(self):
     # Resolving a non-callable key against a plain dict is expected to
     # raise TypeError.
     self.assertRaises(TypeError, resolve_key, {'foo': 1}, 'foo')
Exemple #11
0
    def test_callable(self):
        # With a callable key, resolve_key is expected to return whatever
        # the callable returns for the given object.
        def stringify(value):
            return str(value)

        for obj, expected in ((1, "1"), (4, "4")):
            self.assertEqual(resolve_key(obj, stringify), expected)
    def extend(self, sequences, minter=None, index=None):
        """Extend this MSA with sequences without recomputing alignment.

        Parameters
        ----------
        sequences : iterable of alphabet-aware scikit-bio sequence objects
            Sequences to be appended. Must match the dtype of the MSA and the
            number of positions in the MSA.
        minter : callable or metadata key, optional
            Used to create index labels for the sequences being appended. If
            callable, it generates a label directly. Otherwise it's treated as
            a key into the sequence metadata. Note that `minter` cannot be
            combined with `index`.
        index : pd.Index consumable, optional
            Index labels to use for the appended sequences. Must be the same
            length as `sequences`. Must be able to be passed directly to
            ``pd.Index`` constructor. Note that `index` cannot be combined
            with `minter`.

        Raises
        ------
        ValueError
            If `minter` and `index` are both provided.
        ValueError
            If neither `minter` nor `index` are provided and the MSA has a
            non-default index.
        ValueError
            If `index` is not the same length as `sequences`.
        TypeError
            If `sequences` contains a type that does not have an alphabet.
        TypeError
            If `sequences` contains a type that does not match the dtype of
            the MSA.
        ValueError
            If the length of a sequence does not match the number of positions
            in the MSA.

        See Also
        --------
        append
        reassign_index

        Notes
        -----
        If neither `minter` nor `index` are provided and this MSA has default
        index labels, the new index labels will be auto-incremented.

        The MSA is not automatically re-aligned when appending sequences.
        Therefore, this operation is not necessarily meaningful on its own.

        Examples
        --------
        >>> from skbio import DNA, TabularMSA
        >>> msa = TabularMSA([DNA('ACGT')])
        >>> msa.extend([DNA('AG-T'), DNA('-G-T')])
        >>> msa == TabularMSA([DNA('ACGT'), DNA('AG-T'), DNA('-G-T')])
        True

        Auto-incrementing index labels:

        >>> msa.index
        Int64Index([0, 1, 2], dtype='int64')
        >>> msa.extend([DNA('ACGA'), DNA('AC-T'), DNA('----')])
        >>> msa.index
        Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

        """
        if minter is not None and index is not None:
            raise ValueError(
                "Cannot use both `minter` and `index` at the same time.")

        # Materialize so that the sequences can be both counted and stored.
        sequences = list(sequences)

        if minter is None and index is None:
            # Labels can only be auto-incremented when the current index is
            # the default 0..N-1 range.
            if self.index.equals(pd.Index(np.arange(len(self)))):
                index = range(len(self), len(self) + len(sequences))
            else:
                raise ValueError(
                    "MSA does not have default index labels, must provide "
                    "a `minter` or `index` for sequence(s).")
        elif minter is not None:
            index = [resolve_key(seq, minter) for seq in sequences]

        # Cast to Index to identify tuples as a MultiIndex to match
        # pandas constructor. Just setting would make an index of tuples.
        if not isinstance(index, pd.Index):
            index = pd.Index(index)

        self._assert_valid_sequences(sequences)

        # pandas doesn't give a user-friendly error message if we pass through.
        if len(sequences) != len(index):
            raise ValueError(
                "Number of sequences (%d) must match index length (%d)" %
                (len(sequences), len(index)))

        # pd.concat is what Series.append delegated to; Series.append itself
        # was deprecated and later removed from pandas, so call concat
        # directly to work across pandas versions.
        self._seqs = pd.concat(
            [self._seqs, pd.Series(sequences, index=index)])
Exemple #13
0
    def test_callable(self):
        # Key function under test: converts the object to its string form.
        def to_text(item):
            return str(item)

        # resolve_key is expected to hand back the callable's result.
        first = resolve_key(1, to_text)
        self.assertEqual(first, "1")
        second = resolve_key(4, to_text)
        self.assertEqual(second, "4")
    def append(self, sequence, minter=None, label=None):
        """Append a sequence to the MSA without recomputing alignment.

        Parameters
        ----------
        sequence : alphabet-aware scikit-bio sequence object
            Sequence to be appended. Must match the dtype of the MSA and the
            number of positions in the MSA.
        minter : callable or metadata key, optional
            Used to create a label for the sequence being appended. If
            callable, it generates a label directly. Otherwise it's treated as
            a key into the sequence metadata. Note that `minter` cannot be
            combined with `label`.
        label : object, optional
            Index label to use for the appended sequence. Note that `label`
            cannot be combined with `minter`.

        Raises
        ------
        ValueError
            If both `minter` and `label` are provided.
        ValueError
            If neither `minter` nor `label` are provided and the MSA has a
            non-default index.
        TypeError
            If the sequence object is a type that doesn't have an alphabet.
        TypeError
            If the type of the sequence does not match the dtype of the MSA.
        ValueError
            If the length of the sequence does not match the number of
            positions in the MSA.

        See Also
        --------
        reassign_index

        Notes
        -----
        If neither `minter` nor `label` are provided and this MSA has default
        index labels, the new label will be auto-incremented.

        The MSA is not automatically re-aligned when a sequence is appended.
        Therefore, this operation is not necessarily meaningful on its own.

        Examples
        --------
        >>> from skbio import DNA, TabularMSA
        >>> msa = TabularMSA([DNA('ACGT')])
        >>> msa.append(DNA('AG-T'))
        >>> msa == TabularMSA([DNA('ACGT'), DNA('AG-T')])
        True

        Auto-incrementing index labels:

        >>> msa.index
        Int64Index([0, 1], dtype='int64')
        >>> msa.append(DNA('ACGA'))
        >>> msa.index
        Int64Index([0, 1, 2], dtype='int64')

        """
        if minter is not None and label is not None:
            raise ValueError(
                "Cannot use both `minter` and `label` at the same time.")

        if minter is None and label is None:
            # The label can only be auto-incremented when the current index
            # is the default 0..N-1 range.
            if self.index.equals(pd.Index(np.arange(len(self)))):
                label = len(self)
            else:
                raise ValueError(
                    "Must provide a `minter` or `label` for this sequence.")

        if minter is not None:
            label = resolve_key(sequence, minter)

        self._assert_valid_sequence(sequence)

        # pd.concat is what Series.append delegated to; Series.append itself
        # was deprecated and later removed from pandas, so call concat
        # directly to work across pandas versions.
        self._seqs = pd.concat(
            [self._seqs, pd.Series([sequence], index=[label])])
    def reassign_index(self, mapping=None, minter=None):
        """Replace the index labels of the sequences in this MSA.

        Parameters
        ----------
        mapping : dict-like or callable, optional
            Dictionary or callable translating existing labels into new
            labels. Labels without a mapping are left unchanged.
        minter : callable or metadata key, optional
            Source of an index label for each sequence: either a callable
            taking one argument (the sequence) or a key into the
            ``metadata`` attribute of each sequence.

        Raises
        ------
        ValueError
            If `mapping` and `minter` are both provided.

        See Also
        --------
        index

        Notes
        -----
        When neither `mapping` nor `minter` is given, the index is reset to
        the default pandas labels: integers ``0..(N-1)``, where ``N`` is the
        number of sequences.

        Examples
        --------
        Create a ``TabularMSA`` object with default index labels:

        >>> from skbio import DNA, TabularMSA
        >>> seqs = [DNA('ACG', metadata={'id': 'a'}),
        ...         DNA('AC-', metadata={'id': 'b'})]
        >>> msa = TabularMSA(seqs)
        >>> msa.index
        Int64Index([0, 1], dtype='int64')

        Assign new index to the MSA using each sequence's ID as a label:

        >>> msa.reassign_index(minter='id')
        >>> msa.index
        Index(['a', 'b'], dtype='object')

        Assign default index:

        >>> msa.reassign_index()
        >>> msa.index
        Int64Index([0, 1], dtype='int64')

        Alternatively, a mapping of existing labels to new labels may be passed
        via `mapping`:

        >>> msa.reassign_index(mapping={0: 'seq1', 1: 'seq2'})
        >>> msa.index
        Index(['seq1', 'seq2'], dtype='object')

        """
        has_mapping = mapping is not None
        has_minter = minter is not None
        if has_mapping and has_minter:
            raise ValueError(
                "Cannot use both `mapping` and `minter` at the same time.")

        if has_mapping:
            # Relabel the stored series itself rather than a copy.
            self._seqs.rename(mapping, inplace=True)
            return

        if not has_minter:
            # No instructions given: fall back to default integer labels.
            self._seqs.reset_index(drop=True, inplace=True)
            return

        labels = [resolve_key(seq, minter) for seq in self._seqs]
        # Wrap in pd.Index so tuples are recognised as a MultiIndex, matching
        # the pandas constructor; assigning the list directly would produce
        # an index of tuples.
        self.index = pd.Index(labels)
Exemple #16
0
    def sort(self, key=None, reverse=False):
        """Reorder the sequences of this MSA in-place using a stable sort.

        Parameters
        ----------
        key : callable or metadata key, optional
            What to sort each sequence on. May be a callable that accepts a
            single argument (a sequence) or a key into the ``metadata``
            attribute of every sequence. If omitted, the existing keys of
            the ``TabularMSA`` determine the order.
        reverse: bool, optional
            If ``True``, the order is reversed.

        Raises
        ------
        OperationError
            If `key` is not provided and keys do not exist on the MSA.

        See Also
        --------
        keys
        has_keys
        reindex

        Notes
        -----
        This API resembles Python's built-in sorting functionality (e.g.,
        ``list.sort()``, ``sorted()``). See [1]_ for an excellent tutorial
        on sorting in Python.

        References
        ----------
        .. [1] https://docs.python.org/3/howto/sorting.html

        Examples
        --------
        Create a ``TabularMSA`` object without keys:

        >>> from skbio import DNA, TabularMSA
        >>> seqs = [DNA('ACG', metadata={'id': 'c'}),
        ...         DNA('AC-', metadata={'id': 'b'}),
        ...         DNA('AC-', metadata={'id': 'a'})]
        >>> msa = TabularMSA(seqs)

        Sort the sequences in alphabetical order by sequence identifier:

        >>> msa.sort(key='id')
        >>> msa == TabularMSA([DNA('AC-', metadata={'id': 'a'}),
        ...                    DNA('AC-', metadata={'id': 'b'}),
        ...                    DNA('ACG', metadata={'id': 'c'})])
        True

        Note that since the sort is in-place, the ``TabularMSA`` object is
        modified (a new object is **not** returned).

        Create a ``TabularMSA`` object with keys:

        >>> seqs = [DNA('ACG'), DNA('AC-'), DNA('AC-')]
        >>> msa = TabularMSA(seqs, keys=['c', 'b', 'a'])

        Sort the sequences using the MSA's existing keys:

        >>> msa.sort()
        >>> msa == TabularMSA([DNA('AC-'), DNA('AC-'), DNA('ACG')],
        ...                   keys=['a', 'b', 'c'])
        True

        """
        sort_on = (self.keys.tolist() if key is None
                   else [resolve_key(seq, key) for seq in self._seqs])

        # An empty MSA has nothing to reorder.
        if len(self) == 0:
            return

        if not self.has_keys():
            result = self._sort_by_first_element(
                [sort_on, self._seqs], reverse)
            _, reordered = result
        else:
            # Existing keys travel with their sequences.
            result = self._sort_by_first_element(
                [sort_on, self._seqs, self.keys.tolist()], reverse)
            _, reordered, reordered_keys = result
            self.keys = reordered_keys
        self._seqs = list(reordered)
Exemple #17
0
    def reindex(self, key=None, keys=None):
        """Reassign keys to sequences in the MSA.

        Parameters
        ----------
        key : callable or metadata key, optional
            If provided, defines a unique, hashable key for each sequence in
            the MSA. Can either be a callable accepting a single argument (each
            sequence) or a key into each sequence's ``metadata`` attribute.
        keys : iterable, optional
            An iterable of the same length as the number of sequences in the
            MSA. `keys` must contain unique, hashable elements. Each element
            will be used as the respective key for the sequences in the MSA.

        Raises
        ------
        ValueError
            If `key` and `keys` are both provided.
        ValueError
            If `keys` is not the same length as the number of sequences in the
            MSA.
        UniqueError
            If keys are not unique.

        See Also
        --------
        keys
        has_keys

        Notes
        -----
        If `key` or `keys` are not provided, keys will not be set and certain
        operations requiring keys will raise an ``OperationError``.

        Keys are stored in a read-only, one-dimensional object array with
        exactly one element per key, so keys that are themselves sequences
        (e.g. tuples) are kept intact as single keys.

        Examples
        --------
        Create a ``TabularMSA`` object without keys:

        >>> from skbio import DNA, TabularMSA
        >>> seqs = [DNA('ACG', metadata={'id': 'a'}),
        ...         DNA('AC-', metadata={'id': 'b'})]
        >>> msa = TabularMSA(seqs)
        >>> msa.has_keys()
        False

        Set keys on the MSA, using each sequence's ID:

        >>> msa.reindex(key='id')
        >>> msa.has_keys()
        True
        >>> msa.keys
        array(['a', 'b'], dtype=object)

        Remove keys from the MSA:

        >>> msa.reindex()
        >>> msa.has_keys()
        False

        Alternatively, an iterable of keys may be passed via `keys`:

        >>> msa.reindex(keys=['a', 'b'])
        >>> msa.keys
        array(['a', 'b'], dtype=object)

        """
        if key is not None and keys is not None:
            raise ValueError(
                "Cannot use both `key` and `keys` at the same time.")

        keys_ = None
        if key is not None:
            keys_ = [resolve_key(seq, key) for seq in self._seqs]
        elif keys is not None:
            # Materialize so that one-shot iterables can be measured and
            # then reused below.
            keys = list(keys)
            if len(keys) != len(self):
                raise ValueError(
                    "Number of elements in `keys` must match number of "
                    "sequences: %d != %d" % (len(keys), len(self)))
            keys_ = keys

        if keys_ is not None:
            # Hashability of keys is implicitly checked here.
            duplicates = find_duplicates(keys_)
            if duplicates:
                raise UniqueError(
                    "Keys must be unique. Duplicate keys: %r" % duplicates)

            # Create an immutable ndarray to ensure key invariants are
            # preserved. Use object dtype to preserve original key types. This
            # is important, for example, because np.array(['a', 42]) will
            # upcast to ['a', '42'].
            #
            # The array is filled element by element rather than built with
            # np.array(keys_, dtype=object): the latter infers extra
            # dimensions from sequence-like keys, so equal-length tuple keys
            # would produce a 2-D array instead of one key per sequence.
            keys_array = np.empty(len(keys_), dtype=object)
            for position, single_key in enumerate(keys_):
                keys_array[position] = single_key
            keys_array.flags.writeable = False
            keys_ = keys_array

        self._keys = keys_
Exemple #18
0
 def test_wrong_type(self):
     # Handing a plain dict to resolve_key is expected to fail with a
     # TypeError; the callable form of assertRaises performs the call and
     # checks the exception type in one step.
     self.assertRaises(TypeError, resolve_key, {'foo': 1}, 'foo')
    def reassign_index(self, mapping=None, minter=None):
        """Replace the index labels of the sequences in this MSA, in place.

        Exactly one labelling strategy is applied per call: translate the
        existing labels (`mapping`), mint a fresh label from every sequence
        (`minter`), or, when neither is given, fall back to default labels.

        Parameters
        ----------
        mapping : dict-like or callable, optional
            Translates existing labels into new labels. Labels that have no
            entry in the mapping are kept as they are.
        minter : callable or metadata key, optional
            Produces one index label per sequence. Either a callable that
            receives a single sequence, or a key that is looked up in each
            sequence's ``metadata`` attribute.

        Raises
        ------
        ValueError
            If `mapping` and `minter` are both provided.

        See Also
        --------
        index

        Notes
        -----
        When both `mapping` and `minter` are omitted, the default pandas
        labels are assigned: the integers ``0..(N-1)``, with ``N`` being the
        number of sequences.

        Examples
        --------
        Create a ``TabularMSA`` object with default index labels:

        >>> from skbio import DNA, TabularMSA
        >>> seqs = [DNA('ACG', metadata={'id': 'a'}),
        ...         DNA('AC-', metadata={'id': 'b'})]
        >>> msa = TabularMSA(seqs)
        >>> msa.index
        Int64Index([0, 1], dtype='int64')

        Assign new index to the MSA using each sequence's ID as a label:

        >>> msa.reassign_index(minter='id')
        >>> msa.index
        Index(['a', 'b'], dtype='object')

        Assign default index:

        >>> msa.reassign_index()
        >>> msa.index
        Int64Index([0, 1], dtype='int64')

        Alternatively, a mapping of existing labels to new labels may be passed
        via `mapping`:

        >>> msa.reassign_index(mapping={0: 'seq1', 1: 'seq2'})
        >>> msa.index
        Index(['seq1', 'seq2'], dtype='object')

        """
        has_mapping = mapping is not None
        has_minter = minter is not None

        # The two strategies are mutually exclusive.
        if has_mapping and has_minter:
            raise ValueError(
                "Cannot use both `mapping` and `minter` at the same time.")

        if has_mapping:
            # Relabel the existing index in place.
            self._seqs.rename(mapping, inplace=True)
            return

        if not has_minter:
            # Nothing was requested: drop the current labels and restore the
            # default ones.
            self._seqs.reset_index(drop=True, inplace=True)
            return

        # Mint one label per sequence, in sequence order.
        labels = []
        for seq in self._seqs:
            labels.append(resolve_key(seq, minter))

        # Wrap the labels in a pandas Index before assigning so that tuple
        # labels are recognised as a MultiIndex, as the pandas constructor
        # does; assigning the raw list would yield an index of tuples.
        self.index = pd.Index(labels)
Exemple #20
0
    def from_iterable(cls, iterable, metric, key=None, keys=None,
                      validate=True):
        """Build a DistanceMatrix by applying a metric to pairs of objects.

        Parameters
        ----------
        iterable : iterable
            The objects between which pairwise distances are computed.
        metric : callable
            Function of two arguments returning a float: the distance between
            those two arguments.
        key : callable or metadata key, optional
            Either a function of one argument returning the string id to use
            for that element in the distance matrix, or a key into the
            `metadata` property of each element of `iterable` (if it has
            one). If None, default ids are used.
        keys : iterable, optional
            An iterable with the same length as `iterable`; its elements are
            used, in order, as the respective keys.
        validate : boolean, optional
            If ``True``, construction is delegated to the parent class's
            ``from_iterable``: all pairwise distances are computed (both
            triangles and the diagonal) and the result is validated for
            symmetry and hollowness. If ``False``, `metric` is trusted to be
            hollow and symmetric, so it is evaluated only for the lower
            triangle (diagonal excluded) and each value is mirrored into the
            upper triangle. Use ``validate=False`` for better performance
            when `metric` is known to be hollow and symmetric.

        Returns
        -------
        DistanceMatrix
            The `metric` applied to pairwise elements in the `iterable`.

        Raises
        ------
        ValueError
            If `key` and `keys` are both provided.

        """
        if validate:
            parent = super(DistanceMatrix, cls)
            return parent.from_iterable(iterable, metric, key, keys)

        # Materialise first: the elements are indexed repeatedly below.
        items = list(iterable)
        if not (key is None or keys is None):
            raise ValueError("Cannot use both `key` and `keys` at the same"
                             " time.")

        if key is not None:
            ids = [resolve_key(item, key) for item in items]
        else:
            # Either the caller-supplied keys or None (default ids).
            ids = keys

        size = len(items)
        dm = np.zeros((size, size))
        # Row 0 has no entries below the diagonal, so start at row 1. The
        # diagonal keeps the zeros it was initialised with.
        for row in range(1, size):
            current = items[row]
            for col in range(row):
                distance = metric(current, items[col])
                dm[row, col] = distance
                dm[col, row] = distance

        return cls(dm, ids)