Ejemplo n.º 1
0
    def to_vector(self, sparse=True, dtype=None):
        """Return the fingerprint as a vector with `self.bits` entries.

        Every position listed in `self.indices` receives the value stored
        for it in `self.counts`; all other positions are zero.

        Parameters
        ----------
        sparse : bool, optional
            If True, return a ``1 x self.bits`` ``csr_matrix``; otherwise
            return a dense one-dimensional array.
        dtype : numpy.dtype or None, optional
            Element type of the result. `self.vector_dtype` is used when
            this is None.

        Returns
        -------
        numpy.ndarray or scipy.sparse.csr_matrix
            Vector of bits/counts/floats

        Raises
        ------
        E3FPBitsValueError
            If building the sparse matrix raises ``ValueError`` or filling
            the dense array raises ``IndexError``.
        """
        vec_dtype = self.vector_dtype if dtype is None else dtype
        value_by_index = self.counts

        if not sparse:
            dense = np.zeros(self.bits, dtype=vec_dtype)
            try:
                dense[self.indices] = [value_by_index[i]
                                       for i in self.indices]
            except IndexError:
                raise E3FPBitsValueError(
                    "Number of bits is lower than size of indices")
            return dense

        try:
            data = [value_by_index[i] for i in self.indices]
            # Single-row matrix: every stored value lives in row 0.
            rows = [0] * self.bit_count
            return csr_matrix((data, (rows, self.indices)),
                              shape=(1, self.bits),
                              dtype=vec_dtype)
        except ValueError:
            raise E3FPBitsValueError(
                "Number of bits is lower than size of indices")
Ejemplo n.º 2
0
    def __init__(
        self,
        indices=None,
        counts=None,
        bits=BITS_DEF,
        level=-1,
        name=None,
        props=None,
        **kwargs
    ):
        """Initialize from indices and/or a mapping of index to count.

        Parameters
        ----------
        indices : array_like of int, optional
            Indices of the fingerprint. When `counts` is not given, the
            count of each index is the number of times it occurs here.
        counts : dict, optional
            Mapping of index to count. When `indices` is also given, the
            keys must be exactly the unique values of `indices`. When
            `indices` is None, the sorted keys become the indices.
        bits : int, optional
            Length of the fingerprint; every index must be smaller.
        level : int, optional
            Stored as `self.level`.
        name : str, optional
            If truthy, stored in `self.props` under ``NAME_PROP_KEY``.
        props : dict, optional
            Passed to `update_props`. ``None`` (the default) is treated as
            an empty dict so no mutable object is shared between calls.
        **kwargs
            Accepted and ignored.

        Raises
        ------
        E3FPOptionError
            If neither `indices` nor `counts` is given.
        E3FPBitsValueError
            If any index is greater than or equal to `bits`.
        E3FPCountsError
            If both `indices` and `counts` are given and their index sets
            differ.
        """
        if indices is None and counts is None:
            raise E3FPOptionError("indices or counts must be specified")
        if props is None:
            props = {}

        self.reset()

        # `np.long` does not exist on NumPy 1.24-1.26; `np.int_` is
        # available on every NumPy release.
        indices_given = indices is not None
        if indices_given:
            indices = np.asarray(indices, dtype=np.int_)
        else:
            indices = np.asarray(sorted(counts), dtype=np.int_)

        if np.any(indices >= bits):
            raise E3FPBitsValueError(
                "number of bits is lower than provided indices"
            )

        if indices_given:
            if counts is None:
                indices, counts = np.unique(indices, return_counts=True)
                counts = dict(zip(indices, counts))
            else:
                indices = np.unique(indices)
                # A set gives O(1) membership tests; `x in ndarray` scans
                # the whole array for every key.
                index_set = set(indices.tolist())
                if any(key not in index_set for key in counts):
                    raise E3FPCountsError(
                        "At least one index in `counts` is not in `indices`."
                    )
                if index_set.difference(counts):
                    raise E3FPCountsError(
                        "At least one index in `indices` is not in `counts`."
                    )

        self.indices = indices
        self.counts = counts
        self.bits = bits
        self.level = level
        if name:
            self.props[NAME_PROP_KEY] = name
        self.update_props(props)
Ejemplo n.º 3
0
    def __sub__(self, other):
        """Return a fingerprint holding the per-index count difference.

        For every index present in either operand the resulting count is
        ``self.counts.get(index, 0) - other.counts[index]`` (or the
        unchanged count from `self` when `other` lacks the index). This
        method does not drop entries whose difference is zero or negative.

        Parameters
        ----------
        other : CountFingerprint
            Fingerprint to subtract; must have the same `bits`.

        Returns
        -------
        CountFingerprint
            A ``FloatFingerprint`` when `other` is exactly that class,
            otherwise an instance of ``self.__class__``. Its level is the
            shared level of both operands, or -1 when they differ.

        Raises
        ------
        E3FPInvalidFingerprintError
            If `other` is not a ``CountFingerprint``.
        E3FPBitsValueError
            If the two fingerprints have different `bits`.
        """
        if not isinstance(other, CountFingerprint):
            raise E3FPInvalidFingerprintError(
                "variable is not CountFingerprint.")

        if self.bits != other.bits:
            raise E3FPBitsValueError(
                "cannot subtract fingerprints of different sizes")

        level = self.level if self.level == other.level else -1

        new_counts = self.counts.copy()
        for k, v in other.counts.items():
            new_counts[k] = new_counts.get(k, 0) - v

        # On Python 3 `dict.keys()` is a view object that NumPy cannot cast
        # to an integer array, so materialize the keys as a list first.
        # `np.int_` replaces `np.long`, which is missing on NumPy 1.24-1.26.
        new_indices = np.asarray(list(new_counts), dtype=np.int_)

        if other.__class__ is FloatFingerprint:
            new_class = FloatFingerprint
        else:
            new_class = self.__class__

        return new_class(new_indices,
                         counts=new_counts,
                         bits=self.bits,
                         level=level)
Ejemplo n.º 4
0
    def __or__(self, other):
        """Return a fingerprint whose indices are the union of both operands.

        The result is always a plain ``Fingerprint`` built from the merged
        indices and `self.bits`; no other attributes are passed on.

        Raises
        ------
        E3FPInvalidFingerprintError
            If `other` is not a ``Fingerprint``.
        E3FPBitsValueError
            If the two fingerprints have different `bits`.
        """
        if not isinstance(other, Fingerprint):
            other_type = other.__class__.__name__
            raise E3FPInvalidFingerprintError(
                "variable is %s not Fingerprint" % (other_type))
        if self.bits != other.bits:
            raise E3FPBitsValueError(
                "cannot compare fingerprints of different sizes")

        merged_indices = np.union1d(self.indices, other.indices)
        return Fingerprint(merged_indices, bits=self.bits)
Ejemplo n.º 5
0
    def __sub__(self, other):
        """Return a fingerprint of the indices in `self` that `other` lacks.

        The result is always a plain ``Fingerprint`` built from the
        remaining indices and `self.bits`; no other attributes are passed
        on.

        Raises
        ------
        E3FPInvalidFingerprintError
            If `other` is not a ``Fingerprint``.
        E3FPBitsValueError
            If the two fingerprints have different `bits`.
        """
        if not isinstance(other, Fingerprint):
            other_type = other.__class__.__name__
            raise E3FPInvalidFingerprintError(
                "variable is %s not Fingerprint" % (other_type))
        if self.bits != other.bits:
            raise E3FPBitsValueError(
                "cannot subtract fingerprints of different sizes")

        # assume_unique=True skips NumPy's own de-duplication; this relies
        # on both index arrays already being duplicate-free
        # (NOTE(review): confirm every code path stores unique indices).
        remaining = np.setdiff1d(self.indices,
                                 other.indices,
                                 assume_unique=True)
        return Fingerprint(remaining, bits=self.bits)
Ejemplo n.º 6
0
    def __init__(
        self, indices, bits=BITS_DEF, level=-1, name=None, props=None, **kwargs
    ):
        """Initialize Fingerprint object.

        Parameters
        ----------
        indices : array_like of int
            Indices of the fingerprint; duplicates are removed and the
            result is stored sorted (via ``np.unique``).
        bits : int, optional
            Length of the fingerprint; every index must be smaller.
        level : int, optional
            Stored as `self.level`.
        name : str, optional
            If truthy, assigned to `self.name`.
        props : dict, optional
            Passed to `update_props`. ``None`` (the default) is treated as
            an empty dict so no mutable object is shared between calls.
        **kwargs
            Accepted and ignored.

        Raises
        ------
        E3FPBitsValueError
            If any index is greater than or equal to `bits`.
        """
        self.reset()

        # `np.long` does not exist on NumPy 1.24-1.26; `np.int_` is
        # available on every NumPy release.
        indices = np.asarray(indices, dtype=np.int_)

        if np.any(indices >= bits):
            raise E3FPBitsValueError(
                "number of bits is lower than provided indices"
            )

        self.indices = np.unique(indices)
        self.bits = bits
        self.level = level
        self.update_props({} if props is None else props)
        if name:
            self.name = name
Ejemplo n.º 7
0
    def fold(self, bits=FOLD_BITS_DEF, method=0, linked=True):
        """Return fingerprint for bitvector folded to size `bits`.

        A previously linked result for the same ``(bits, method)`` is
        returned from `self.folded_fingerprint` instead of being rebuilt.
        When a new fingerprint is built, `self.index_to_folded_index_dict`
        is overwritten with the mapping used for this fold.

        Parameters
        ----------
        bits : int, optional
            Length of new bitvector. `self.bits` divided by `bits` must be
            a power of 2.
        method : {0, 1}, optional
            Method to use for folding.

            0
                partitioning (array is divided into equal sized arrays of
                length `bits` which are bitwise combined with OR)
            1
                compression (adjacent bits pairs are combined with OR until
                length is `bits`)
        linked : bool, optional
            Link folded and unfolded fingerprints for easy referencing. Set
            to False if intending to save and want to reduce file size.
            An unlinked result is not cached on `self`.

        Returns
        -------
        Fingerprint : Fingerprint of folded bitvector

        Raises
        ------
        E3FPBitsValueError
            If `bits` exceeds `self.bits` or the ratio of the two is not a
            power of 2.
        E3FPOptionError
            If `method` is not 0 or 1.
        """
        if bits > self.bits:
            raise E3FPBitsValueError("folded bits greater than existing bits")
        if not np.log2(self.bits / bits).is_integer():
            raise E3FPBitsValueError(
                "existing bits divided by power of 2 does not give folded bits"
            )
        if method not in (0, 1):
            raise E3FPOptionError("method must be 0 or 1")

        cache_key = (bits, method)
        if cache_key in self.folded_fingerprint:
            fp = self.folded_fingerprint[cache_key]
        else:
            if method == 0:
                folded_indices = self.indices % bits
            else:
                # Floor division keeps the folded indices integral; true
                # division would yield fractional values on Python 3 and
                # split one folded bit across several dictionary keys.
                folded_indices = self.indices // (self.bits // bits)

            self.index_to_folded_index_dict = dict(
                zip(self.indices, folded_indices))
            folded_index_to_index_dict = {}
            for index, folded_index in self.index_to_folded_index_dict.items():
                folded_index_to_index_dict.setdefault(folded_index,
                                                      set()).add(index)

            fp = self.__class__.from_indices(folded_indices,
                                             bits=bits,
                                             level=self.level)
            fp.update_props(self.props)

            fp.index_to_unfolded_index_dict = folded_index_to_index_dict
            if self.index_id_map is not None:
                fp.index_id_map = {}
                for index, id_set in self.index_id_map.items():
                    fp.index_id_map.setdefault(
                        self.index_to_folded_index_dict[index],
                        set()).update(id_set)

            if linked:
                fp.unfolded_fingerprint = self
                self.folded_fingerprint[cache_key] = fp

        # Return the local object rather than re-reading the cache: with
        # linked=False nothing was stored, so a cache lookup would fail.
        assert isinstance(fp, self.__class__)
        return fp