def to_vector(self, sparse=True, dtype=None):
    """Return the fingerprint as a single vector of per-index values.

    Parameters
    ----------
    sparse : bool, optional
        If True (default), build a ``1 x self.bits`` sparse row matrix.
        Otherwise build a dense 1-D array of length ``self.bits``.
    dtype : numpy.dtype or type, optional
        Element type of the result. ``self.vector_dtype`` is used when
        this is None.

    Returns
    -------
    numpy.ndarray or scipy.sparse.csr_matrix
        Vector holding ``self.counts[i]`` at every position ``i`` listed
        in ``self.indices`` and zero elsewhere.

    Raises
    ------
    E3FPBitsValueError
        If the vector cannot be built because building the sparse matrix
        raised ``ValueError`` or filling the dense array raised
        ``IndexError``.
    """
    out_dtype = self.vector_dtype if dtype is None else dtype
    count_map = self.counts

    if not sparse:
        dense = np.zeros(self.bits, dtype=out_dtype)
        try:
            dense[self.indices] = [count_map[idx] for idx in self.indices]
            return dense
        except IndexError:
            raise E3FPBitsValueError(
                "Number of bits is lower than size of indices")

    try:
        values = [count_map[idx] for idx in self.indices]
        # Every entry lives in row 0; `bit_count` supplies the number of
        # row coordinates (presumably len(self.indices) -- confirm).
        rows = [0] * self.bit_count
        return csr_matrix((values, (rows, self.indices)),
                          shape=(1, self.bits), dtype=out_dtype)
    except ValueError:
        raise E3FPBitsValueError(
            "Number of bits is lower than size of indices")
def __init__(
    self,
    indices=None,
    counts=None,
    bits=BITS_DEF,
    level=-1,
    name=None,
    props=None,
    **kwargs
):
    """Initialize from indices, counts, or both.

    Parameters
    ----------
    indices : array_like of int, optional
        Indices of the fingerprint. Duplicates are collapsed; when
        `counts` is not given, the number of times each index occurs
        becomes its count.
    counts : dict, optional
        Mapping of index to count. When `indices` is not given, the
        sorted keys of this mapping are used as the indices. When both
        are given, their index sets must match exactly.
    bits : int, optional
        Length of the fingerprint; every index must be smaller than it.
    level : int, optional
        Stored unchanged as ``self.level``.
    name : str, optional
        If truthy, stored in ``self.props`` under ``NAME_PROP_KEY``
        before `props` is applied.
    props : dict, optional
        Passed to ``self.update_props``. None (default) is treated as an
        empty dict.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    E3FPOptionError
        If neither `indices` nor `counts` is given.
    E3FPBitsValueError
        If any index is greater than or equal to `bits`.
    E3FPCountsError
        If `indices` and `counts` are both given but do not describe the
        same set of indices.
    """
    if indices is None and counts is None:
        raise E3FPOptionError("indices or counts must be specified")

    self.reset()

    if indices is not None:
        # np.int64 rather than np.long: the latter is missing from
        # NumPy 1.24-1.26 and is only 32 bits wide on some platforms.
        indices = np.asarray(indices, dtype=np.int64)

        if np.any(indices >= bits):
            raise E3FPBitsValueError(
                "number of bits is lower than provided indices"
            )

        if counts is None:
            indices, counts = np.unique(indices, return_counts=True)
            counts = dict(zip(indices, counts))
        else:
            indices = np.unique(indices)
            if not np.all([x in indices for x in counts]):
                raise E3FPCountsError(
                    "At least one index in `counts` is not in `indices`."
                )
            if len(set(indices).symmetric_difference(counts)) > 0:
                raise E3FPCountsError(
                    "At least one index in `indices` is not in `counts`."
                )
    else:
        indices = np.asarray(sorted(counts.keys()), dtype=np.int64)

        if np.any(indices >= bits):
            raise E3FPBitsValueError(
                "number of bits is lower than provided indices"
            )

    self.indices = indices
    self.counts = counts
    self.bits = bits
    self.level = level
    if name:
        self.props[NAME_PROP_KEY] = name
    # A fresh dict per call replaces the former shared mutable default.
    self.update_props({} if props is None else props)
def __sub__(self, other):
    """Return a new fingerprint holding the count-wise difference.

    For every index present in either operand the resulting count is
    ``self.counts.get(i, 0) - other.counts.get(i, 0)``. Indices whose
    difference is zero or negative are kept in the result.

    Parameters
    ----------
    other : CountFingerprint
        Fingerprint to subtract; must have the same ``bits`` as `self`.

    Returns
    -------
    CountFingerprint or FloatFingerprint
        ``FloatFingerprint`` if `other` is exactly of that class,
        otherwise an instance of the class of `self`. Its level is the
        shared level of both operands, or -1 if the levels differ.

    Raises
    ------
    E3FPInvalidFingerprintError
        If `other` is not a ``CountFingerprint``.
    E3FPBitsValueError
        If the two fingerprints have different ``bits``.
    """
    if not isinstance(other, CountFingerprint):
        raise E3FPInvalidFingerprintError(
            "variable is not CountFingerprint.")

    if self.bits != other.bits:
        raise E3FPBitsValueError(
            "cannot subtract fingerprints of different sizes")

    level = self.level if self.level == other.level else -1

    new_counts = self.counts.copy()
    for k, v in other.counts.items():
        new_counts[k] = new_counts.get(k, 0) - v

    # dict.keys() is a view on Python 3 and cannot be coerced to an
    # integer array directly, so materialise the keys as a list first.
    # np.int64 is used because np.long is missing from NumPy 1.24-1.26.
    new_indices = np.asarray(list(new_counts), dtype=np.int64)

    if other.__class__ is FloatFingerprint:
        new_class = FloatFingerprint
    else:
        new_class = self.__class__

    return new_class(new_indices, counts=new_counts, bits=self.bits,
                     level=level)
def __or__(self, other):
    """Return the union of two fingerprints (``self | other``).

    The result is a plain ``Fingerprint`` built from the sorted union of
    both index arrays and the shared ``bits``; no other constructor
    argument is forwarded.

    Raises
    ------
    E3FPInvalidFingerprintError
        If `other` is not a ``Fingerprint``.
    E3FPBitsValueError
        If the two fingerprints have different ``bits``.
    """
    if not isinstance(other, Fingerprint):
        other_type = other.__class__.__name__
        raise E3FPInvalidFingerprintError(
            "variable is %s not Fingerprint" % (other_type))

    size = self.bits
    if size != other.bits:
        raise E3FPBitsValueError(
            "cannot compare fingerprints of different sizes")

    merged = np.union1d(self.indices, other.indices)
    return Fingerprint(merged, bits=size)
def __sub__(self, other):
    """Return the indices of `self` that are absent from `other`.

    The result is a plain ``Fingerprint`` built from the set difference
    of the index arrays and the shared ``bits``; no other constructor
    argument is forwarded. Both index arrays are treated as already
    unique (``assume_unique=True``).

    Raises
    ------
    E3FPInvalidFingerprintError
        If `other` is not a ``Fingerprint``.
    E3FPBitsValueError
        If the two fingerprints have different ``bits``.
    """
    if not isinstance(other, Fingerprint):
        other_type = other.__class__.__name__
        raise E3FPInvalidFingerprintError(
            "variable is %s not Fingerprint" % (other_type))

    size = self.bits
    if size != other.bits:
        raise E3FPBitsValueError(
            "cannot subtract fingerprints of different sizes")

    remaining = np.setdiff1d(self.indices, other.indices,
                             assume_unique=True)
    return Fingerprint(remaining, bits=size)
def __init__(
    self, indices, bits=BITS_DEF, level=-1, name=None, props=None, **kwargs
):
    """Initialize Fingerprint object.

    Parameters
    ----------
    indices : array_like of int
        Indices of the fingerprint. Stored sorted with duplicates
        removed.
    bits : int, optional
        Length of the fingerprint; every index must be smaller than it.
    level : int, optional
        Stored unchanged as ``self.level``.
    name : str, optional
        If truthy, assigned to ``self.name`` after `props` is applied.
    props : dict, optional
        Passed to ``self.update_props``. None (default) is treated as an
        empty dict.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    E3FPBitsValueError
        If any index is greater than or equal to `bits`.
    """
    self.reset()

    # np.int64 rather than np.long: the latter is missing from
    # NumPy 1.24-1.26 and is only 32 bits wide on some platforms.
    indices = np.asarray(indices, dtype=np.int64)

    if np.any(indices >= bits):
        raise E3FPBitsValueError(
            "number of bits is lower than provided indices"
        )

    self.indices = np.unique(indices)
    self.bits = bits
    self.level = level
    # A fresh dict per call replaces the former shared mutable default.
    self.update_props({} if props is None else props)
    if name:
        self.name = name
def fold(self, bits=FOLD_BITS_DEF, method=0, linked=True):
    """Return fingerprint for bitvector folded to size `bits`.

    The folded fingerprint is cached in ``self.folded_fingerprint`` under
    the key ``(bits, method)``; repeated calls with the same arguments
    return the cached object.

    Parameters
    ----------
    bits : int, optional
        Length of new bitvector. ``self.bits / bits`` must be a power
        of 2.
    method : {0, 1}, optional
        Method to use for folding.

        0
            partitioning (array is divided into equal sized arrays of
            length `bits` which are bitwise combined with OR)
        1
            compression (adjacent bits pairs are combined with OR until
            length is `bits`)
    linked : bool, optional
        Link folded and unfolded fingerprints for easy referencing. Set
        to False if intending to save and want to reduce file size.

    Returns
    -------
    Fingerprint
        Fingerprint of folded bitvector, of the same class as `self`.

    Raises
    ------
    E3FPBitsValueError
        If `bits` is greater than ``self.bits`` or ``self.bits / bits``
        is not a power of 2.
    E3FPOptionError
        If `method` is not 0 or 1.
    """
    if bits > self.bits:
        raise E3FPBitsValueError("folded bits greater than existing bits")
    if not np.log2(self.bits / bits).is_integer():
        raise E3FPBitsValueError(
            "existing bits divided by power of 2 does not give folded bits"
        )
    if method not in (0, 1):
        raise E3FPOptionError("method must be 0 or 1")

    if (bits, method) not in self.folded_fingerprint:
        if method == 0:
            folded_indices = self.indices % bits
        else:
            # Floor division keeps the folded indices integral. True
            # division would produce fractional floats that end up as
            # bogus keys in the index-mapping dictionaries below.
            folded_indices = self.indices // (self.bits // bits)

        self.index_to_folded_index_dict = dict(
            zip(self.indices, folded_indices))

        # Reverse mapping: folded index -> set of original indices.
        folded_index_to_index_dict = {}
        for index, folded_index in self.index_to_folded_index_dict.items():
            folded_index_to_index_dict.setdefault(
                folded_index, set()).add(index)

        fp = self.__class__.from_indices(folded_indices, bits=bits,
                                         level=self.level)
        fp.update_props(self.props)
        fp.index_to_unfolded_index_dict = folded_index_to_index_dict

        if self.index_id_map is not None:
            # Merge the id sets of all indices that collapse onto the
            # same folded index.
            fp.index_id_map = {}
            for index, id_set in self.index_id_map.items():
                fp.index_id_map.setdefault(
                    self.index_to_folded_index_dict[index],
                    set()).update(id_set)

        if linked:
            fp.unfolded_fingerprint = self

        self.folded_fingerprint[(bits, method)] = fp

    assert isinstance(self.folded_fingerprint[(bits, method)],
                      self.__class__)
    return self.folded_fingerprint[(bits, method)]