def test_weighted_the_same_objects(self):
    """Two identical weighted inputs must score a weighted Jaccard of 1.0."""
    hasher = Minhash()
    # Keys 10..15, each carrying weight 5; built twice so the two calls
    # receive independent list objects, as in a literal-based test.
    first = hasher.weighted_minhash(
        [[key, 5] for key in range(10, 16)], 16)
    second = hasher.weighted_minhash(
        [[key, 5] for key in range(10, 16)], 16)
    similarity = hasher.weighted_jaccard(first, second)
    testing.assert_equal(1.0, similarity)
def test_weighted_the_same_objects_different_weights(self):
    """Same keys, but for every key one side has weight 0: expect 0.0."""
    keys = range(10, 16)
    left_weights = (0, 0, 1, 0, 1, 0)
    right_weights = (1, 2, 0, 1, 0, 1)

    hasher = Minhash()
    left = hasher.weighted_minhash(
        [[key, weight] for key, weight in zip(keys, left_weights)], 20)
    right = hasher.weighted_minhash(
        [[key, weight] for key, weight in zip(keys, right_weights)], 20)

    similarity = hasher.weighted_jaccard(left, right)
    testing.assert_equal(0.0, similarity)
def test_weighted_different_objects(self):
    """Disjoint key sets with unit weights must score a weighted Jaccard of 0.0."""
    hasher = Minhash()
    # Keys 10..15 on one side, 16..21 on the other; every weight is 1.
    left = hasher.weighted_minhash(
        [[key, 1] for key in range(10, 16)], 32)
    right = hasher.weighted_minhash(
        [[key, 1] for key in range(16, 22)], 32)
    similarity = hasher.weighted_jaccard(left, right)
    testing.assert_equal(0.0, similarity)
def test_weighted_different_objects(self):
    """Disjoint string sets are expected to score a Jaccard of 0."""
    hasher = Minhash()
    # NOTE(review): weighted_minhash is called here with bare strings and no
    # second argument, and the result is scored with the unweighted
    # `jaccard`. This looks like it was meant to call `minhash.minhash`
    # instead -- confirm against the Minhash API before changing.
    left = hasher.weighted_minhash([str(key) for key in range(10, 16)])
    right = hasher.weighted_minhash([str(key) for key in range(26, 32)])
    similarity = hasher.jaccard(left, right)
    testing.assert_array_equal(0, similarity)
def __init__(self, W, sigma=1, shingle_size=15, hash_tables=128,
             response_variable='y', random_seed=42):
    """Store the configuration and build the random vector, index and hasher.

    Args:
        W: length of the random vector ``R`` drawn here.
        sigma: stored as ``self.sigma``.
        shingle_size: stored as ``self.shingle_size``.
        hash_tables: forwarded to ``HashIndex(hash_tables=...)`` and to
            ``Minhash(permutation_count=...)``.
        response_variable: stored as ``self.response_variable``.
        random_seed: seed handed to ``np.random.seed``.
    """
    # Plain configuration first; none of these touch the random generator.
    self.response_variable = response_variable
    self.shingle_size = shingle_size
    self.sigma = sigma
    self.W = W

    # Seeding numpy's global generator comes before anything random is
    # drawn or constructed, and the draw/construct order is kept fixed.
    np.random.seed(random_seed)
    self.R = np.random.rand(W)
    self.index = HashIndex(hash_tables=hash_tables)
    self.minhash = Minhash(permutation_count=hash_tables)
def test_weighted_the_same_objects_differnt_weights(self):
    """Pin the weighted Jaccard estimate for same keys with differing weights."""
    keys = range(10, 16)
    hasher = Minhash()

    # Case 1: every key has weight 1 on the left and weight 5 on the right.
    left = hasher.weighted_minhash([[key, 1] for key in keys], 16)
    right = hasher.weighted_minhash([[key, 5] for key in keys], 16)
    testing.assert_equal(hasher.weighted_jaccard(left, right), 0.15625)

    # Case 2: mixed weights on the left against a uniform weight of 5.
    mixed_weights = (4, 5, 6, 1, 1, 1)
    left = hasher.weighted_minhash(
        [[key, weight] for key, weight in zip(keys, mixed_weights)], 16)
    right = hasher.weighted_minhash([[key, 5] for key in keys], 16)
    testing.assert_equal(hasher.weighted_jaccard(left, right), 0.515625)
def test_weighted_the_same_objects(self):
    """Two identical string sets must score a Jaccard of exactly 1.0."""
    hasher = Minhash()
    # Both inputs are the strings "10".."15", built independently.
    first = hasher.minhash([str(key) for key in range(10, 16)])
    second = hasher.minhash([str(key) for key in range(10, 16)])
    similarity = hasher.jaccard(first, second)
    testing.assert_array_equal(1.0, similarity)
def test_different_objects(self):
    """Pin the Jaccard estimate for two sets sharing "13", "14" and "15"."""
    left_items = ["10", "11", "12", "13", "14", "15"]
    right_items = ["13", "14", "15", "29", "30", "31"]

    hasher = Minhash()
    left = hasher.minhash(left_items)
    right = hasher.minhash(right_items)

    similarity = hasher.jaccard(left, right)
    testing.assert_array_equal(0.3125, similarity)
class TimeSeriesLSH:
    """
    Time series hashing algorithm based on the following paper:
    [NIPS Time Series Workshop 2016] SSH (Sketch, Shingle, & Hash) for
    Indexing Massive-Scale Time Series.
    by Chen Luo, Anshumali Shrivastava (https://arxiv.org/abs/1610.07328)

    Parameters
    ----------
    W : int
        the length of the sliding window; also the length of the random
        vector ``R`` every window is projected onto
    sigma : int
        sliding window step
    shingle_size : int
        the number of consecutive sketch values packed into one shingle
        (at most 32)
    hash_tables : int
        forwarded to ``HashIndex`` as ``hash_tables`` and to ``Minhash``
        as ``permutation_count``
    response_variable : str
        the name of the response variable column
    random_seed : int
        the random seed
    """

    def __init__(self, W, sigma=1, shingle_size=15, hash_tables=128,
                 response_variable='y', random_seed=42):
        # Seeds numpy's *global* generator before anything random is drawn
        # or constructed.
        # NOTE(review): HashIndex / Minhash may draw from the same global
        # stream -- confirm before replacing this with a local generator.
        np.random.seed(random_seed)
        self.W = W
        self.R = np.random.rand(W)
        self.sigma = sigma
        self.shingle_size = shingle_size
        self.index = HashIndex(hash_tables=hash_tables)
        self.minhash = Minhash(permutation_count=hash_tables)
        self.response_variable = response_variable

    def fit(self, time_series):
        """Fit function that performs indexing of time series.

        Args:
            time_series: the array of time series pandas frames

        Returns:
            None
        """
        for idx, ts in enumerate(time_series):
            shingles = self._series_shingles(ts)
            series_hash = self._hash_shingles(shingles)
            self.index.index({"series": ts, "idx": idx}, series_hash)

    def _series_shingles(self, series):
        """Makes shingles out of a pandas frame.

        The response column is standardised with ``StandardScaler``,
        sketched into one sign per window and then grouped into shingles.

        Args:
            series: pandas frame with time series

        Returns:
            list of pairs [shingle, occurrence_count]
        """
        znorm = StandardScaler().fit_transform(
            series[self.response_variable].values.reshape(-1, 1))
        bits = self._series_to_bit_string(znorm.squeeze())
        return self._bits_to_shingles(bits)

    def _series_to_bit_string(self, series):
        """Sketches a series into one sign value per sliding window.

        Each window of length ``W`` (advanced by ``sigma``) is projected
        onto ``R``; the sign of the projection is kept.

        Args:
            series: array of values supporting slicing and ``.dot``

        Returns:
            list with one ``np.sign`` result per window; empty when the
            series is shorter than ``W``
        """
        last_start = len(series) - self.W
        return [
            np.sign(series[start:start + self.W].dot(self.R))
            for start in range(0, last_start + 1, self.sigma)
        ]

    def _bits_to_shingles(self, bits):
        """Makes a weighted list of shingles out of the list of sketch
        values by deduplicating the shingles. This is analogous to getting
        the frequencies of words from a document.

        Args:
            bits: list of sketch values extracted from the time series

        Returns:
            list of pairs [shingle, occurrence_count], in order of first
            occurrence
        """
        counts = {}
        for start in range(len(bits) - self.shingle_size + 1):
            shingle = self._to_shingle_str(
                bits[start:start + self.shingle_size])
            counts[shingle] = counts.get(shingle, 0) + 1
        return [[shingle, count] for shingle, count in counts.items()]

    def _to_shingle_str(self, shingle_window):
        """Converts a shingle window into a compact bit string.

        Args:
            shingle_window: window of [1, -1, -1, ... ], at most 32 long

        Returns:
            integer value keeping the bit representation of the shingle
            window, where bit ``i`` is set to 1 if the i-th value in the
            window is >= 0, or set to 0 otherwise

        Raises:
            TimeSeriesLSHException: if the window holds more than 32 values
        """
        if len(shingle_window) > 32:
            # The check accepts a window of exactly 32, so the message
            # states the bound as inclusive.
            raise TimeSeriesLSHException(
                "Expected shingle window of size <= 32")
        result = 0
        for idx, c in enumerate(shingle_window):
            result = self.set_bit(result, idx, c >= 0)
        return result

    def set_bit(self, value, index, x):
        """Set the index-th bit of value to 1 if x is truthy, else to 0,
        and return the new value.

        Args:
            value: the value where the bit is set
            index: index of the bit
            x: bit value

        Returns:
            integer value with the bit set
        """
        mask = 1 << index   # An integer with just bit 'index' set.
        value &= ~mask      # Always clear the bit first...
        if x:
            value |= mask   # ...then set it again when x is truthy.
        return value

    def _hash_shingles(self, shingles):
        """Calculates the LSH value using the consistent weighted sampling
        schema.

        Args:
            shingles: list of pairs [shingle, occurrence_count]

        Returns:
            whatever ``Minhash.weighted_minhash`` returns for the shingles
        """
        # A plain Python int cannot overflow, unlike the fixed-width
        # integer np.power returns (32-bit on some platforms).
        shingle_domain = 2 ** self.shingle_size
        return self.minhash.weighted_minhash(shingles, shingle_domain)

    def query(self, query):
        """Query similar time series using weighted jaccard similarity.

        Args:
            query: pandas frame with time series

        Returns:
            a list of objects {"object": series, "similarity": similarity}
        """
        query_shingles = self._series_shingles(query)
        query_hash = self._hash_shingles(query_shingles)
        similar_items = self.index.query(query_hash)
        result = []
        for value in similar_items:
            series = value["object"]
            item_hash = value["hash"]
            similarity = self.minhash.weighted_jaccard(item_hash, query_hash)
            result.append({"object": series, "similarity": similarity})
        return result