def test_union(self, all_values):
    """Verify ``BitMap.union`` against a pairwise ``|`` fold.

    Also checks that the operands compare equal to copies taken before
    the call, i.e. that ``union`` did not modify them.
    """
    operands = list(map(BitMap, all_values))
    # Copies built before the call, used to detect in-place mutation.
    snapshots = list(map(BitMap, operands))
    union_result = BitMap.union(*operands)
    self.assertEqual(snapshots, operands)
    folded = functools.reduce(lambda acc, bm: acc | bm, operands)
    self.assertEqual(folded, union_result)
def test_flip(self, values, start, end):
    """Check the non-mutating ``flip`` for ``start < end``.

    The receiver must still equal a copy taken before the call; the
    returned bitmap is validated by ``self.check_flip``.
    """
    st.assume(start < end)
    original = BitMap(values)
    snapshot = BitMap(original)
    flipped = original.flip(start, end)
    self.assertEqual(original, snapshot)
    self.check_flip(original, flipped, start, end)
class ManyOperationsTest(Util):
    """Checks n-ary bitmap operations against a fold of the binary operators."""

    @given(hyp_collection, hyp_many_collections, st.booleans())
    def setUp(self, initial_values, all_values, cow):
        """Store the generated collections and their bitmap counterparts."""
        self.initial_bitmap = BitMap(initial_values, copy_on_write=cow)
        self.all_values = all_values
        self.all_bitmaps = [
            BitMap(collection, copy_on_write=cow) for collection in all_values
        ]

    def test_update(self):
        """``update`` with many collections should equal the ``|`` fold."""
        target = self.initial_bitmap
        target.update(*self.all_values)
        # The fold is computed after the in-place call, so it includes the
        # already-updated target.
        folded = functools.reduce(
            lambda acc, bm: acc | bm, self.all_bitmaps + [target])
        self.assertEqual(folded, target)

    def test_intersection_update(self):
        """``intersection_update`` with many collections should equal the ``&`` fold."""
        target = self.initial_bitmap
        target.intersection_update(*self.all_values)
        folded = functools.reduce(
            lambda acc, bm: acc & bm, self.all_bitmaps + [target])
        self.assertEqual(folded, target)

    def test_union(self):
        """``BitMap.union`` over all bitmaps should equal the ``|`` fold."""
        operands = self.all_bitmaps
        union_result = BitMap.union(*operands)
        folded = functools.reduce(lambda acc, bm: acc | bm, operands)
        self.assertEqual(folded, union_result)

    def test_intersection(self):
        """``BitMap.intersection`` over all bitmaps should equal the ``&`` fold."""
        operands = self.all_bitmaps
        intersection_result = BitMap.intersection(*operands)
        folded = functools.reduce(lambda acc, bm: acc & bm, operands)
        self.assertEqual(folded, intersection_result)
def test_flip_empty(self, values, start, end, cow):
    """Check that ``flip`` with ``start >= end`` changes nothing.

    The returned bitmap must equal the receiver, and the receiver must
    still equal a copy taken before the call.
    """
    st.assume(start >= end)
    original = BitMap(values, copy_on_write=cow)
    snapshot = BitMap(original)
    flipped = original.flip(start, end)
    self.assertEqual(original, snapshot)
    self.assertEqual(original, flipped)
def test_adapt_BitMap():
    """``db.adapt_BitMap`` should return the zstd blosc compression of the serialized bitmap."""
    bitmap = BitMap([1, 3, 5, 8])
    adapted = db.adapt_BitMap(bitmap)
    assert adapted == blosc.compress(bitmap.serialize(), cname='zstd')
def test_constructor_copy(self, values, other_value):
    """A bitmap built from another bitmap must be equal to it but independent of it."""
    st.assume(other_value not in values)
    source = BitMap(values)
    clone = BitMap(source)
    self.assertEqual(source, clone)
    # Mutating the source must not be reflected in the clone.
    source.add(other_value)
    self.assertNotEqual(source, clone)
def bitsets():
    """Return a ``MockedBitMapDict`` holding three small bitmaps keyed 'a', 'b' and 'c'.

    The second constructor argument is ``8``.
    NOTE(review): presumably the number of bits - confirm against
    ``MockedBitMapDict``.
    """
    members = {
        'a': BitMap([1, 2, 3]),
        'b': BitMap([1, 2, 4, 5, 8]),
        'c': BitMap([1, 2, 4, 8]),
    }
    return MockedBitMapDict(members, 8)
def __isub__(self, other: Optional[bitmap]) -> bitmap:
    """Remove the members of ``other`` from this bitmap in place (``self -= other``).

    Args:
        other: Another ``bitmap``, or ``None``, which is treated as
            ``NIL_BITMAP``.

    Returns:
        ``self``, after the wrapped ``BitMap`` has been updated.

    Raises:
        TypeError: If ``other`` is neither ``None`` nor a ``bitmap``.
    """
    if other is None:
        other = NIL_BITMAP
    if isinstance(other, bitmap):
        # Operate on the wrapped BitMap objects; the wrapper itself is returned.
        BitMap.__isub__(self.bm, other.bm)
        return self
    # The original message lacked the f-prefix and printed the placeholder literally.
    raise TypeError(f"Invalid type {type(other)}")
def test_wrong_constructor_values(self):
    """Constructing a ``BitMap`` from invalid input must raise the expected error."""
    # Each builder is deferred so that the failure happens inside assertRaises.
    # Note that range(0, 10, 0) raises ValueError by itself (zero step).
    cases = (
        (TypeError, lambda: BitMap([3, 'bla', 3, 42])),  # this should fire a type error!
        (ValueError, lambda: BitMap(range(0, 10, 0))),
        (ValueError, lambda: BitMap(range(10, 0, 1))),
    )
    for expected_error, build in cases:
        with self.assertRaises(expected_error):
            build()
def dependencies(targets: List[Element], scope: int, *, recurse: bool = True) -> Iterator[Element]:
    """Yield the dependencies of every element in ``targets`` for ``scope``.

    A single pair of ``BitMap`` objects is passed as ``visited`` to each
    target's ``_dependencies`` call, so all targets share one traversal
    context.
    """
    # One 'visited' context for the whole call, shared by all targets.
    shared_visited = (BitMap(), BitMap())
    for target in targets:
        yield from target._dependencies(scope, recurse=recurse, visited=shared_visited)
def do_test_binary_op(self, op):
    """Apply ``op`` to the set pair and to the bitmap pair and compare the results.

    Also asserts that both bitmap operands still equal copies taken
    before ``op`` was applied.
    """
    bitmap1_before = BitMap(self.bitmap1)
    bitmap2_before = BitMap(self.bitmap2)
    set_outcome = op(self.set1, self.set2)
    bitmap_outcome = op(self.bitmap1, self.bitmap2)
    for current, before in ((self.bitmap1, bitmap1_before),
                            (self.bitmap2, bitmap2_before)):
        self.assertEqual(current, before)
    self.compare_with_set(bitmap_outcome, set_outcome)
def do_test(self, values1, values2, op):
    """Check that ``op`` gives the same answer on bitmaps as on built-in sets.

    Four operand shapes are exercised: (v1, v1), (v1, v2), (v1|v2, v2)
    and (v1, v1|v2).
    """
    # Each entry builds the left/right operand for a given container type,
    # so the very same shape is evaluated once with BitMap and once with set.
    operand_shapes = (
        (lambda kind: kind(values1),
         lambda kind: kind(values1)),
        (lambda kind: kind(values1),
         lambda kind: kind(values2)),
        (lambda kind: kind(values1) | kind(values2),
         lambda kind: kind(values2)),
        (lambda kind: kind(values1),
         lambda kind: kind(values1) | kind(values2)),
    )
    for make_left, make_right in operand_shapes:
        self.assertEqual(op(make_left(BitMap), make_right(BitMap)),
                         op(make_left(set), make_right(set)))
def test_constructor_copy(self, values, other_value, cow1, cow2):
    """Copy-construction must give an equal but independent bitmap for any ``copy_on_write`` combination."""
    st.assume(other_value not in values)
    source = BitMap(values, copy_on_write=cow1)
    # should be robust even if cow2 != cow1
    clone = BitMap(source, copy_on_write=cow2)
    self.assertEqual(source, clone)
    # Mutating the source must not be reflected in the clone.
    source.add(other_value)
    self.assertNotEqual(source, clone)
def dependencies(self, targets, scope, *, recurse=True):
    """Yield the dependencies of every element in ``targets`` for ``scope``.

    A single pair of ``BitMap`` objects is passed as ``visited`` to each
    target's ``dependencies`` call, so all targets share one traversal
    context.
    """
    # One 'visited' context for the whole call, shared by all targets.
    shared_visited = (BitMap(), BitMap())
    for target in targets:
        yield from target.dependencies(scope, recurse=recurse, visited=shared_visited)
def test_shrink_to_fit(self, cls):
    """``shrink_to_fit`` should report a gain once on an unoptimized bitmap and zero afterwards."""
    seed = BitMap()
    # Values are added one at a time on purpose rather than built from a range.
    for value in range(1000):
        seed.add(value)

    unoptimized = cls(seed, optimize=False)
    self.assertGreater(unoptimized.shrink_to_fit(), 0)
    self.assertEqual(unoptimized.shrink_to_fit(), 0)

    optimized = cls(seed, optimize=True)
    self.assertEqual(optimized.shrink_to_fit(), 0)
def test_intersection_update(self, initial_values, all_values, cow):
    """Check n-ary ``intersection_update`` against a pairwise ``&`` fold, including the result type."""
    target = self.initial_bitmap = BitMap(initial_values, copy_on_write=cow)
    self.all_bitmaps = [
        BitMap(collection, copy_on_write=cow) for collection in all_values
    ]
    target.intersection_update(*all_values)
    # The fold is computed after the in-place call, so it includes the
    # already-updated target.
    folded = functools.reduce(
        lambda acc, bm: acc & bm, self.all_bitmaps + [target])
    self.assertEqual(folded, target)
    self.assertEqual(type(folded), type(target))
def test_similarity(self):
    """``modifiedtanimoto.similarity`` of two small bitmaps should match the reference value."""
    bitset1 = BitMap([1, 2, 3])
    bitset2 = BitMap([1, 2, 4, 8])
    result = modifiedtanimoto.similarity(bitset1, bitset2,
                                         self.number_of_bits,
                                         self.corr_st, self.corr_sto)
    expected = 0.5779523809525572
    # The original called pytest.approx(result, expected) without asserting,
    # which checks nothing and passes `expected` as the relative tolerance.
    assert result == pytest.approx(expected)
def cohort_analysis(endP=None, sample=None, init_behavior=None, return_behavior=None,
                    number=True, need_user_id=False):
    """Build a weekly cohort (retention) table from visit data.

    For every cohort week ``i`` in ``1 .. endP - 1`` the users with
    ``visits > 0`` in that week form the cohort; for every later week
    ``j`` up to ``endP`` the users of the cohort that are also active in
    week ``j`` are counted.

    :param endP: end period (last ``week_iso`` value considered). Required
        despite the ``None`` default: the function adds 1 to it.
    :param sample: DataFrame with at least the columns ``week_iso``,
        ``visits`` and ``user_id``. Required despite the ``None`` default.
    :param init_behavior: start behavior, input is a column name.
        Currently ignored; ``visits`` is always used.
    :param return_behavior: input is a column name. Currently ignored;
        ``visits`` is always used.
    :param number: return number or percentage. Currently ignored;
        percentages are always returned.
    :param need_user_id: if true, return the user ids that fulfil the
        requirement. Currently ignored.
    :return: a cohort table indexed by cohort week. The first column,
        ``"sample size"``, is the cohort size; column ``k`` is the
        percentage (rounded to 2 decimals) of the cohort that is active
        again in the ``k``-th week after the cohort week. Cells for weeks
        beyond ``endP`` are left missing.
    """
    periods = endP + 1
    cohort_weeks = range(1, periods - 1)

    cohort_sizes = []
    overlap_counts = []
    for i in cohort_weeks:
        # The starting cohort only depends on i, so it is built once per
        # cohort instead of once per (i, j) pair.
        cohort_init = BitMap(sample[(sample["week_iso"] == i) & (sample["visits"] > 0)]["user_id"].astype(int))
        cohort_init_num = len(cohort_init)

        overlap_num = []
        cohort_tmp = []
        for j in range(i + 1, periods):
            cohort_return = BitMap(sample[(sample["week_iso"] == j) & (sample["visits"] > 0)]["user_id"].astype(int))
            overlap_num.append(len(cohort_init & cohort_return))
            cohort_tmp.append(cohort_init_num)
        overlap_counts.append(overlap_num)
        cohort_sizes.append(cohort_tmp)

    # Rows get shorter for later cohorts; pandas pads the missing cells.
    cohorts = pd.DataFrame(data=cohort_sizes, columns=cohort_weeks, index=cohort_weeks)
    cohorts_num_df = pd.DataFrame(data=overlap_counts, columns=cohort_weeks, index=cohort_weeks)

    cohort_table = pd.concat([cohorts[1].rename("sample size"),
                              round(cohorts_num_df * 100 / cohorts, 2)], axis=1)
    return cohort_table
def __init__(self, values: Any = None, copy_on_write: bool = False, optimize: bool = True):
    """Wrap a ``BitMap`` built from ``values``.

    Args:
        values: Initial content. A ``bitmap`` is unwrapped to its inner
            ``BitMap``; a ``slice`` is turned into the matching ``range``
            (a missing or zero step becomes 1); anything else is passed
            to ``BitMap`` unchanged.
        copy_on_write: Forwarded to ``BitMap`` as its second argument.
        optimize: Forwarded to ``BitMap`` as its third argument.
    """
    source = values
    if isinstance(source, bitmap):
        source = source.bm
    elif isinstance(source, slice):
        step = source.step or 1
        source = range(source.start, source.stop, step)
    self.bm: BitMap = BitMap(source, copy_on_write, optimize)
def test_phase_2_no_temp_table_from_phase_1():
    """Run ``_phase_2`` on a fresh ``CloStream`` with a hand-built temp table.

    The temp table maps the single transaction ``{'C', 'D'}`` to 0; after
    phase 2 the closed table and the cid-list map are checked.
    """
    stream = CloStream(filter_fn=NO_FILTER_FN)
    itemset = frozenset('CD')

    # Before phase 2 the first closed-table row holds the empty itemset.
    assert stream.closed_df.loc[0, 'itemset'] == frozenset()

    stream._phase_2({itemset: 0})

    expected_rows = [[frozenset(), 0], [frozenset('CD'), 1]]
    assert stream.closed_df.values.tolist() == expected_rows
    assert stream.cid_list_map == {'C': BitMap([1]), 'D': BitMap([1])}
def test_calc_mean_onbit_density(self):
    """``calc_mean_onbit_density`` over three small bitmaps should give 0.04."""
    members = (
        ('a', [1, 2, 3]),
        ('b', [1, 2, 4, 5, 8]),
        ('c', [1, 2, 4, 8]),
    )
    bitsets = {name: BitMap(bits) for name, bits in members}
    density = modifiedtanimoto.calc_mean_onbit_density(
        bitsets.values(), self.number_of_bits)
    assert density == 0.04
def test_basic_properties(self, values):
    """Check that ``get_statistics`` is consistent with the bitmap's content."""
    bitmap = BitMap(values)
    stats = bitmap.get_statistics()
    cardinality = len(bitmap)

    # Per-container-type value counts must add up to the cardinality.
    values_in_containers = (stats.n_values_array_containers
                            + stats.n_values_bitset_containers
                            + stats.n_values_run_containers)
    self.assertEqual(values_in_containers, cardinality)

    # Byte counts asserted by this test: 2 per array value, 2**13 per bitset container.
    self.assertEqual(stats.n_bytes_array_containers,
                     2 * stats.n_values_array_containers)
    self.assertEqual(stats.n_bytes_bitset_containers,
                     2 ** 13 * stats.n_bitset_containers)

    # min/max are only compared when there is at least one value.
    if len(values) > 0:
        self.assertEqual(stats.min_value, bitmap[0])
        self.assertEqual(stats.max_value, bitmap[cardinality - 1])

    self.assertEqual(stats.cardinality, cardinality)
    self.assertEqual(stats.sum_value, sum(values))
def test_hash_eq2(self):
    """Equal bitmaps with different internal layouts must hash the same.

    Two bitmaps can hold the same values while using different data
    structures. This test builds one from a range and one by adding the
    values one at a time (frozen without optimization), checks that
    their statistics differ, and requires equal hashes.
    """
    count = 100
    from_range = FrozenBitMap(range(count))

    incremental = BitMap()
    for value in range(count):
        incremental.add(value)
    from_adds = FrozenBitMap(incremental, optimize=False)

    self.assertEqual(from_range, from_adds)
    self.assertNotEqual(from_range.get_statistics(), from_adds.get_statistics())
    self.assertEqual(hash(from_range), hash(from_adds))
class RoaringMinMaxBitmap():
    """A ``BitMap`` of keys that keeps its current minimum and maximum as attributes.

    While the bitmap is empty, ``max`` is ``-inf`` and ``min`` is ``+inf``.
    """

    def __init__(self):
        self.indexes = BitMap()
        self.max = float('-inf')
        self.min = float('inf')

    def insert(self, key):
        """Add ``key`` and refresh ``max``/``min`` from the bitmap."""
        self.indexes.add(key)
        self.max, self.min = self.indexes.max(), self.indexes.min()

    def discard(self, key):
        """Discard ``key`` and refresh ``max``/``min`` (reset to infinities when empty)."""
        self.indexes.discard(key)
        if self.indexes:
            self.max, self.min = self.indexes.max(), self.indexes.min()
            return
        self.max = float('-inf')
        self.min = float('inf')

    def __lt__(self, other):
        """Compare this object's ``max`` with an int, or with ``other.max`` otherwise."""
        bound = other if isinstance(other, int) else other.max
        return self.max < bound
class ManyOperationsTest(Util):
    """Property tests for n-ary bitmap operations, checked against pairwise folds."""

    @given(hyp_collection, hyp_many_collections, st.booleans())
    def test_update(self, initial_values, all_values, cow):
        """``update`` with several collections should equal the ``|`` fold, type included."""
        target = self.initial_bitmap = BitMap(initial_values, copy_on_write=cow)
        self.all_bitmaps = [
            BitMap(collection, copy_on_write=cow) for collection in all_values
        ]
        target.update(*all_values)
        # The fold is computed after the in-place call, so it includes the
        # already-updated target.
        folded = functools.reduce(
            lambda acc, bm: acc | bm, self.all_bitmaps + [target])
        self.assertEqual(folded, target)
        self.assertEqual(type(folded), type(target))

    @given(hyp_collection, hyp_many_collections, st.booleans())
    def test_intersection_update(self, initial_values, all_values, cow):
        """``intersection_update`` with several collections should equal the ``&`` fold, type included."""
        target = self.initial_bitmap = BitMap(initial_values, copy_on_write=cow)
        self.all_bitmaps = [
            BitMap(collection, copy_on_write=cow) for collection in all_values
        ]
        target.intersection_update(*all_values)
        folded = functools.reduce(
            lambda acc, bm: acc & bm, self.all_bitmaps + [target])
        self.assertEqual(folded, target)
        self.assertEqual(type(folded), type(target))

    @given(bitmap_cls, st.data(), hyp_many_collections, st.booleans())
    def test_union(self, cls, data, all_values, cow):
        """``cls.union`` over mixed bitmap classes should equal the ``|`` fold and return a ``cls``."""
        # One bitmap class is drawn per collection before any bitmap is built.
        drawn_classes = [data.draw(bitmap_cls) for _ in all_values]
        self.all_bitmaps = [
            bitmap_type(collection, copy_on_write=cow)
            for bitmap_type, collection in zip(drawn_classes, all_values)
        ]
        union_result = cls.union(*self.all_bitmaps)
        folded = functools.reduce(lambda acc, bm: acc | bm, self.all_bitmaps)
        self.assertEqual(folded, union_result)
        self.assertIsInstance(union_result, cls)

    @given(bitmap_cls, st.data(), hyp_many_collections, st.booleans())
    def test_intersection(self, cls, data, all_values, cow):
        """``cls.intersection`` over mixed bitmap classes should equal the ``&`` fold and return a ``cls``."""
        drawn_classes = [data.draw(bitmap_cls) for _ in all_values]
        self.all_bitmaps = [
            bitmap_type(collection, copy_on_write=cow)
            for bitmap_type, collection in zip(drawn_classes, all_values)
        ]
        intersection_result = cls.intersection(*self.all_bitmaps)
        folded = functools.reduce(lambda acc, bm: acc & bm, self.all_bitmaps)
        self.assertEqual(folded, intersection_result)
        self.assertIsInstance(intersection_result, cls)
def groupwise_jaccard(self, profiles: Iterable[Iterable[str]]) -> float:
    """Jaccard similarity applied to a group of profiles instead of a pair.

    Useful for quantifying the strength of a cluster of profiles
    (eg disease clustering).

    Args:
        profiles: The profiles to compare. Each one is expanded with
            ``self.graph.get_profile_closure``. Any iterable is accepted,
            including a one-shot iterator.

    Returns:
        ``len(intersection) / len(union)`` of the profile closures.

    Raises:
        ZeroDivisionError: If the union of the closures is empty.
    """
    # Materialize the closures once. The original iterated `profiles` twice,
    # which leaves the intersection with no operands for a one-shot iterator
    # and doubles the closure work for everything else.
    closures = [self.graph.get_profile_closure(profile) for profile in profiles]
    profile_union = BitMap.union(*closures)
    profile_intersection = BitMap.intersection(*closures)
    return len(profile_intersection) / len(profile_union)
def search(self, query: List[int]) -> Union[List[int], List[Tuple[int, float]]]:
    """Return the records that are indexed under every key of ``query``.

    Args:
        query: Keys to look up in ``self._index``.

    Returns:
        An empty list when ``query`` is empty or no record matches all
        keys. Otherwise, when ``self._fingerprints`` is truthy, a list of
        ``(record, jaccard_index)`` pairs sorted by decreasing index;
        else a plain list of matching records.

    Raises:
        KeyError: If a query key is missing from ``self._index``.
    """
    # An empty query used to fail on the star-unpack below with an
    # unhelpful ValueError; nothing can match, so return no results.
    if not query:
        return []

    index = self._index
    sizes = self._sizes
    # Process keys in increasing order of their `sizes` entry (unknown keys
    # count as 0), so the running intersection starts from the first of them.
    first_key, *other_keys = sorted(query, key=lambda x: sizes.get(x, 0))
    # Copy so the in-place intersection never modifies the index itself.
    records = index[first_key].copy()
    for key in other_keys:
        records &= index[key]
        if not records:
            # Nothing left to intersect; stop early.
            return []

    if self._fingerprints:
        bm = BitMap(query)
        fps = self._fingerprints
        return sorted(((x, bm.jaccard_index(fps[x])) for x in records),
                      key=itemgetter(1), reverse=True)
    return list(records)
def test_pickle_protocol(self, values):
    """A pickle round-trip must give an equal bitmap that is a distinct object."""
    original = BitMap(values)
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(original, restored)
    self.assertIsNot(original, restored)
    self.assertNotEqual(original.__obj__, restored.__obj__)
def test_run_optimize(self, cls):
    """``run_optimize`` should turn array containers into run containers.

    A bitmap filled one value at a time and copied with
    ``optimize=False`` is expected to hold only array containers;
    after ``run_optimize`` only run containers, matching a copy made
    with the default ``optimize``.
    """
    size = 1000
    seed = BitMap()
    for value in range(size):
        seed.add(value)

    unoptimized = cls(seed, optimize=False)
    stats_before = unoptimized.get_statistics()
    self.assertEqual(seed.get_statistics(), stats_before)
    self.assertEqual(stats_before['n_containers'],
                     stats_before['n_array_containers'])
    self.assertEqual(stats_before['n_values_array_containers'], size)

    self.assertTrue(unoptimized.run_optimize())
    stats_after = unoptimized.get_statistics()
    self.assertEqual(stats_after['n_containers'],
                     stats_after['n_run_containers'])
    self.assertEqual(stats_after['n_values_run_containers'], size)

    optimized_copy = cls(seed)  # optimize is True by default
    self.assertEqual(stats_after, optimized_copy.get_statistics())
def test_immutability(self, values, other, number):
    """A ``FrozenBitMap`` must reject in-place operators and expose no mutating methods."""
    frozen = FrozenBitMap(values)
    reference = FrozenBitMap(values)
    other = BitMap(other)

    # In-place operators are expected to raise TypeError.
    with self.assertRaises(TypeError):
        frozen |= other
    with self.assertRaises(TypeError):
        frozen &= other
    with self.assertRaises(TypeError):
        frozen ^= other
    with self.assertRaises(TypeError):
        frozen -= other
    self.assertEqual(frozen, reference)

    # Mutating methods are expected to be missing (AttributeError).
    mutator_calls = (
        ('add', (number,)),
        ('update', (other,)),
        ('discard', (number,)),
        ('remove', (number,)),
        ('intersection_update', (other,)),
        ('update', (number, number + 10)),
    )
    for method_name, arguments in mutator_calls:
        with self.assertRaises(AttributeError):
            getattr(frozen, method_name)(*arguments)
    self.assertEqual(frozen, reference)
def get(self, UserTag):
    """Write two statistics computed over 500 randomly generated tag rows.

    First the number of rows with ``tag1 == 1`` and ``tag2 == 2`` is
    written, then a dict of the ``id`` values of the rows with
    ``tag1 == 1``. ``UserTag`` is not used.
    NOTE(review): ``self.write`` is presumably a tornado RequestHandler
    method - confirm.
    """
    customer = pd.read_csv(
        '/home/pimpwhippa/Works/tornado_todo/todo/binary_tag.csv')
    # The result is not used below; the lookup is kept so that a missing
    # 'tag' column still fails at this point.
    customer['tag'].to_dict()

    # 500 random sample sizes, then one BitMap of that many values per row.
    sample_sizes = [random.randint(0, 10) for _ in range(500)]
    rows = [BitMap(random.sample(range(1, 11), size)) for size in sample_sizes]

    # DataFrame of the 500 generated rows.
    tag_columns = [
        'tag1', 'tag2', 'tag3', 'tag4', 'tag5',
        'tag6', 'tag7', 'tag8', 'tag9', 'tag10',
    ]
    df = pd.DataFrame(data=rows, columns=tag_columns)

    # Number of rows that have [1, 2] in the first two tag columns.
    has_tag1 = df['tag1'] == 1.0
    has_tag2 = df['tag2'] == 2.0
    self.write(str(len(df.loc[has_tag1 & has_tag2])))

    # Ids of all rows that have 1 in the first tag column.
    df['id'] = range(1, 501)
    ids_with_tag1 = df.loc[df['tag1'] == 1.0]['id']
    self.write(ids_with_tag1.to_dict())
class ManyOperationsTest(Util):
    """Property tests comparing n-ary bitmap operations with folds of the binary operators."""

    @given(hyp_collection, hyp_many_collections, st.booleans())
    def test_update(self, initial_values, all_values, cow):
        """N-ary ``update`` must match the ``|`` fold in value and type."""
        self.initial_bitmap = BitMap(initial_values, copy_on_write=cow)
        self.all_bitmaps = [BitMap(items, copy_on_write=cow)
                            for items in all_values]
        updated = self.initial_bitmap
        updated.update(*all_values)
        # Computed after the in-place call, so the fold includes the
        # already-updated bitmap.
        operands = self.all_bitmaps + [updated]
        expected = functools.reduce(lambda left, right: left | right, operands)
        self.assertEqual(expected, updated)
        self.assertEqual(type(expected), type(updated))

    @given(hyp_collection, hyp_many_collections, st.booleans())
    def test_intersection_update(self, initial_values, all_values, cow):
        """N-ary ``intersection_update`` must match the ``&`` fold in value and type."""
        self.initial_bitmap = BitMap(initial_values, copy_on_write=cow)
        self.all_bitmaps = [BitMap(items, copy_on_write=cow)
                            for items in all_values]
        updated = self.initial_bitmap
        updated.intersection_update(*all_values)
        operands = self.all_bitmaps + [updated]
        expected = functools.reduce(lambda left, right: left & right, operands)
        self.assertEqual(expected, updated)
        self.assertEqual(type(expected), type(updated))

    @given(bitmap_cls, st.data(), hyp_many_collections, st.booleans())
    def test_union(self, cls, data, all_values, cow):
        """``cls.union`` over mixed bitmap classes must match the ``|`` fold and return a ``cls``."""
        # Draw every class first, then build the bitmaps.
        picked = [data.draw(bitmap_cls) for _ in all_values]
        self.all_bitmaps = [make(items, copy_on_write=cow)
                            for make, items in zip(picked, all_values)]
        actual = cls.union(*self.all_bitmaps)
        expected = functools.reduce(
            lambda left, right: left | right, self.all_bitmaps)
        self.assertEqual(expected, actual)
        self.assertIsInstance(actual, cls)

    @given(bitmap_cls, st.data(), hyp_many_collections, st.booleans())
    def test_intersection(self, cls, data, all_values, cow):
        """``cls.intersection`` over mixed bitmap classes must match the ``&`` fold and return a ``cls``."""
        picked = [data.draw(bitmap_cls) for _ in all_values]
        self.all_bitmaps = [make(items, copy_on_write=cow)
                            for make, items in zip(picked, all_values)]
        actual = cls.intersection(*self.all_bitmaps)
        expected = functools.reduce(
            lambda left, right: left & right, self.all_bitmaps)
        self.assertEqual(expected, actual)
        self.assertIsInstance(actual, cls)
def test_intersection_update(self, initial_values, all_values, cow):
    """N-ary ``intersection_update`` must match the ``&`` fold in value and type."""
    self.initial_bitmap = BitMap(initial_values, copy_on_write=cow)
    self.all_bitmaps = [BitMap(items, copy_on_write=cow)
                        for items in all_values]
    updated = self.initial_bitmap
    updated.intersection_update(*all_values)
    # Computed after the in-place call, so the fold includes the
    # already-updated bitmap.
    operands = self.all_bitmaps + [updated]
    expected = functools.reduce(lambda left, right: left & right, operands)
    self.assertEqual(expected, updated)
    self.assertEqual(type(expected), type(updated))
def test_basic(self, values, cow):
    """Exercise ``add``, ``remove`` and ``discard`` and mirror every step on a built-in set."""
    bitmap = BitMap(copy_on_write=cow)
    self.assertEqual(bitmap.copy_on_write, cow)
    expected_set = set()
    self.compare_with_set(bitmap, expected_set)

    values = list(values)
    random.shuffle(values)
    middle = len(values) // 2
    first_half, second_half = values[:middle], values[middle:]

    # Add both halves in turn, comparing after each half.
    for chunk in (first_half, second_half):
        for value in chunk:
            bitmap.add(value)
            expected_set.add(value)
        self.compare_with_set(bitmap, expected_set)

    # remove(): a second removal of the same value must raise KeyError.
    for value in first_half:
        bitmap.remove(value)
        expected_set.remove(value)
        with self.assertRaises(KeyError):
            bitmap.remove(value)
    self.compare_with_set(bitmap, expected_set)

    for value in second_half:
        bitmap.discard(value)
        # check that we can discard element not in the bitmap
        bitmap.discard(value)
        expected_set.discard(value)
    self.compare_with_set(bitmap, expected_set)
def test_intersection_update(self, initial_values, new_values, cow):
    """``intersection_update`` with one collection must match ``&=`` with the equivalent bitmap."""
    updated = BitMap(initial_values, cow)
    reference = BitMap(updated)
    updated.intersection_update(new_values)
    reference &= BitMap(new_values, copy_on_write=cow)
    self.assertEqual(updated, reference)
def test_flip_inplace_empty(self, values, start, end, cow):
    """``flip_inplace`` with ``start >= end`` must leave the bitmap unchanged."""
    st.assume(start >= end)
    original = BitMap(values, copy_on_write=cow)
    flipped = BitMap(original)
    flipped.flip_inplace(start, end)
    self.assertEqual(original, flipped)
def test_flip_inplace(self, values, start, end, cow):
    """Check ``flip_inplace`` for ``start < end``.

    The flip is applied to a copy; the untouched original and the
    flipped copy are then validated by ``self.check_flip``.
    """
    st.assume(start < end)
    original = BitMap(values, copy_on_write=cow)
    flipped = BitMap(original)
    flipped.flip_inplace(start, end)
    self.check_flip(original, flipped, start, end)
def test_incompatible_union(self):
    """Run ``self.incompatible_op`` on ``BitMap.union`` with two and with eight operands."""

    def union_of_two(first, second):
        return BitMap.union(first, second)

    def union_of_many(first, second):
        return BitMap.union(first, first, second, second,
                            first, first, second, second)

    for union_op in (union_of_two, union_of_many):
        self.incompatible_op(union_op)