def test_merge_indexed_journalled_entries_count(self):
    """Verify the merged value count computed for indexed-string journalled fields."""
    old_ids = np.asarray([0, 0, 0, 1, 1, 2, 3, 3, 5, 5, 5], dtype=np.int32)
    new_ids = np.asarray([0, 2, 3, 4, 5, 6], dtype=np.int32)
    old_map, new_map = ops.ordered_generate_journalling_indices(old_ids, new_ids)

    # Old indexed-string field: indices plus the flattened character values.
    old_inds = np.asarray([0, 2, 4, 6, 9, 12, 14, 17, 20, 23, 26, 29])
    old_entries = [b'aa', b'ab', b'ac', b'baa', b'bab', b'ca',
                   b'daa', b'dab', b'faa', b'fab', b'fac']
    old_vals = np.frombuffer(b''.join(old_entries), dtype='S1')

    expected_inds = np.asarray([0, 2, 4, 6, 9, 12, 14, 17, 20, 23, 26, 29],
                               dtype=np.int32)
    expected_vals = np.asarray(
        [b'a', b'a', b'a', b'b', b'a', b'c', b'b', b'a', b'a', b'b',
         b'a', b'b', b'c', b'a', b'd', b'a', b'a', b'd', b'a', b'b',
         b'f', b'a', b'a', b'f', b'a', b'b', b'f', b'a', b'c'],
        dtype='S1')
    self.assertTrue(np.array_equal(old_inds, expected_inds))
    self.assertTrue(np.array_equal(old_vals, expected_vals))

    # New indexed-string field to compare against the old one.
    new_inds = np.asarray([0, 2, 4, 7, 9, 12, 14])
    new_entries = [b'ad', b'cb', b'dac', b'ea', b'fad', b'ga']
    new_vals = np.frombuffer(b''.join(new_entries), dtype='S1')

    to_keep = np.zeros(len(new_map), dtype=bool)
    ops.compare_indexed_rows_for_journalling(
        old_map, new_map, old_inds, old_vals, new_inds, new_vals, to_keep)

    count = ops.merge_indexed_journalled_entries_count(
        old_map, new_map, to_keep, old_inds, new_inds)
    self.assertTrue(np.array_equal(count, 43))
def test_compare_rows_for_journalling(self):
    """Verify the keep-flags produced by comparing journalled fixed-width rows."""
    old_ids = np.asarray([0, 0, 0, 1, 1, 2, 3, 3, 5, 5, 5], dtype=np.int32)
    new_ids = np.asarray([0, 2, 3, 4, 5, 6], dtype=np.int32)
    old_map, new_map = ops.ordered_generate_journalling_indices(old_ids, new_ids)

    self.assertTrue(np.array_equal(
        old_map, np.asarray([2, 4, 5, 7, -1, 10, -1], dtype=np.int32)))
    self.assertTrue(np.array_equal(
        new_map, np.asarray([0, -1, 1, 2, 3, 4, 5], dtype=np.int32)))

    # Scenario 1: several of the new rows match their old counterparts.
    old_data = np.asarray([0, 1, 2, 10, 11, 20, 30, 31, 50, 51, 52])
    new_data = np.asarray([2, 20, 31, 40, 52, 60])
    flags = np.zeros(len(new_map), dtype=bool)
    ops.compare_rows_for_journalling(old_map, new_map, old_data, new_data, flags)
    self.assertTrue(np.array_equal(
        flags,
        np.asarray([False, False, False, False, True, False, True])))

    # Scenario 2: every new row that has an old counterpart differs from it.
    old_data = np.asarray([0, 1, 2, 10, 11, 20, 30, 31, 50, 51, 52])
    new_data = np.asarray([3, 21, 32, 41, 53, 61])
    flags = np.zeros(len(new_map), dtype=bool)
    ops.compare_rows_for_journalling(old_map, new_map, old_data, new_data, flags)
    self.assertTrue(np.array_equal(
        flags,
        np.asarray([True, False, True, True, True, True, True])))
def test_merge_journalled_entries(self):
    """Verify merged output for journalled fixed-width fields in two scenarios."""
    old_ids = np.asarray([0, 0, 0, 1, 1, 2, 3, 3, 5, 5, 5], dtype=np.int32)
    new_ids = np.asarray([0, 2, 3, 4, 5, 6], dtype=np.int32)
    old_map, new_map = ops.ordered_generate_journalling_indices(old_ids, new_ids)

    # Scenario 1: only genuinely new / changed rows are appended.
    old_data = np.asarray([0, 1, 2, 10, 11, 20, 30, 31, 50, 51, 52])
    new_data = np.asarray([2, 20, 31, 40, 52, 60])
    flags = np.zeros(len(new_map), dtype=bool)
    ops.compare_rows_for_journalling(old_map, new_map, old_data, new_data, flags)
    merged = np.zeros(len(old_ids) + flags.sum(), dtype=old_ids.dtype)
    ops.merge_journalled_entries(old_map, new_map, flags, old_data, new_data, merged)
    self.assertTrue(np.array_equal(
        merged,
        np.asarray([0, 1, 2, 10, 11, 20, 30, 31, 40, 50, 51, 52, 60],
                   dtype=np.int32)))

    # Scenario 2: every present new row differs, so all of them are appended.
    old_data = np.asarray([0, 1, 2, 10, 11, 20, 30, 31, 50, 51, 52])
    new_data = np.asarray([3, 21, 32, 40, 53, 60])
    flags = np.zeros(len(new_map), dtype=bool)
    ops.compare_rows_for_journalling(old_map, new_map, old_data, new_data, flags)
    merged = np.zeros(len(old_ids) + flags.sum(), dtype=old_ids.dtype)
    ops.merge_journalled_entries(old_map, new_map, flags, old_data, new_data, merged)
    self.assertTrue(np.array_equal(
        merged,
        np.asarray([0, 1, 2, 3, 10, 11, 20, 21, 30, 31, 32, 40,
                    50, 51, 52, 53, 60],
                   dtype=np.int32)))
def test_ordered_generate_journalling_indices(self):
    """Verify the paired row maps generated for ordered old/new id sequences."""
    old_ids = np.asarray([0, 0, 0, 1, 1, 2, 3, 3, 5, 5, 5], dtype=np.int32)
    new_ids = np.asarray([0, 2, 3, 4, 5, 6], dtype=np.int32)
    old_map, new_map = ops.ordered_generate_journalling_indices(old_ids, new_ids)

    # -1 marks an id present on one side but absent on the other;
    # otherwise the entry is the index of the last row for that id.
    self.assertTrue(np.array_equal(
        old_map, np.asarray([2, 4, 5, 7, -1, 10, -1], dtype=np.int32)))
    self.assertTrue(np.array_equal(
        new_map, np.asarray([0, -1, 1, 2, 3, 4, 5], dtype=np.int32)))
def journal_table(session, schema, old_src, new_src, src_pk, result):
    """Merge a new snapshot of a table into its journalled history.

    Rows are matched between ``old_src`` and ``new_src`` on the primary key
    ``src_pk``; new rows whose field values differ from the latest old row
    (or that have no old counterpart) are appended, and the merged fields
    are written to ``result``.

    :param session: the session providing field access, sorting and indexing
    :param schema: schema whose field ordering drives the output field order
    :param old_src: mapping of field name -> existing (journalled) field
    :param new_src: mapping of field name -> incoming snapshot field
    :param src_pk: name of the primary-key field shared by both sources
    :param result: destination group in which merged fields are created
    """
    old_keys = set(old_src.keys())
    new_keys = set(new_src.keys())
    common_keys = old_keys.intersection(new_keys)
    # The journalling bookkeeping fields are handled specially, so take
    # them out of the comparison set; discard() tolerates their absence.
    common_keys.discard('j_valid_from')
    common_keys.discard('j_valid_to')

    with utils.Timer("sorting old ids"):
        old_ids = session.get(old_src[src_pk])
        old_ids_ = old_ids.data[:]
        old_ids_valid_from = session.get(old_src['j_valid_from']).data[:]
        # Secondary sort on j_valid_from keeps multiple journalled versions
        # of the same id in chronological order.
        old_sorted_index = session.dataset_sort_index(
            (old_ids_, old_ids_valid_from))
        old_count = len(old_ids_)

    with utils.Timer("sorting new_ids"):
        new_ids_ = session.get(new_src[src_pk]).data[:]
        new_sorted_index = session.dataset_sort_index((new_ids_,))
        new_count = len(new_ids_)

    # Get the row maps for rows that we need to compare.
    with utils.Timer("generating row_maps for merging"):
        old_ids_ = old_ids_[old_sorted_index]
        new_ids_ = new_ids_[new_sorted_index]
        old_map, new_map = ops.ordered_generate_journalling_indices(
            old_ids_, new_ids_)

    # Fix: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the supported spelling (and matches the tests).
    to_keep = np.zeros(len(old_map), dtype=bool)

    # Iterate fields in schema order, restricted to fields both sources share.
    schema_fields = schema.fields.keys()
    common_keys = [k for k in schema_fields if k in common_keys]
    print("old_map:", old_map)
    print("new_map:", new_map)
    for k in common_keys:
        if k in (src_pk, 'j_valid_from', 'j_valid_to'):
            continue
        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        # NOTE(review): to_keep is shared across all fields rather than reset
        # per field — presumably the compare ops accumulate (OR) so a row is
        # kept if ANY field changed; confirm against the ops implementation.
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_indexed_rows_for_journalling(
                old_map, new_map, old_f_i_, old_f_v_, new_f_i_, new_f_v_,
                to_keep)
        else:
            old_f_ = session.apply_index(old_sorted_index, old_f)
            new_f_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_rows_for_journalling(
                old_map, new_map, old_f_, new_f_, to_keep)

    print("to_keep:", to_keep.astype(np.uint8))
    print(to_keep.sum(), len(to_keep))

    # Every old row is retained; each kept new row adds one merged row.
    merged_length = len(old_ids.data) + to_keep.sum()

    # Summary statistics for reporting below.
    only_in_old = 0
    only_in_new = 0
    not_updated = 0
    updated = 0
    for i in range(len(old_map)):
        if old_map[i] == -1:
            only_in_new += 1
        if new_map[i] == -1:
            only_in_old += 1
        if old_map[i] != -1 and to_keep[i]:
            updated += 1
        if new_map[i] != -1 and not to_keep[i]:
            not_updated += 1

    # Second pass: materialise the merged fields into the result group.
    for k in common_keys:
        if k in (src_pk, 'j_valid_from', 'j_valid_to'):
            continue
        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            # Indexed fields need one more index entry than rows, and a value
            # buffer sized by a counting pre-pass.
            dest_i_ = np.zeros(merged_length + 1, old_f_i_.dtype)
            val_count = ops.merge_indexed_journalled_entries_count(
                old_map, new_map, to_keep, old_f_i_, new_f_i_)
            dest_v_ = np.zeros(val_count, old_f_v_.dtype)
            ops.merge_indexed_journalled_entries(
                old_map, new_map, to_keep, old_f_i_, old_f_v_,
                new_f_i_, new_f_v_, dest_i_, dest_v_)
            dest_f = new_f.create_like(result, k)
            dest_f.indices.write(dest_i_)
            dest_f.values.write(dest_v_)
        else:
            old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_v_ = session.apply_index(new_sorted_index, new_f)
            dest_ = np.zeros(merged_length, old_f_v_.dtype)
            ops.merge_journalled_entries(
                old_map, new_map, to_keep, old_f_v_, new_f_v_, dest_)
            dest_f = new_f.create_like(result, k)
            dest_f.data.write(dest_)

    print("old_count:", old_count)
    print("new_count:", new_count)
    print("only in old:", only_in_old)
    print("only in new:", only_in_new)
    print("updated:", updated)
    print("not updated:", not_updated)
    print("post journal count:", merged_length)