def test_merge_character_pos():
    """Merge two StringTrackers and check the character-position statistics.

    Two trackers are fed different record lists with the default character
    list. The per-character entries in ``char_pos_tracker.char_pos_map`` are
    checked on each tracker individually and then on the merged result.
    """
    first_records = ["abc abc", "93341-1", "912254", "bac tralalala"]
    second_records = [
        "geometric inference ",
        "93341-1",
        "912254",
        "bac tralalala",
        "😀 this is a sale! a ❄️ sale!",
        "this is a long sentence that ends in an A",
        None,
    ]

    left = StringTracker()
    right = StringTracker()
    for value in first_records:
        left.update(value)
    for value in second_records:
        right.update(value)

    # NOTE(review): "NITL" is presumably the bucket for characters outside
    # the tracked character list -- confirm against the tracker implementation.
    left_nitl = left.char_pos_tracker.char_pos_map["NITL"]
    assert left_nitl.count == 2
    assert left_nitl.histogram.get_max_value() == 3
    assert left_nitl.histogram.get_min_value() == 3

    left_a = left.char_pos_tracker.char_pos_map["a"]
    assert left_a.histogram.get_max_value() == 12
    assert left_a.histogram.get_min_value() == 0

    right_nitl = right.char_pos_tracker.char_pos_map["NITL"]
    assert right_nitl.count == 22
    assert right_nitl.histogram.get_max_value() == 39

    # The merged tracker should reflect the records seen by both inputs.
    merged = left.merge(right)
    merged_a = merged.char_pos_tracker.char_pos_map["a"]
    assert merged_a.histogram.get_max_value() == 40

    merged_nitl = merged.char_pos_tracker.char_pos_map["NITL"]
    assert merged_nitl.histogram.get_max_value() == 39
    assert merged_nitl.histogram.get_min_value() == 0
    assert merged_nitl.count == 24
def test_merge_mod_character_lists():
    """Merge StringTrackers that were updated with different character lists.

    One tracker is updated with ``character_list="ab"`` and the other with
    ``character_list="a"``. Character-position statistics are checked on each
    tracker and on the merge result, along with the merged token-length
    histogram maximum.
    """
    first_records = ["abc abc", "93341-1", "912254", "bac tralalala"]
    second_records = [
        "geometric inference ",
        "93341-1",
        "912254",
        "bac tralalala",
        "😀 this is a sale! a ❄️ sale!",
        "this is a long sentence that ends in an A",
        None,
    ]

    left = StringTracker()
    right = StringTracker()
    for value in first_records:
        left.update(value, character_list="ab")
    for value in second_records:
        right.update(value, character_list="a")

    # NOTE(review): "NITL" is presumably the bucket for characters outside
    # the supplied character list -- confirm against the tracker implementation.
    left_nitl = left.char_pos_tracker.char_pos_map["NITL"]
    assert left_nitl.count == 23
    assert left_nitl.histogram.get_max_value() == 11

    left_a = left.char_pos_tracker.char_pos_map["a"]
    assert left_a.histogram.get_max_value() == 12

    right_nitl = right.char_pos_tracker.char_pos_map["NITL"]
    assert right_nitl.count == 102

    right_a = right.char_pos_tracker.char_pos_map["a"]
    assert right_a.histogram.get_max_value() == 40
    assert right_nitl.histogram.get_max_value() == 39

    # Merge the "a"-only tracker into the "ab" tracker and inspect the result.
    merged = left.merge(right)
    merged_a = merged.char_pos_tracker.char_pos_map["a"]
    assert merged_a.histogram.get_max_value() == 40

    merged_nitl = merged.char_pos_tracker.char_pos_map["NITL"]
    assert merged_nitl.histogram.get_max_value() == 39
    assert merged_nitl.count == 125

    assert merged.token_length.histogram.get_max_value() == 10
def test_string_tracker_merge():
    """Check the token-length histogram bounds before and after a merge.

    A tracker fed short records (including ``None`` values) is merged into a
    tracker fed one long sentence. The merged ``token_length`` histogram is
    expected to span the minimum of the first and the maximum of the second.
    """
    short_records = [
        "one",
        "two",
        "three",
        "one",
        "one",
        "One test",
        "six",
        None,
        None,
    ]
    short_tracker = StringTracker()
    for value in short_records:
        short_tracker.update(value)

    short_hist = short_tracker.token_length.histogram
    assert short_hist.get_max_value() == 2
    assert short_hist.get_min_value() == 1

    long_records = ["this is a long sentence that ends in an A"]
    long_tracker = StringTracker()
    for value in long_records:
        long_tracker.update(value)

    long_hist = long_tracker.token_length.histogram
    assert long_hist.get_max_value() == 10
    assert long_hist.get_min_value() == 10

    # Merge the short-record tracker into the long-sentence tracker.
    merged = long_tracker.merge(short_tracker)
    merged_hist = merged.token_length.histogram
    assert merged_hist.get_max_value() == 10
    assert merged_hist.get_min_value() == 1