def setUp(self):
    """Build the input tables and default output prefixes shared by the tests.

    A and B each hold five rows of space-delimited token strings (B includes
    an empty-attribute row via A's id 2); empty_table is a zero-row frame
    with the same schema.
    """
    self.dlm = create_delimiter_tokenizer(' ')
    self.A = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'attr': ['ab cd ef aa bb', '', 'ab', 'll oo he', 'xy xx zz fg'],
    })
    self.B = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'attr': ['zz fg xx', 'he ll', 'xz pl ou', 'aa', 'fg cd aa ef ab'],
    })
    self.empty_table = pd.DataFrame(columns=['id', 'attr'])
    self.default_l_out_prefix = 'l_'
    self.default_r_out_prefix = 'r_'
def setUp(self):
    """Set up tokenizer, input tables, and the candidate set for apply tests.

    Builds left table A (l_id, l_attr) and right table B (r_id, r_attr),
    their cartesian product C as the candidate set, and empty variants of
    all three for edge-case tests.
    """
    self.dlm = create_delimiter_tokenizer(' ')
    self.A = pd.DataFrame([{'l_id': 1, 'l_attr': 'ab cd ef aa bb'},
                           {'l_id': 2, 'l_attr': ''},
                           {'l_id': 3, 'l_attr': 'ab'},
                           {'l_id': 4, 'l_attr': 'll oo pp'},
                           {'l_id': 5, 'l_attr': 'xy xx zz fg'}])
    self.B = pd.DataFrame([{'r_id': 1, 'r_attr': 'mn'},
                           {'r_id': 2, 'r_attr': 'he ll'},
                           {'r_id': 3, 'r_attr': 'xy pl ou'},
                           {'r_id': 4, 'r_attr': 'aa'},
                           {'r_id': 5, 'r_attr': 'fg cd aa ef'}])

    # generate cartesian product A x B to be used as candset, via a
    # constant temporary join key (works on all pandas versions, unlike
    # merge(how='cross') which needs pandas >= 1.2).
    self.A['tmp_join_key'] = 1
    self.B['tmp_join_key'] = 1
    # BUGFIX: drop('tmp_join_key', 1) passed `axis` positionally, which was
    # deprecated in pandas 0.25 and removed in pandas 2.0 (TypeError).
    self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
                      self.B[['r_id', 'tmp_join_key']],
                      on='tmp_join_key').drop(columns='tmp_join_key')

    self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
    self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
    self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])
def setUp(self):
    """Create single-row input tables and the join parameters under test."""
    self.A = pd.DataFrame({'A.id': [1], 'A.attr': ['hello']})
    self.B = pd.DataFrame({'B.id': [1], 'B.attr': ['world']})
    self.tokenizer = create_delimiter_tokenizer()
    self.sim_measure_type = 'JACCARD'
    self.threshold = 0.8
def setUp(self):
    """Create the whitespace-delimiter tokenizer used by the tests."""
    self.dlm = create_delimiter_tokenizer(' ')
def time_overlap_delim_1(self):
    """Benchmark overlap join with a delimiter tokenizer at threshold 1."""
    tokenizer = create_delimiter_tokenizer()
    overlap_join(self.ltable, self.rtable,
                 'id', 'id', 'attr', 'attr',
                 tokenizer, 1)
def time_cosine_delim_07(self):
    """Benchmark cosine join with a delimiter tokenizer at threshold 0.7."""
    tokenizer = create_delimiter_tokenizer()
    cosine_join(self.ltable, self.rtable,
                'id', 'id', 'attr', 'attr',
                tokenizer, 0.7)
def setUp(self):
    """Create single-row input tables, a tokenizer, and the test threshold."""
    self.A = pd.DataFrame({'A.id': [1], 'A.attr': ['hello']})
    self.B = pd.DataFrame({'B.id': [1], 'B.attr': ['world']})
    self.tokenizer = create_delimiter_tokenizer()
    self.threshold = 1
def test_set_sim_join():
    """Nose-style test generator for the set-similarity join.

    Yields one ``(test_function,)`` tuple per combination of similarity
    measure, threshold, and tokenizer, then extra cases exercising output
    attributes, custom output prefixes, and disabled similarity scores.
    Each yielded partial wraps ``test_valid_join`` with a ``description``
    attribute that nose displays as the test name.
    """
    # data to be tested: (csv path, key attribute, join attribute) per table.
    test_scenario_1 = [('data/table_A.csv', 'A.ID', 'A.name'),
                       ('data/table_B.csv', 'B.ID', 'B.name')]
    data = {'TEST_SCENARIO_1': test_scenario_1}

    # similarity measures to be tested.
    sim_measure_types = ['COSINE', 'DICE', 'JACCARD', 'OVERLAP']

    # similarity thresholds to be tested. OVERLAP counts shared tokens, so
    # its thresholds are integers rather than ratios in (0, 1].
    thresholds = {'JACCARD': [0.3, 0.5, 0.7, 0.85, 1],
                  'COSINE': [0.3, 0.5, 0.7, 0.85, 1],
                  'DICE': [0.3, 0.5, 0.7, 0.85, 1],
                  'OVERLAP': [1, 2, 3]}

    # tokenizers to be tested. NOTE(review): '2_GRAM' relies on
    # create_qgram_tokenizer's default q — presumably 2; confirm in its docs.
    tokenizers = {'SPACE_DELIMITER': create_delimiter_tokenizer(),
                  '2_GRAM': create_qgram_tokenizer(),
                  '3_GRAM': create_qgram_tokenizer(3)}

    # Test each combination of similarity measure, threshold and tokenizer
    # for different test scenarios.
    for label, scenario in iteritems(data):
        for sim_measure_type in sim_measure_types:
            for threshold in thresholds.get(sim_measure_type):
                for tok_type, tok in iteritems(tokenizers):
                    test_function = partial(test_valid_join, scenario,
                                            sim_measure_type,
                                            (tok, threshold))
                    test_function.description = 'Test ' + sim_measure_type + \
                        ' with ' + str(threshold) + ' threshold and ' + \
                        tok_type + ' tokenizer for ' + label + '.'
                    # yield a 1-tuple: nose calls the function with no args.
                    yield test_function,

    # Test each similarity measure with output attributes added.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 1,
             ['A.birth_year', 'A.zipcode'],
             ['B.name', 'B.zipcode']))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with output attributes.'
        yield test_function,

    # Test each similarity measure with a different output prefix.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 1,
             ['A.birth_year', 'A.zipcode'],
             ['B.name', 'B.zipcode'],
             'ltable.', 'rtable.'))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with output attributes and prefix.'
        yield test_function,

    # Test each similarity measure with output_sim_score disabled.
    for sim_measure_type in sim_measure_types:
        test_function = partial(
            test_valid_join, test_scenario_1, sim_measure_type,
            (tokenizers['SPACE_DELIMITER'], 1,
             ['A.birth_year', 'A.zipcode'],
             ['B.name', 'B.zipcode'],
             'ltable.', 'rtable.', False))
        test_function.description = 'Test ' + sim_measure_type + \
            ' with sim_score disabled.'
        yield test_function,