def test_join_similar_recursive(self):
    """Check that three mutually similar columns collapse into the first one.

    The result of ``preprocess.join_similar`` is expected to contain a
    single column (the first name) whose values are the per-row sum of
    all three original columns.
    """
    # assumes self.generate_rows returns a list of lists, one inner list
    # per row with one value per column name -- TODO confirm
    names = ["awordorsomething", "getawordorsomething", "getawordorsomethingelse"]
    data = self.generate_rows(names)
    frame = pandas.DataFrame(data=data, columns=names, dtype=int)

    # hand over a deep copy so the frame built above stays untouched;
    # 0.9 is presumably the similarity threshold -- verify against join_similar
    actual = preprocess.join_similar(frame.copy(deep=True), 0.9)

    # expected outcome: everything is merged into the first column
    merged_names = names[:1]
    merged_data = [[record[0] + (record[1] + record[2])] for record in data]
    wanted = pandas.DataFrame(data=merged_data, columns=merged_names, dtype=int)

    self.assertTrue(wanted.equals(actual))
def test_join_similar_with_blacklist(self):
    """Check that only one pair of similar columns is merged.

    Of the six columns, the third ("getawordorsomething") is expected to
    be folded into the second ("awordorsomething"); the remaining columns
    ("adding", "padding", "word1", "word2") must come back unchanged.

    NOTE(review): the test name refers to a blacklist, but no blacklist
    argument is passed to ``join_similar`` here -- presumably a default
    inside ``preprocess`` is relied upon; confirm.
    """
    # assumes self.generate_rows returns a list of lists, one inner list
    # per row with one value per column name -- TODO confirm
    names = ["adding", "awordorsomething", "getawordorsomething", "padding", "word1", "word2"]
    data = self.generate_rows(names)
    frame = pandas.DataFrame(data=data, columns=names, dtype=int)

    # hand over a deep copy so the frame built above stays untouched
    actual = preprocess.join_similar(frame.copy(deep=True), 0.9)

    # expected outcome: column 2 disappears and its values are added to column 1
    kept_names = names[:2] + names[3:]
    merged_data = [
        [record[0], record[1] + record[2]] + record[3:]
        for record in data
    ]
    wanted = pandas.DataFrame(data=merged_data, columns=kept_names, dtype=int)

    self.assertTrue(wanted.equals(actual))