def test_build_matrix(self):
    """Check the TF-IDF matrices built when only a master series is given.

    Both matrices returned by ``_get_tf_idf_matrices`` are expected to
    equal the same 3x3 matrix.
    """
    names = pd.Series(['foo', 'bar', 'baz'])
    grouper = StringGrouper(names)
    expected_dense = csr_matrix([
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
    ]).toarray()

    master_matrix, duplicate_matrix = grouper._get_tf_idf_matrices()

    # No duplicates series was supplied, so both results are compared
    # against the same expected values.
    for actual in (master_matrix, duplicate_matrix):
        np.testing.assert_array_equal(expected_dense, actual.toarray())
def test_build_matches(self):
    """Check the similarity matrix produced by ``_build_matches`` for two series.

    The first two strings of each series are identical and the third
    differs; the expected result has ones on the first two diagonal
    entries and zeros everywhere else.
    """
    master_names = pd.Series(['foo', 'bar', 'baz'])
    duplicate_names = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master_names, duplicate_names)

    master_matrix, duplicate_matrix = grouper._get_tf_idf_matrices()
    similarity = grouper._build_matches(master_matrix, duplicate_matrix)

    expected_matches = np.array([
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 0.],
    ])
    np.testing.assert_array_equal(expected_matches, similarity.toarray())
def test_build_matrix_master_and_duplicates(self):
    """Check the TF-IDF matrices built from a master and a duplicates series.

    Each returned matrix is compared against its own expected 3x4 matrix;
    the two expectations differ only in their last row.
    """
    master_names = pd.Series(['foo', 'bar', 'baz'])
    duplicate_names = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master_names, duplicate_names)

    master_matrix, duplicate_matrix = grouper._get_tf_idf_matrices()

    expected_master = csr_matrix([
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
    ])
    expected_duplicates = csr_matrix([
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
    ])

    # Master is checked first, then duplicates, as (expected, actual) pairs.
    comparisons = (
        (expected_master, master_matrix),
        (expected_duplicates, duplicate_matrix),
    )
    for expected, actual in comparisons:
        np.testing.assert_array_equal(expected.toarray(), actual.toarray())