def test_lsi_helper_class():
    import scipy.sparse

    X = scipy.sparse.rand(100, 10000)
    lsi = _TruncatedSVD_LSI(n_components=20)
    lsi.fit(X)
    X_p = lsi.transform_lsi(X)
    X_p2 = lsi.transform_lsi_norm(X)
    assert lsi.components_.shape == (20, X.shape[1])
    assert X_p.shape == (100, 20)
    assert X_p2.shape == (100, 20)
def fit_transform(self, n_components=150, n_iter=5, alpha=0.33):
    """ Perform the SVD decomposition

    Parameters
    ----------
    n_components : int
        number of selected singular values (number of LSI dimensions)
    n_iter : int
        number of iterations for the stochastic SVD algorithm
    alpha : float
        parameter passed to _compute_lsi_dimensionality that controls how
        the number of LSI dimensions scales with the dataset shape

    Returns
    -------
    lsi : _BaseWrapper
        the TruncatedSVD object
    exp_var : float
        the explained variance of the SVD decomposition
    """
    parent_id = self.pipeline.mid

    dsid_dir = self.fe.dsid_dir
    if not dsid_dir.exists():
        raise IOError('Dataset folder {} does not exist'.format(dsid_dir))

    pars = {'parent_id': parent_id, 'n_components': n_components}

    mid_dir_base = dsid_dir / self._wrapper_type
    mid, mid_dir = setup_model(mid_dir_base, mid=self.mid, mode=self.mode)

    ds = self.pipeline.data
    n_components_opt = _compute_lsi_dimensionality(n_components, *ds.shape,
                                                   alpha=alpha)
    lsi = _TruncatedSVD_LSI(n_components=n_components_opt,
                            n_iter=n_iter,
                            random_state=self.random_state)
    lsi.fit(ds)

    ds_p = lsi.transform_lsi_norm(ds)

    joblib.dump(pars, str(mid_dir / 'pars'))
    joblib.dump(lsi, str(mid_dir / 'model'))
    joblib.dump(ds_p, str(mid_dir / 'data'))

    exp_var = lsi.explained_variance_ratio_.sum()
    self.mid = mid

    return lsi, exp_var
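# For context: a rough, self-contained sketch of the LSI-style reduction that
# fit_transform performs, using plain scikit-learn. It omits the FreeDiscovery
# specific pieces (_compute_lsi_dimensionality, setup_model, the joblib
# persistence) and does not reproduce _TruncatedSVD_LSI's exact projection;
# names below are illustrative only.
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

X = scipy.sparse.rand(200, 5000, density=0.01, random_state=0)
svd = TruncatedSVD(n_components=50, n_iter=5, random_state=0)
X_lsi = svd.fit_transform(X)                    # documents projected onto 50 latent dimensions
X_lsi_norm = normalize(X_lsi)                   # L2-normalised rows (assumed analogue of transform_lsi_norm)
exp_var = svd.explained_variance_ratio_.sum()   # explained variance, as returned by fit_transform above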
def test_lsi_book_example():
    """ LSI example taken from the "Information Retrieval" (2004) book
    by Grossman & Frieder

    This illustrates the general principle of LSI using the sklearn API
    with _TruncatedSVD_LSI
    """
    # replacing "a" with "aa", as single-character tokens are dropped by
    # CountVectorizer's default token_pattern
    documents = ["Shipment of gold damaged in aa fire.",
                 "Delivery of silver arrived in aa silver truck.",
                 "Shipment of gold arrived in aa truck.",
                 ]
    query = "gold silver truck"

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import scipy.linalg

    dm_vec = CountVectorizer()
    dm_vec.fit(documents)
    X = dm_vec.transform(documents)
    assert X.shape[1] == 11
    # total number of term counts in the document-term matrix
    assert X.sum() == 22
    q = dm_vec.transform([query])

    lsi = _TruncatedSVD_LSI(n_components=2)
    lsi.fit(X)
    X_p = lsi.transform_lsi(X)
    q_p = lsi.transform_lsi(q)

    # reference computation: fold the documents and the query into the latent
    # space with U_k and the inverse singular values
    U, s, Vh = scipy.linalg.svd(X.todense().T, full_matrices=False)
    q_p_2 = q.dot(U[:, :-1]).dot(np.diag(1. / s[:-1]))
    assert_allclose(np.abs(q_p_2), np.array([[0.2140, 0.1821]]), 1e-3)
    X_p_2 = X.dot(U[:, :-1]).dot(np.diag(1. / s[:-1]))
    assert_allclose(np.abs(X_p_2), np.abs(X_p))
    assert_allclose(np.abs(q_p_2), np.abs(q_p))

    # cosine similarity of the query with each document
    # (only the first two values are checked)
    D = cosine_similarity(X_p, q_p)
    assert_allclose(D[:2], np.array([-0.05, 0.9910, 0.9543])[:2, None],
                    2e-2, 1e-2)
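# For reference, the "fold-in" used in the test above: a document or query
# vector in term space is projected into the k-dimensional latent space as
# v_k = v @ U_k @ inv(Sigma_k), where U_k and Sigma_k come from the truncated
# SVD of the term-document matrix X.T. A minimal numpy sketch (illustrative
# names only, not FreeDiscovery API):
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(5, 8)                              # documents x terms
q = rng.rand(1, 8)                              # a query in term space
U, s, _ = np.linalg.svd(X.T, full_matrices=False)
k = 2
fold = U[:, :k] @ np.diag(1.0 / s[:k])          # term space -> latent space
X_k = X @ fold                                  # documents in the latent space
q_k = q @ fold                                  # query folded into the same space
assert X_k.shape == (5, k) and q_k.shape == (1, k)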
def test_search(kind):
    # check that the search algorithm actually works
    corpus = ["To be, or not to be; that is the question;",
              "Whether ‘tis nobler in the mind to suffer",
              "The slings and arrows of outrageous fortune,",
              "Or to take arms against a sea of troubles,",
              "And by opposing end them. To die: to sleep:",
              "Nor more; and by a sleep to say we end",
              "The heart-ache and the thousand natural shocks",
              "That flesh is heir to; ‘tis a consummation",
              "Devoutly to be wished. To die; to sleep;",
              "To sleep: perchance to dream: aye, there is the rub;",
              "For in that sleep of death what dreams may come,",
              "When we have shuffled off this mortal coil,",
              "Must give us pause: there’s the respect",
              "That makes calamity of so long life;"]

    vect = CountVectorizer()
    X_tf = vect.fit_transform(corpus)
    idf = SmartTfidfTransformer('nnc')
    X_vect = idf.fit_transform(X_tf)

    if kind == 'semantic':
        lsi = _TruncatedSVD_LSI(n_components=20)
        lsi.fit(X_vect)
        X = lsi.transform_lsi_norm(X_vect)
    else:
        lsi = None
        X = X_vect

    s = Search(vect, idf, lsi)
    s.fit(X)

    for query, best_id in [(corpus[2], 2), ('death dreams', 10)]:
        dist = s.search(query)
        assert dist.shape == (X.shape[0],)
        assert dist.argmax() == best_id
        # the returned scores are cosine similarities, so they must lie in [-1, 1]
        assert_array_less(dist, 1.001)
        assert_array_less(-1 - 1e-9, dist)
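# For context: a minimal sketch of the cosine-similarity search exercised by
# the test above, using only scikit-learn (illustrative code, not the
# FreeDiscovery Search API).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["the slings and arrows of outrageous fortune",
        "to die to sleep no more",
        "to sleep perchance to dream"]
vect = TfidfVectorizer()
X = vect.fit_transform(docs)                     # one tf-idf row per document
q = vect.transform(["perchance to dream"])       # query mapped into the same vector space
scores = cosine_similarity(X, q).ravel()         # one similarity per document, in [-1, 1]
assert scores.argmax() == 2                      # the closest document wins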