def fit(self, train):
    """
    Learn the item-item similarity matrix from a ratings table.

    The ratings are converted with ``self._df_to_sparse_matrix`` and the
    result is kept on ``self.sparse_matrix``.  The learned similarities are
    stored on ``self.similarity_matrix`` as a ``csr_matrix`` of shape
    ``(num_items, num_items)``.

    Parameters
    ==========
    train : pandas DataFrame
        Has columns 'user_id', 'movie_id' and 'rating'.
    """
    self.sparse_matrix = self._df_to_sparse_matrix(train)

    # copied from mrec ItemSimilarityRecommender
    # Always bind ``dataset``: previously it was only assigned when a
    # conversion was needed, so an input that already was a
    # fast_sparse_matrix failed with a NameError below.
    dataset = self.sparse_matrix
    if not isinstance(dataset, fast_sparse_matrix):
        dataset = fast_sparse_matrix(dataset)
    _, num_items = dataset.shape

    # build up a sparse similarity matrix, keeping only non-zero weights
    data = []
    row = []
    col = []
    for j in range(num_items):
        w = self.compute_similarities(dataset, j)
        for k, v in enumerate(w):
            if v != 0:
                data.append(v)
                row.append(j)
                col.append(k)

    # first row of idx holds the row indices, second row the column indices
    idx = np.array([row, col], dtype='int32')
    self.similarity_matrix = csr_matrix((data, idx), (num_items, num_items))
def fit(self, dataset, item_features=None):
    """
    Build the full item-item similarity matrix from user-item data.

    The result is stored on ``self.similarity_matrix`` as a square
    ``csr_matrix`` with one row and one column per item.

    Parameters
    ==========
    dataset : scipy sparse matrix or mrec.sparse.fast_sparse_matrix, shape = [num_users, num_items]
        The matrix of user-item counts, row i holds the counts for the i-th user.
    item_features : array_like, shape = [num_items, num_features]
        Features for items in training set, ignored here.
    """
    # Wrap anything that is not already a fast_sparse_matrix.
    if not isinstance(dataset, fast_sparse_matrix):
        dataset = fast_sparse_matrix(dataset)
    _, num_items = dataset.shape

    # Accumulate the coordinates and values of every non-zero similarity.
    values, rows, cols = [], [], []
    for item in range(num_items):
        weights = self.compute_similarities(dataset, item)
        kept = [(other, weight) for other, weight in enumerate(weights) if weight != 0]
        values.extend(weight for _, weight in kept)
        rows.extend([item] * len(kept))
        cols.extend(other for other, _ in kept)

    # Row indices first, column indices second.
    coords = np.array([rows, cols], dtype='int32')
    self.similarity_matrix = csr_matrix((values, coords), shape=(num_items, num_items))
def fit(self, train):
    """
    Learn factors from training set. User and item factors are
    fitted alternately.

    The fitted factors are stored on ``self.U`` (users) and ``self.V``
    (items).  Progress is printed once per iteration.

    Parameters
    ==========
    train : scipy.sparse.csr_matrix or mrec.sparse.fast_sparse_matrix
        User-item matrix.
    """
    # isinstance (rather than an exact type comparison) also converts
    # csr_matrix subclasses.
    if isinstance(train, csr_matrix):
        train = fast_sparse_matrix(train)

    # NOTE(review): _init presumably sets self.data, self.num_users and
    # self.num_items, which are read below -- confirm against its definition.
    self._init(train)

    self.U = self.init_factors(self.num_users, False)  # don't need values, will compute them
    self.V = self.init_factors(self.num_items)

    for it in range(self.num_iters):
        # Formatted so the output is the same on Python 2 and Python 3.
        print('iteration %s' % it)

        # fit user factors
        VV = self.V.T.dot(self.V)
        for u in range(self.num_users):
            # get (positive i.e. non-zero scored) items for user
            indices = self.data.X[u].nonzero()[1]
            if indices.size:
                self.U[u, :] = self.update(indices, self.V, VV)
            else:
                # no non-zero entries for this user: zero factor vector
                self.U[u, :] = np.zeros(self.d)

        # fit item factors
        UU = self.U.T.dot(self.U)
        for i in range(self.num_items):
            # get users with a non-zero entry for this item
            indices = self.data.fast_get_col(i).nonzero()[0]
            if indices.size:
                self.V[i, :] = self.update(indices, self.U, UU)
            else:
                # no non-zero entries for this item: zero factor vector
                self.V[i, :] = np.zeros(self.d)
def test_init_fast_sparse_matrix():
    """Wrapping a COO, CSR or CSC matrix keeps its contents and shape."""
    coo = get_random_coo_matrix()
    for source in (coo, coo.tocsr(), coo.tocsc()):
        wrapped = fast_sparse_matrix(source)
        # The wrapped matrix is exposed as ``.X``.
        assert_array_equal(wrapped.X.toarray(), source.toarray())
        assert_equal(wrapped.shape, source.shape)
def test_save_load():
    """Save to file as arrays in numpy binary format."""
    X = get_random_coo_matrix()
    m = fast_sparse_matrix(X)
    fd, path = tempfile.mkstemp(suffix='.npz')
    # mkstemp returns an already-open OS-level descriptor; close it right
    # away so it is not leaked (only the path is needed here).
    os.close(fd)
    try:
        m.save(path)
        n = fast_sparse_matrix.load(path)
    finally:
        # Remove the temporary file even if saving or loading fails.
        os.remove(path)
    assert_equal(m.shape, n.shape)
    assert_array_equal(m.X.toarray(), n.X.toarray())
    assert_array_equal(m.col_view.toarray(), n.col_view.toarray())
def test_save_load():
    """Save to file as arrays in numpy binary format."""
    X = get_random_coo_matrix()
    m = fast_sparse_matrix(X)
    fd, path = tempfile.mkstemp(suffix=".npz")
    # mkstemp returns an already-open OS-level descriptor; close it right
    # away so it is not leaked (only the path is needed here).
    os.close(fd)
    try:
        m.save(path)
        n = fast_sparse_matrix.load(path)
    finally:
        # Remove the temporary file even if saving or loading fails.
        os.remove(path)
    assert_equal(m.shape, n.shape)
    assert_array_equal(m.X.toarray(), n.X.toarray())
    assert_array_equal(m.col_view.toarray(), n.col_view.toarray())
def save_sparse_matrix(data, fmt, filepath):
    """
    Write a scipy sparse matrix to a file in the requested format.

    For the tsv and csv formats one line is written per entry stored in
    the COO form of the matrix, as ``row<sep>col<sep>value`` with row and
    column indices shifted to be 1-indexed.  The other formats are
    delegated to ``mmwrite``, ``savez`` and ``fast_sparse_matrix.save``.

    Parameters
    ----------
    data : scipy sparse matrix to save
    fmt : str
        Specifies the file format to write:

        - tsv
        - csv
        - mm  (MatrixMarket)
        - npz (save as npz archive of numpy arrays)
        - fsm (mrec.sparse.fast_sparse_matrix)

    filepath : str
        The file to write to.

    Raises
    ------
    ValueError
        If ``fmt`` is not one of the formats listed above.
    """
    if fmt in ('tsv', 'csv'):
        # The two plain-text formats differ only in the field separator.
        template = '{0}\t{1}\t{2}' if fmt == 'tsv' else '{0},{1},{2}'
        coo = data.tocoo()
        with open(filepath, 'w') as out:
            for r, c, value in zip(coo.row, coo.col, coo.data):
                out.write(template.format(r + 1, c + 1, value) + '\n')
        return

    if fmt == 'mm':
        mmwrite(filepath, data)
        return
    if fmt == 'npz':
        savez(data.tocoo(), filepath)
        return
    if fmt == 'fsm':
        fast_sparse_matrix(data).save(filepath)
        return

    raise ValueError('unknown output format: {0}'.format(fmt))
def test_fast_update_col():
    """Updating a column's stored values is reflected by fast_get_col."""
    X = get_random_coo_matrix().tocsc()
    m = fast_sparse_matrix(X)
    cols = X.shape[1]
    # range instead of the Python-2-only xrange
    for j in range(cols):
        vals = m.fast_get_col(j).data
        # Skip columns with nothing to change (no stored values, or all zero).
        if (vals == 0).all():
            continue
        vals[vals != 0] += 1
        m.fast_update_col(j, vals)
        # Apply the same increment to the reference column taken from X.
        expected = X[:, j].toarray()
        expected[expected != 0] += 1
        assert_array_equal(m.fast_get_col(j).toarray(), expected)
def fit(self, train, S, item_features=None):
    """
    Learn factors from training set. User and item factors are
    fitted alternately.

    Before fitting, a nearest-neighbour index is built on ``S`` and the
    neighbour distances and indices of every row of ``S`` are stored on
    ``self.k_distances`` and ``self.k_indices``.  The fitted factors are
    stored on ``self.U`` (users) and ``self.V`` (items).  Progress is
    printed once per iteration.

    Parameters
    ==========
    train : scipy.sparse.csr_matrix or mrec.sparse.fast_sparse_matrix
        User-item matrix.
    S : item similarities
        Passed to ``NearestNeighbors.fit``/``kneighbors``; presumably one
        row per item -- confirm against callers.
    item_features : array_like, shape = [num_items, num_features]
        Features for each item in the dataset, ignored here.
    """
    self.S = S
    # Does contain itself, hence k + 1
    nbrs = NearestNeighbors(n_neighbors=self.k + 1).fit(self.S)
    self.k_distances, self.k_indices = nbrs.kneighbors(self.S)

    # isinstance (rather than an exact type comparison) also converts
    # csr_matrix subclasses.
    if isinstance(train, csr_matrix):
        train = fast_sparse_matrix(train)
    num_users, num_items = train.shape

    self.U = self.init_factors(num_users, False)  # don't need values, will compute them
    self.V = self.init_factors(num_items)  # Items

    for it in range(self.num_iters):
        # Formatted so the output is the same on Python 2 and Python 3.
        print('iteration %s' % it)

        # fit user factors
        VV = self.V.T.dot(self.V)
        for u in range(num_users):
            # get (positive i.e. non-zero scored) items for user
            indices = train.X[u].nonzero()[1]
            if indices.size:
                self.U[u, :] = self.update(indices, self.V, VV)
            else:
                # no non-zero entries for this user: zero factor vector
                self.U[u, :] = np.zeros(self.d)

        # fit item factors (uses update_item, not update)
        UU = self.U.T.dot(self.U)
        for i in range(num_items):
            # get users with a non-zero entry for this item
            indices = train.fast_get_col(i).nonzero()[0]
            if indices.size:
                self.V[i, :] = self.update_item(indices, self.U, UU)
            else:
                # no non-zero entries for this item: zero factor vector
                self.V[i, :] = np.zeros(self.d)
elif type(learner) == WeightedMf: trainX = X.toScipyCsr() if modelSelect: modelSelectX, userInds = Sampling.sampleUsers2(X, modelSelectSamples) modelSelectX = modelSelectX.toScipyCsc() meanMetrics, stdMetrics = learner.modelSelect(modelSelectX) learner.learnModel(trainX) U = learner.U V = learner.V elif type(learner) == CosineKNNRecommender or type(learner) == SLIM: #We take a subsample of users for these recommenders as they are slow X, userInds = Sampling.sampleUsers2(X, modelSelectSamples) fastTrainX = fast_sparse_matrix(X.toScipyCsr()) trainX = X.toScipyCsr() m, n = trainX.shape learner.fit(fastTrainX) recommendations = learner.range_recommend_items(trainX, 0, m, max_items=maxItems) orderedItems = numpy.zeros((m, maxItems), numpy.int) scores = numpy.zeros((m, maxItems)) for i in range(m): itemScores = numpy.array(recommendations[i]) if itemScores.shape[0] != 0: orderedItems[i, 0:itemScores.shape[0]] = itemScores[:, 0] scores[i, 0:itemScores.shape[0]] = itemScores[:, 1] else:
def test_fast_get_col():
    """fast_get_col returns the same column as slicing the CSC matrix."""
    X = get_random_coo_matrix().tocsc()
    m = fast_sparse_matrix(X)
    num_cols = X.shape[1]
    # range instead of the Python-2-only xrange
    for j in range(num_cols):
        assert_array_equal(m.fast_get_col(j).toarray(), X[:, j].toarray())