def fit(self, ratings, **kwargs):
    timer = util.Stopwatch()

    users = pd.Index(np.unique(ratings['user']))
    items = pd.Index(np.unique(ratings['item']))
    u_no = users.get_indexer(ratings['user'])
    i_no = items.get_indexer(ratings['item'])
    mean = np.mean(ratings['rating'].values, dtype='f4')  # TensorFlow uses 32-bit floats

    model = self._build_model(len(users), len(items), mean)

    _log.info('[%s] training model', timer)
    model.fit([u_no, i_no], ratings['rating'],
              epochs=self.epochs, batch_size=self.batch_size)

    _log.info('[%s] model finished', timer)

    self.user_index_ = users
    self.item_index_ = items
    self.model = model

    return self

def fit(self, ratings, **kwargs):
    """
    Train a model.

    The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on
    other algorithm parameters.

    Args:
        ratings(pandas.DataFrame):
            (user,item,rating) data for computing item similarities.
    """
    util.check_env()
    # Training proceeds in 2 steps:
    # 1. Normalize item vectors to be mean-centered and unit-normalized
    # 2. Compute similarities with pairwise dot products
    self._timer = util.Stopwatch()
    _logger.debug('[%s] beginning fit, memory use %s', self._timer, util.max_memory())
    _logger.debug('[%s] using CSR kernel %s', self._timer, csrk.name)

    init_rmat, users, items = sparse_ratings(ratings)
    n_items = len(items)
    _logger.info('[%s] made sparse matrix for %d items (%d ratings from %d users)',
                 self._timer, len(items), init_rmat.nnz, len(users))
    _logger.debug('[%s] made matrix, memory use %s', self._timer, util.max_memory())

    rmat, item_means = self._mean_center(ratings, init_rmat, items)
    _logger.debug('[%s] centered, memory use %s', self._timer, util.max_memory())

    rmat = self._normalize(rmat)
    _logger.debug('[%s] normalized, memory use %s', self._timer, util.max_memory())

    _logger.info('[%s] computing similarity matrix', self._timer)
    smat = self._compute_similarities(rmat)
    _logger.debug('[%s] computed, memory use %s', self._timer, util.max_memory())

    _logger.info('[%s] got neighborhoods for %d of %d items',
                 self._timer, np.sum(np.diff(smat.rowptrs) > 0), n_items)
    _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

    self.item_index_ = items
    self.item_means_ = item_means
    self.item_counts_ = np.diff(smat.rowptrs)
    self.sim_matrix_ = smat
    self.user_index_ = users
    self.rating_matrix_ = init_rmat

    # create an inverted similarity matrix for efficient scanning
    self._sim_inv_ = smat.transpose()
    _logger.info('[%s] transposed matrix for optimization', self._timer)
    _logger.debug('[%s] done, memory use %s', self._timer, util.max_memory())

    return self

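# A minimal sketch of the two training steps described in the comments above,
# assuming a SciPy sparse rating matrix with users as rows and items as columns.
# `item_cosine_similarities` and `min_sim` here are illustrative names only;
# this is not the actual _mean_center/_normalize/_compute_similarities code.
import numpy as np
import scipy.sparse as sps

def item_cosine_similarities(rmat, min_sim=1.0e-6):
    rmat = sps.csc_matrix(rmat, dtype='f8')
    counts = np.diff(rmat.indptr)                     # ratings per item (column)
    # 1. mean-center each item's observed ratings, then unit-normalize the column
    means = rmat.sum(axis=0).A1 / np.maximum(counts, 1)
    rmat.data -= np.repeat(means, counts)
    norms = np.sqrt(rmat.multiply(rmat).sum(axis=0)).A1
    rmat.data /= np.repeat(np.maximum(norms, 1e-12), counts)
    # 2. pairwise dot products of the normalized columns are cosine similarities
    smat = (rmat.T @ rmat).tocsr()
    smat.data[smat.data < min_sim] = 0.0              # drop weak similarities
    smat.eliminate_zeros()                            # (diagonal self-similarities remain in this sketch)
    return smat
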
def main(args):
    mod_name = args.get('-m')
    input = args.get('--splits')
    output = args.get('-o')
    n_recs = int(args.get('-n'))
    model = args.get('ALGO')

    _log.info(f'importing from module {mod_name}')
    algorithms = importlib.import_module(mod_name)
    algo = getattr(algorithms, model)
    algo = Recommender.adapt(algo)

    path = Path(input)
    dest = Path(output)
    dest.mkdir(exist_ok=True, parents=True)

    ds_def = getattr(datasets, path.name, None)

    for file in path.glob("test-*"):
        test = pd.read_csv(file, sep=',')
        suffix = file.name[5:]
        train_file = path / f'train-{suffix}'
        timer = util.Stopwatch()

        if 'index' in test.columns:
            _log.info('setting test index')
            test = test.set_index('index')
        else:
            _log.warning('no index column found in %s', file.name)

        if train_file.exists():
            _log.info('[%s] loading training data from %s', timer, train_file)
            train = pd.read_csv(path / f'train-{suffix}', sep=',')
        elif ds_def is not None:
            _log.info('[%s] extracting training data from data set %s', timer, path.name)
            train = datasets.ds_diff(ds_def.ratings, test)
            train.reset_index(drop=True, inplace=True)
        else:
            _log.error('could not find training data for %s', file.name)
            continue

        _log.info('[%s] fitting the model', timer)
        # We train in an isolated process to manage resource use
        model = batch.train_isolated(algo, train)

        try:
            _log.info('[%s] generating recommendations for unique users', timer)
            users = test.user.unique()
            recs = batch.recommend(model, users, n_recs)
            _log.info('[%s] writing recommendations to %s', timer, dest)
            recs.to_csv(dest / f'recs-{suffix}', index=False)

            if isinstance(algo, Predictor) and not args['--no-predict']:
                _log.info('[%s] generating predictions for user-item pairs', timer)
                preds = batch.predict(model, test)
                preds.to_csv(dest / f'pred-{suffix}', index=False)
        finally:
            model.close()

def test_stopwatch_minutes():
    w = lku.Stopwatch()
    w.stop()
    w.start_time = w.stop_time - 62
    s = str(w)
    p = re.compile(r'1m2.\d\ds')
    assert p.match(s)

def test_stopwatch_hours():
    w = lku.Stopwatch()
    w.stop()
    w.start_time = w.stop_time - 3663
    s = str(w)
    p = re.compile(r'1h1m3.\d\ds')
    assert p.match(s)

def fit(self, data):
    """
    Run the optimization problem to learn W.

    Args:
        data (DataFrame): a data frame of ratings. Must have at least `user`,
            `item`, and `rating` columns.

    Returns:
        SLIM: the fitted SLIM algorithm object.
    """
    self._timer = util.Stopwatch()

    selector_data = data.copy(deep=True)
    self.selector.fit(selector_data)

    if self.binary:
        data = data.copy(deep=True)
        data['rating'] = 1

    rmat, uidx, iidx = sparse_ratings(data)

    # Optimize each item independently on different threads using joblib
    item_coeff_array_tuples = Parallel(n_jobs=self.nprocs)(
        delayed(self._fs_train_item)(item, iidx, data) for item in iidx.values)

    _logger.info('[%s] completed calculating coefficients for %s items',
                 self._timer, rmat.ncols)

    coeff_row = np.array([], dtype=np.int32)
    coeff_col = np.array([], dtype=np.int32)
    coeff_values = np.array([], dtype=np.float64)

    for itemid, i_pos, ncoeff_row, ncoeff_col, ncoeff_val in item_coeff_array_tuples:
        # Add coefficients with proper indexes for the sparse matrix
        coeff_row = np.append(coeff_row, ncoeff_row)
        coeff_col = np.append(coeff_col, ncoeff_col)
        coeff_values = np.append(coeff_values, ncoeff_val)

    _logger.info('[%s] completed unpacking %s coefficients for %s items',
                 self._timer, len(coeff_values), rmat.ncols)

    coeff_row = np.require(coeff_row, dtype=np.int32)
    coeff_col = np.require(coeff_col, dtype=np.int32)
    coeff_values = np.require(coeff_values, dtype=np.float64)

    # Create the sparse coefficient matrix
    self.coefficients_ = CSR.from_coo(coeff_row, coeff_col, coeff_values,
                                      (len(iidx), len(iidx))).to_scipy()

    self.user_index_ = uidx
    self.item_index_ = iidx
    self.ratings_matrix_ = rmat

    return self

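# A sketch of the per-item parallel training and sparse assembly above, assuming
# scikit-learn's ElasticNet as the per-item solver; the actual _fs_train_item and
# CSR.from_coo code may differ, and `_train_item`/`fit_slim` are illustrative names.
import numpy as np
import scipy.sparse as sps
from joblib import Parallel, delayed
from sklearn.linear_model import ElasticNet

def _train_item(rmat, i, alpha=0.1, l1_ratio=0.5):
    # fit item i's rating column from all other item columns, non-negative coefficients
    y = np.asarray(rmat.getcol(i).todense()).ravel()
    X = rmat.copy()
    X.data[X.indptr[i]:X.indptr[i + 1]] = 0   # exclude the item's own column (CSC layout)
    reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, positive=True, fit_intercept=False)
    reg.fit(X, y)
    cols = np.nonzero(reg.coef_)[0]
    return np.full(len(cols), i, dtype=np.int32), cols.astype(np.int32), reg.coef_[cols]

def fit_slim(rmat, n_jobs=-1):
    rmat = sps.csc_matrix(rmat)
    n_items = rmat.shape[1]
    results = Parallel(n_jobs=n_jobs)(
        delayed(_train_item)(rmat, i) for i in range(n_items))
    rows = np.concatenate([r for r, _, _ in results])
    cols = np.concatenate([c for _, c, _ in results])
    vals = np.concatenate([v for _, _, v in results])
    # assemble all per-item coefficients into one sparse item-item matrix
    return sps.coo_matrix((vals, (rows, cols)), shape=(n_items, n_items)).tocsr()
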
def fit(self, pruned_data):
    self.timer = util.Stopwatch()
    self.user_index = pruned_data.set_index('user')['item']

    self.user_data['count'] = pruned_data.groupby('user')['SO'].count()
    self.user_data.reset_index(inplace=True)
    self.item_data['count'] = pruned_data.groupby('item')['SO'].count()
    self.item_data.reset_index(inplace=True)

    user_prof = self.get_userfeature(pruned_data)
    item_prof = self.get_itemfeature(pruned_data)
    self.similarity_matrix = self.cosine_sim(user_prof, item_prof)
    #_logger.info('[%s] fitting LDA model', self.timer)

    return self

def fit(self, pruned_data):
    self.timer = util.Stopwatch()
    self.review_data = pruned_data
    only_rev = pruned_data.dropna()

    item_rev = pd.DataFrame({
        'review': only_rev.groupby(['item']).review.apply(lambda x: ' '.join(x))
    })
    item_rev.reset_index(inplace=True)
    item_rev['processed_reviews'] = item_rev['review'].apply(lambda row: self.process(row))
    self.item_data = item_rev

    tf_idf_mat = self.tf_idf(self.item_data, 'processed_reviews')
    self.similarity_matrix = self.cosine_sim(tf_idf_mat)
    _logger.info('[%s] fitting tfidf model', self.timer)

    return self

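# A sketch of the self.tf_idf / self.cosine_sim steps above using scikit-learn;
# the actual helper methods may be implemented differently, and
# `tfidf_item_similarities` is an illustrative name.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_item_similarities(item_data, text_col='processed_reviews'):
    # one TF-IDF vector per item, built from its concatenated reviews,
    # then pairwise cosine similarities between the item vectors
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_mat = vectorizer.fit_transform(item_data[text_col].fillna(''))
    return cosine_similarity(tfidf_mat)
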
def fit(self, ratings):
    """
    Train a model.

    The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on
    other algorithm parameters.

    Args:
        ratings(pandas.DataFrame):
            (user,item,rating) data for computing item similarities.
    """
    # Training proceeds in 2 steps:
    # 1. Normalize item vectors to be mean-centered and unit-normalized
    # 2. Compute similarities with pairwise dot products
    self._timer = util.Stopwatch()

    init_rmat, users, items = matrix.sparse_ratings(ratings)
    n_items = len(items)
    _logger.info('[%s] made sparse matrix for %d items (%d ratings from %d users)',
                 self._timer, len(items), init_rmat.nnz, len(users))

    rmat, item_means = self._mean_center(ratings, init_rmat, items)

    rmat = self._normalize(rmat)

    _logger.info('[%s] computing similarity matrix', self._timer)
    smat = self._compute_similarities(rmat)

    _logger.info('[%s] got neighborhoods for %d of %d items',
                 self._timer, np.sum(np.diff(smat.rowptrs) > 0), n_items)
    _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

    self.item_index_ = items
    self.item_means_ = item_means
    self.item_counts_ = np.diff(smat.rowptrs)
    self.sim_matrix_ = smat
    self.user_index_ = users
    self.rating_matrix_ = init_rmat

    return self

def fit(self, pruned_data):
    self.timer = util.Stopwatch()
    self.review_data = pruned_data
    only_rev = pruned_data.dropna()

    item_rev = pd.DataFrame({
        'review': only_rev.groupby(['item']).review.apply(lambda x: ' '.join(x))
    })
    item_rev.reset_index(inplace=True)
    #item_rev['processed_reviews'] = item_rev['review'].apply(lambda row: self.process(row))
    self.item_data = item_rev

    self.LDA_matrix = self.LDA(self.item_data, 'review')
    #self.LDA_matrix = self.LDA(self.item_data, 'processed_reviews')

    self.user_index = self.review_data.set_index('user')['item']
    self.item2index = pd.Index(self.item_data.item.unique(), name='item')
    _logger.info('[%s] fitting LDA model', self.timer)

    return self

def fit(self, ratings, **kwargs):
    timer = util.Stopwatch()

    normed = self.bias.fit_transform(ratings, indexes=True)
    model = self._build_model(len(self.bias.user_offsets_),
                              len(self.bias.item_offsets_))

    _log.info('[%s] training model', timer)
    model.fit([normed['uidx'], normed['iidx']], normed['rating'],
              epochs=self.epochs, batch_size=self.batch_size)

    _log.info('[%s] model finished, extracting weights', timer)
    self.user_features_ = model.get_layer('user-embed').get_weights()[0]
    self.item_features_ = model.get_layer('item-embed').get_weights()[0]

    self.global_bias_ = self.bias.mean_
    self.user_bias_ = self.bias.user_offsets_.values
    self.item_bias_ = self.bias.item_offsets_.values
    self.user_index_ = self.bias.user_index
    self.item_index_ = self.bias.item_index

    return self

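# A sketch of how the parameters extracted above could be combined to score a
# single (user, item) pair; `algo` is assumed to be a fitted instance of the
# class this method belongs to, and `score_one` is a hypothetical helper, not
# part of the original API.
import numpy as np

def score_one(algo, user, item):
    uidx = algo.user_index_.get_loc(user)
    iidx = algo.item_index_.get_loc(item)
    # global mean + user/item bias offsets + embedding dot product (the learned residual)
    score = algo.global_bias_ + algo.user_bias_[uidx] + algo.item_bias_[iidx]
    score += np.dot(algo.user_features_[uidx], algo.item_features_[iidx])
    return score
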
def fit(self, ratings, **kwargs):
    timer = util.Stopwatch()
    rng = util.rng(self.rng_spec)

    matrix, users, items = sparse_ratings(ratings[['user', 'item']])

    _log.info('[%s] setting up model', timer)
    train, model = self._build_model(len(users), len(items))

    _log.info('[%s] preparing training dataset', timer)
    train_data = BprInputs(matrix, self.batch_size, self.neg_count, rng)

    _log.info('[%s] training model', timer)
    train.fit(train_data, epochs=self.epochs)

    _log.info('[%s] model finished', timer)

    self.user_index_ = users
    self.item_index_ = items
    self.model = model

    return self

def fit(self, ratings):
    """
    Train a model.

    The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on
    other algorithm parameters.

    Args:
        ratings(pandas.DataFrame):
            (user,item,rating) data for computing item similarities.
    """
    # Training proceeds in 2 steps:
    # 1. Normalize item vectors to be mean-centered and unit-normalized
    # 2. Compute similarities with pairwise dot products
    self._timer = util.Stopwatch()

    init_rmat, users, items = matrix.sparse_ratings(ratings)

    '''
    # Find a user rating to remove for experimenting with the unlearn algorithm
    # Try to find non-trivial rated items to remove
    for index, row in ratings.iterrows():
        if items.get_loc(row['item']) in [17, 138, 22, 83, 76, 31, 92]:
            #print(row['user'], row['item'], index, users.get_loc(row['user']), items.get_loc(row['item']))
            pass
    '''

    n_items = len(items)
    _logger.info('[%s] made sparse matrix for %d items (%d ratings from %d users)',
                 self._timer, len(items), init_rmat.nnz, len(users))

    start = time.time()
    rmat_scipy = init_rmat.to_scipy()
    self._compute_similarities_unlearn_min_centering_sparse_vectorize(rmat_scipy, items, users)
    end = time.time()
    learn_unlearn_time = end - start
    print("Unlearn Supported Learning: {}".format(end - start))

    rows, cols, vals = self.smat_unlearn_sparse_csr
    self.smat_unlearn_sparse = sps.csr_matrix((vals, (rows, cols)), shape=(self.M, self.M))
    # Print out the similarity matrix to verify completeness
    #print(self.smat_unlearn_sparse)

    start = time.time()
    self._unlearn_min_centering_sparse(54, 17, rmat_scipy, self.smat_unlearn_sparse)
    end = time.time()
    unlearn_time = end - start
    print("Unlearn: {}".format(end - start))

    start = time.time()
    rmat, item_means = self._mean_center(ratings, init_rmat, items, users)
    rmat = self._normalize(rmat)
    _logger.info('[%s] computing similarity matrix', self._timer)
    smat = self._compute_similarities(rmat, items, users)
    end = time.time()
    native_learn_time = end - start
    # Print out the similarity matrix to verify completeness
    #print(smat.to_scipy())
    print("Native Learning: {}".format(end - start))

    _logger.info('[%s] got neighborhoods for %d of %d items',
                 self._timer, np.sum(np.diff(smat.rowptrs) > 0), n_items)
    _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

    self.item_index_ = items
    self.item_means_ = item_means
    self.item_counts_ = np.diff(smat.rowptrs)
    self.sim_matrix_ = smat
    self.user_index_ = users
    self.rating_matrix_ = init_rmat

    # Save the time-cost evaluation results
    #f = open("output_matrix.csv", "a+")
    #f.write("{},{},{},{}\n".format(init_rmat.nnz, native_learn_time, learn_unlearn_time, unlearn_time))
    #f.close()

    return self

def test_stopwatch_instant():
    w = lku.Stopwatch()
    assert w.elapsed() > 0

def test_stopwatch_stop():
    w = lku.Stopwatch()
    time.sleep(0.5)
    w.stop()
    time.sleep(0.5)
    assert w.elapsed() >= 0.45

def test_stopwatch_str():
    w = lku.Stopwatch()
    time.sleep(0.5)
    s = str(w)
    assert s.endswith('ms')

def test_stopwatch_long_str():
    w = lku.Stopwatch()
    time.sleep(1.2)
    s = str(w)
    assert s.endswith('s')

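# A minimal Stopwatch sketch that is consistent with the tests above (elapsed(),
# stop(), start_time/stop_time, and the ms/s/m/h string formats); the real
# lenskit.util.Stopwatch may differ in its details.
import time


class Stopwatch:
    start_time = None
    stop_time = None

    def __init__(self, start=True):
        if start:
            self.start()

    def start(self):
        self.start_time = time.perf_counter()

    def stop(self):
        self.stop_time = time.perf_counter()

    def elapsed(self):
        # elapsed time so far, or total time if the watch has been stopped
        stop = self.stop_time
        if stop is None:
            stop = time.perf_counter()
        return stop - self.start_time

    def __str__(self):
        elapsed = self.elapsed()
        if elapsed < 1:
            return '{:0.0f}ms'.format(elapsed * 1000)
        elif elapsed > 60 * 60:
            h, rem = divmod(elapsed, 60 * 60)
            m, s = divmod(rem, 60)
            return '{:0.0f}h{:0.0f}m{:0.2f}s'.format(h, m, s)
        elif elapsed > 60:
            m, s = divmod(elapsed, 60)
            return '{:0.0f}m{:0.2f}s'.format(m, s)
        else:
            return '{:0.2f}s'.format(elapsed)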