def train_step(user_ids, per_user_count, per_user_item_ids, per_user_ratings):
    """Run one optimization step on a single training batch.

    Samples a fresh negative-item batch, applies the gradient update, and
    every FLAGS.summary_every steps records summaries and logs progress.
    Returns the batch loss.
    """
    # Draw one negative item list per user, matching the positive width.
    neg_ids = ratings.get_batch_neg(user_ids, per_user_item_ids.shape[1])
    feed = {
        model.input_user_ids: user_ids,
        model.input_per_user_count: per_user_count,
        model.input_per_user_item_ids: per_user_item_ids,
        model.input_per_user_neg_ids: neg_ids,
        model.input_per_user_ratings: per_user_ratings,
    }
    # Apply the update first, then re-evaluate loss/rate at the new step.
    sess.run(train_op, feed)
    step, loss, rate = sess.run([global_step, model.loss, learning_rate], feed)
    if step % FLAGS.summary_every == 0:
        train_summary_writer.add_summary(sess.run(train_summary_op, feed), step)
    time_str = datetime.now().isoformat()
    if step % FLAGS.summary_every == 0:
        print_flush("{}: step {}, loss {:g}, rate {:g}".format(
            time_str, step, loss, rate))
    return loss
def runall():
    """Train the full model once, dump its embeddings, and score it.

    Results (final loss and NDCG/MRR/precision@10 on a sample of validation
    users) are appended to results.txt and also returned as a dict of lists.
    """
    res = defaultdict(list)
    with open('results.txt', 'a') as f:
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=FLAGS.allow_soft_placement,
                log_device_placement=FLAGS.log_device_placement)
            # Grow GPU memory on demand instead of grabbing it all up front.
            session_conf.gpu_options.allow_growth = True
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                model = PredictionModel(
                    num_users=ratings.num_users,
                    num_items=ratings.num_items,
                    num_ratings=len(ratings.train),
                    embedding_dim=FLAGS.embedding_dim,
                    # Global rating mean — presumably the model's bias
                    # baseline; confirm against PredictionModel.
                    mu=np.mean(ratings.train['rating']),
                    alpha=FLAGS.alpha,
                    reg_lambda=FLAGS.reg_lambda,
                )
                # Single run; raise the range bound to average several runs.
                for i in range(1):
                    last_loss = train(model, sess, 1e0, 40000, 0.5,
                                      FLAGS.training_stop_after)
                    f.write('loss: {}\n'.format(last_loss))
                    f.flush()
                    res['loss'].append(last_loss)
                    # Persist learned embeddings: user, item, item bias.
                    U, V, Vb = sess.run(model.get_embedding_mats())
                    np.savetxt('results-' + dataset_name + '/ours2.u.txt',
                               U, delimiter=',')
                    np.savetxt('results-' + dataset_name + '/ours2.v.txt',
                               V, delimiter=',')
                    np.savetxt('results-' + dataset_name + '/ours2.vb.txt',
                               Vb, delimiter=',')
                    # Score a random sample of up to 1000 validation users.
                    numtest = 1000
                    testids = np.random.permutation(
                        list(set(ratings.val['user_id'])))[:numtest]
                    predictions = np.matmul(
                        U[testids], np.transpose(V)) + np.transpose(Vb)
                    ndcg, mrr, precision = calc_scores.calc_scores(
                        ratings.val, testids, predictions, 10)
                    f.write(repr((ndcg, mrr, precision)) + '\n')
                    f.write('\n')
                    f.flush()
                    res['ndcg_at_10'].append(ndcg)
                    res['mrr_at_10'].append(mrr)
                    res['precision_at_10'].append(precision)
    print_flush(res)
    return res
def calc_top(user_ids, predictions, k, verbose):
    """Return the top-k predicted item ids for each user.

    Args:
        user_ids: iterable of user ids; user_ids[i] corresponds to row i of
            `predictions`.
        predictions: 2-D array of predicted scores, one row per user.
        k: number of top items to keep per user.
        verbose: if True, log progress every 1000 users.

    Returns:
        dict mapping user_id -> np.array of (up to) k item ids with the
        highest predicted score, ordered by descending score.
    """
    per_user_top_rankings = dict()
    for i, user_id in enumerate(user_ids):
        if verbose and ((i + 1) % 1000) == 0:
            print_flush(' {}...'.format(i + 1))
        user_predictions = predictions[i]
        num_items = len(user_predictions)
        if k < num_items:
            # np.argpartition is O(n) — far cheaper than a full sort when
            # only the top k of many items are needed.
            top_k = np.argpartition(-user_predictions, k)[:k]
        else:
            # BUGFIX: argpartition requires kth < n and raised for
            # k >= num_items; fall back to a full argsort in that case.
            top_k = np.argsort(user_predictions)[-k:]
        # Order the selected indices by descending score.
        top_predicted_item_ids = top_k[np.argsort(
            user_predictions[top_k])[::-1]]
        per_user_top_rankings[user_id] = top_predicted_item_ids
    return per_user_top_rankings
def calc_scores(true_ratings, user_ids, predictions, k, save_path=None,
                verbose=True):
    """Compute ranking metrics for the given users.

    Builds the per-user top-k rankings from `predictions`, optionally dumps
    them to `save_path` (best effort: a write failure is logged and ignored),
    and returns the metric tuple produced by calc_scores_.
    """
    if verbose:
        print_flush('Calculating scores on {} users'.format(len(user_ids)))
    top_rankings = calc_top(user_ids, predictions, k, verbose)
    if save_path is not None:
        # Best-effort save; scoring proceeds even if the dump fails.
        try:
            with open(save_path, 'w') as f:
                f.write(repr(top_rankings))
        except Exception as e:
            print_flush(e)
    return calc_scores_(true_ratings, predictions.shape[1], k, top_rankings,
                        verbose)
def runall():
    """Train the PMF baseline for each alpha, dump embeddings, and score it.

    Results are appended to results.txt and returned as a nested dict:
    res[alpha][metric_name] -> list of values.
    """
    res = defaultdict(lambda: defaultdict(list))
    with open('results.txt', 'a') as f:
        # for alpha in [0.0, 0.1, 0.2, 0.3, 0.4]:
        # for alpha in [0.0]:
        for alpha in [1.0]:
            with tf.Graph().as_default():
                session_conf = tf.ConfigProto(
                    allow_soft_placement=FLAGS.allow_soft_placement,
                    log_device_placement=FLAGS.log_device_placement)
                # Grow GPU memory on demand rather than reserving it all.
                session_conf.gpu_options.allow_growth = True
                sess = tf.Session(config=session_conf)
                with sess.as_default():
                    model = PredictionModel(
                        num_users=ratings.num_users,
                        num_items=ratings.num_items,
                        num_ratings=len(ratings.train),
                        embedding_dim=FLAGS.embedding_dim,
                        alpha=alpha,
                        reg_lambda=FLAGS.reg_lambda,
                    )
                    # Single run per alpha; bump the range to average runs.
                    for i in range(1):
                        f.write('alpha: {}\n'.format(alpha))
                        last_loss = train(model, sess, 1e0, 40000, 0.5,
                                          FLAGS.training_stop_after)
                        f.write('loss: {}\n'.format(last_loss))
                        f.flush()
                        # NOTE(review): key is 'loss:' (with a colon), unlike
                        # the other runall variant's 'loss' — probably a typo,
                        # but downstream consumers may rely on it; verify
                        # before renaming.
                        res[alpha]['loss:'].append(last_loss)
                        U, V = sess.run(model.get_embedding_mats())
                        np.savetxt('ml-100k-take1.pmf.u.txt', U, delimiter=',')
                        np.savetxt('ml-100k-take1.pmf.v.txt', V, delimiter=',')
                        predictions = np.matmul(U, np.transpose(V))
                        # NOTE(review): called with 3 args
                        # (true_ratings, predictions, k) and unpacked into
                        # (ndcg, mrr) — this expects a different calc_scores
                        # variant than the 4-arg one defined in this repo;
                        # confirm which module is imported here.
                        ndcg, mrr = calc_scores.calc_scores(
                            ratings.val, predictions, 10)
                        f.write(repr((ndcg, mrr)) + '\n')
                        f.write('\n')
                        f.flush()
                        # res[alpha]['precision_at_10'].append(precision_at_10)
                        res[alpha]['ndcg_at_10'].append(ndcg)
                        res[alpha]['mrr_at_10'].append(mrr)
    print_flush(res)
    return res
def val_step(user_ids, per_user_count, per_user_item_ids, per_user_ratings,
             writer=None):
    """Evaluate the model on one validation batch.

    No training op is run, so the model is not updated.  If `writer` is
    given, the validation summaries are recorded at the current global step.
    Returns the validation loss.
    """
    feed = {
        model.input_user_ids: user_ids,
        model.input_per_user_count: per_user_count,
        model.input_per_user_item_ids: per_user_item_ids,
        model.input_per_user_ratings: per_user_ratings,
    }
    step, summaries, loss = sess.run(
        [global_step, val_summary_op, model.loss], feed)
    print_flush("{}: step {}, loss {:g}".format(
        datetime.now().isoformat(), step, loss))
    if writer:
        writer.add_summary(summaries, step)
    return loss
def main():
    """Entry point: recompute ranking metrics from a saved top-rankings file.

    Expects two CLI arguments (dataset name and model name), loads the
    dataset splits, reads the previously saved per-user top rankings, and
    prints NDCG/MRR/precision on the validation set.
    """
    if len(sys.argv) != 3:
        print_flush(
            'Usage: python3 calc_scores_from_top_rankings.py <dataset_name> <model_name>'
        )
        exit(1)
    dataset = sys.argv[1]
    model = sys.argv[2]
    # dataset = 'yelp-take1'
    # model = 'popularity'
    ratings = RatingsData.RatingsData.from_files(dataset + '.train.txt',
                                                 dataset + '.val.txt')
    save_path = 'results-' + dataset + '/' + model + '.top_rankings.txt'
    # CONSISTENCY FIX: use print_flush throughout, like the rest of the
    # codebase (these were plain print() calls, losing the explicit flush).
    print_flush('Loading top rankings from {}'.format(save_path))
    with open(save_path) as f:
        print_flush('Reading file...')
        # The file is a repr() of a dict of numpy arrays; rewrite 'array'
        # so it resolves against the np import below.
        r = f.read().replace('array', 'np.array')
        print_flush('Processing file...')
        # SECURITY: eval() executes arbitrary code from the file. Only run
        # this on rankings files produced by this project itself.
        per_user_top_rankings = eval(r)
    # All users share the same k, so take it from any entry.
    k = len(next(iter(per_user_top_rankings.values())))
    print_flush('Calculating scores...')
    ndcg, mrr, precision = calc_scores_(ratings.val, ratings.num_items, k,
                                        per_user_top_rankings, verbose=True)
    print_flush(
        'Results on VALIDATION set: NDCG@{}={}, MRR@{}={}, P@{}={}'.format(
            k, ndcg, k, mrr, k, precision))
def main():
    """Entry point: load saved embedding matrices for <dataset>/<model>
    from text dumps and run the scoring wrapper on them."""
    if len(sys.argv) != 3:
        print_flush(
            'Usage: python3 calc_scores.py <dataset_name> <model_name>')
        exit(1)
    dataset = sys.argv[1]
    model = sys.argv[2]
    # dataset = 'yelp-take1'
    # model = 'popularity'
    ratings = RatingsData.RatingsData.from_files(dataset + '.train.txt',
                                                 dataset + '.val.txt')

    def loadmat(path):
        # Parse a comma-separated matrix dump; all rows must share a width.
        with open(path) as f:
            rows = [[float(tok) for tok in line.split(',')]
                    for line in f.readlines()]
            widths = list(map(len, rows))
            assert np.min(widths) == np.max(widths)
            return np.array(rows)

    prefix = 'results-' + dataset + '/' + model
    U = loadmat(prefix + '.u.txt')
    assert U.shape[0] == ratings.num_users, (U.shape, ratings.num_users)
    V = loadmat(prefix + '.v.txt')
    assert V.shape[0] == ratings.num_items, (V.shape, ratings.num_items)
    # verify that the embedding dim is the same
    assert U.shape[1] == V.shape[1], (U.shape, V.shape)
    Vb = None
    try:
        # The item-bias vector is optional; its absence is handled below.
        Vb = loadmat(prefix + '.vb.txt')
        assert Vb.shape == (ratings.num_items, 1), Vb.shape
    except FileNotFoundError as e:
        print_flush('Skipping item bias: {}'.format(e))
    save_path = prefix + '.top_rankings.txt'
    calc_wrapper(ratings, U, V, Vb, save_path)
def calc_scores_(true_ratings, num_items, k, per_user_top_rankings, verbose):
    """Compute mean NDCG@k, MRR@k and precision@k over the given rankings.

    Args:
        true_ratings: structured array with 'user_id', 'item_id' and
            'rating' fields holding the ground-truth ratings.
        num_items: total item count (length of the dense rating vector).
        k: ranking cutoff; every ranking must have exactly k entries.
        per_user_top_rankings: dict user_id -> array of k predicted item
            ids ordered by descending predicted score.
        verbose: if True, log progress every 1000 users.

    Returns:
        (ndcg, mrr, precision) averaged over all ranked users; (0.0, 0.0,
        0.0) if the rankings dict is empty.
    """

    def calc_dcg(top_k_ratings):
        # Exponential-gain DCG with the standard log2 position discount.
        assert len(top_k_ratings) == k
        return np.sum(((2**top_k_ratings) - 1) / np.log2(2 + np.arange(k)))

    # ROBUSTNESS: avoid ZeroDivisionError on an empty rankings dict.
    if not per_user_top_rankings:
        return (0.0, 0.0, 0.0)
    ndcg = 0
    mrr = 0
    precision = 0
    for i, (user_id, top_predicted_item_ids) in enumerate(
            per_user_top_rankings.items()):
        if verbose and ((i + 1) % 1000) == 0:
            print_flush(' {}...'.format(i + 1))
        user_ratings = true_ratings[true_ratings['user_id'] == user_id]
        assert len(user_ratings) != 0
        # Dense rating vector for this user; unrated items stay 0.
        user_all_ratings = np.zeros([num_items], dtype=np.float32)
        user_all_ratings[user_ratings['item_id']] = user_ratings['rating']
        top_predicted_ratings = user_all_ratings[top_predicted_item_ids]
        dcg = calc_dcg(top_predicted_ratings)
        # IDCG: DCG of the ideal ordering, zero-padded to length k.
        top_k_ratings = np.sort(user_ratings['rating'])[::-1][:k]
        pad = [0] * (k - len(top_k_ratings))
        top_k_ratings = np.concatenate((top_k_ratings, pad))
        idcg = calc_dcg(top_k_ratings)
        if idcg == 0:
            # BUGFIX: previously dcg/idcg was computed unconditionally, so a
            # user whose true ratings are all zero produced NaN and poisoned
            # the mean; such a user now contributes 0 to the NDCG sum.
            print_flush(user_id)
            print_flush(user_ratings)
        else:
            ndcg += dcg / idcg
        # MRR / precision: position of the first truly-rated item in top-k.
        match_rank = np.nonzero(top_predicted_ratings)[0]
        mrr += 1. / (match_rank[0] + 1) if len(match_rank) > 0 else 0
        precision += 1. if len(match_rank) > 0 else 0
    num_users = len(per_user_top_rankings)
    ndcg = ndcg / num_users
    mrr = mrr / num_users
    precision = precision / num_users
    return (ndcg, mrr, precision)
def train(model, sess, starter_learning_rate, learning_rate_decay_every,
          learning_rate_decay_by, stop_after):
    """Train `model` in `sess` until the batch iterator is exhausted or
    `stop_after` global steps have been taken.

    Sets up an Adagrad optimizer with staircase exponential learning-rate
    decay, TensorBoard summary writers, and checkpointing, then runs the
    training loop with periodic validation and checkpoint saves.

    Args:
        model: a PredictionModel whose input placeholders and loss are used.
        sess: the tf.Session to run in (all variables initialized here).
        starter_learning_rate: initial learning rate.
        learning_rate_decay_every: decay period, in global steps.
        learning_rate_decay_by: multiplicative decay factor.
        stop_after: stop after this many steps (falsy means no limit).

    Returns:
        (last_train_loss, last_val_loss) from the final batches evaluated.
    """
    # Define Training procedure
    global_step = tf.Variable(0, name="global_step", trainable=False)
    #optimizer = tf.train.AdamOptimizer(1e-3)
    learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                               global_step,
                                               learning_rate_decay_every,
                                               learning_rate_decay_by,
                                               staircase=True)
    # optimizer = tf.train.AdamOptimizer(learning_rate)
    optimizer = tf.train.AdagradOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(model.loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)
    # Keep track of gradient values and sparsity (optional)
    grad_summaries = []
    #for g, v in grads_and_vars:
    # NOTE(review): gradient summaries are disabled by iterating an empty
    # list; restore the commented loop header above to re-enable them.
    for g, v in []:
        if g is not None:
            grad_hist_summary = tf.summary.histogram(
                "{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.summary.scalar(
                "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    #grad_summaries_merged = tf.summary.merge(grad_summaries)
    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print_flush("Writing to {}\n".format(out_dir))
    # Summaries for loss
    loss_summary = tf.summary.scalar("loss", model.loss)
    learning_rate_summary = tf.summary.scalar("learning_rate", learning_rate)
    # Train Summaries
    train_summary_op = tf.summary.merge([loss_summary, learning_rate_summary
                                         ])  #, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                 sess.graph)
    # Val summaries
    val_summary_op = tf.summary.merge([loss_summary, learning_rate_summary])
    val_summary_dir = os.path.join(out_dir, "summaries", "val")
    val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph)
    # Checkpoint directory. Tensorflow assumes this directory already exists
    # so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=FLAGS.num_checkpoints)
    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    def train_step(user_ids, per_user_count, per_user_item_ids,
                   per_user_ratings):
        """
        A single training step
        """
        feed_dict = {
            model.input_user_ids: user_ids,
            model.input_per_user_count: per_user_count,
            model.input_per_user_item_ids: per_user_item_ids,
            model.input_per_user_ratings: per_user_ratings,
        }
        # Apply the update, then re-evaluate loss/rate at the new step.
        sess.run(train_op, feed_dict)
        step, loss, rate = sess.run([global_step, model.loss, learning_rate],
                                    feed_dict)
        if step % FLAGS.summary_every == 0:
            summaries = sess.run(train_summary_op, feed_dict)
            train_summary_writer.add_summary(summaries, step)
        time_str = datetime.now().isoformat()
        if step % FLAGS.summary_every == 0:
            print_flush("{}: step {}, loss {:g}, rate {:g}".format(
                time_str, step, loss, rate))
        return loss

    def val_step(user_ids, per_user_count, per_user_item_ids,
                 per_user_ratings, writer=None):
        """
        Evaluates model on a val set
        """
        feed_dict = {
            model.input_user_ids: user_ids,
            model.input_per_user_count: per_user_count,
            model.input_per_user_item_ids: per_user_item_ids,
            model.input_per_user_ratings: per_user_ratings,
        }
        # No train_op here, so the model parameters are not updated.
        step, summaries, loss = sess.run(
            [global_step, val_summary_op, model.loss], feed_dict)
        time_str = datetime.now().isoformat()
        print_flush("{}: step {}, loss {:g}".format(time_str, step, loss))
        if writer:
            writer.add_summary(summaries, step)
        return loss

    # Generate batches
    batches = ratings.train_batch_iter(FLAGS.batch_size, FLAGS.num_epochs)
    last_val_loss = 0
    # Training loop. For each batch...
    for (user_ids, per_user_count, per_user_item_ids,
         per_user_ratings) in batches:
        last_train_loss = train_step(user_ids, per_user_count,
                                     per_user_item_ids, per_user_ratings)
        current_step = tf.train.global_step(sess, global_step)
        if stop_after and current_step > stop_after:
            print_flush('Stopping after {} training steps'.format(stop_after))
            break
        if current_step % FLAGS.evaluate_every == 0:
            print_flush("\nEvaluation:")
            # Evaluate on (a prefix of) the validation set.
            (val_user_ids, val_per_user_count, val_per_user_item_ids,
             val_per_user_ratings) = ratings.get_batch(
                 ratings.val[:FLAGS.batch_size])
            last_val_loss = val_step(val_user_ids, val_per_user_count,
                                     val_per_user_item_ids,
                                     val_per_user_ratings,
                                     writer=val_summary_writer)
            U, V = sess.run(model.get_embedding_mats())
            predictions = np.matmul(U, np.transpose(V))
            # NOTE(review): calc_scores is called with 3 args
            # (true_ratings, predictions, k) and unpacked into (ndcg, mrr);
            # this expects a different variant than the 4-arg calc_scores
            # defined elsewhere in this repo — verify the import.
            ndcg, mrr = calc_scores.calc_scores(ratings.val, predictions, 10)
            print_flush(
                ' NDCG@10 and MRR@10 for val set: {:.4f}, {:.4f}'.format(
                    ndcg, mrr))
            print_flush("")
        if current_step % FLAGS.checkpoint_every == 0:
            path = saver.save(sess,
                              checkpoint_prefix,
                              global_step=current_step)
            print_flush("Saved model checkpoint to {}\n".format(path))
        pass
    return (last_train_loss, last_val_loss)
# Misc Parameters self.allow_soft_placement = True self.log_device_placement = False FLAGS = Flags() np.random.seed(1234) ########### # Dataset # ########### ratings = RatingsData.from_files('ml-100k-take1.train.txt', 'ml-100k-take1.val.txt') print_flush('Num users: {}'.format(ratings.num_users)) print_flush('Num items: {}'.format(ratings.num_items)) print_flush("Train/Val/Test split: {}/{}/{}".format(len(ratings.train), len(ratings.val), len(ratings.test))) users_train = set(ratings.train['user_id']) users_val = set(ratings.val['user_id']) print_flush('# users in train set: {}'.format(len(users_train))) print_flush('# users in val set: {}'.format(len(users_val))) print_flush('# users in val set not in train set: {}'.format( len(users_val - users_train))) ############# # The model #
def load_ml_100k(path, verbose=True):
    """Load MovieLens-100k ratings from a whitespace-separated file.

    Each line is "user_id item_id rating timestamp".  User ids are shifted
    to start at 0; ratings are scaled to (0, 1] by dividing by 5.

    Args:
        path: path to the ratings file.
        verbose: if True, log progress and summary statistics.

    Returns:
        Structured numpy array with fields user_id, item_id, rating,
        timestamp (empty for an empty file).
    """
    if verbose:
        print_flush('Loading MovieLens 100k ratings...')
    with open(path) as f:
        if verbose:
            print_flush('Scanning file...')
        # First pass: count lines so the array can be pre-allocated.
        # BUGFIX: num_lines was undefined (NameError) for an empty file.
        num_lines = 0
        for num_lines, _ in enumerate(f, 1):
            pass
        if verbose:
            print_flush('Will load {} ratings'.format(num_lines))
        f.seek(0)
        all_data = np.zeros(num_lines,
                            dtype=[('user_id', np.int32),
                                   ('item_id', np.int32),
                                   ('rating', np.float32),
                                   ('timestamp', np.int64)])
        for i, line in enumerate(f):
            user_id, item_id, rating, timestamp = map(int, line.split())
            user_id -= 1
            # IDs start at 0, and we don't want users that don't have any ratings
            # rating = (rating - 1) / 4.0
            rating = rating / 5.0
            all_data[i] = (user_id, item_id, rating, timestamp)
    if verbose:
        print_flush('Loaded {} ratings'.format(len(all_data)))
        print_flush('Num users: {}'.format(np.max(all_data['user_id'] + 1)))
        print_flush('Num items: {}'.format(np.max(all_data['item_id'] + 1)))
        ratings = all_data['rating']
        print_flush('Min/mean/max rating: {}/{:.3}/{}'.format(
            np.min(ratings), np.mean(ratings), np.max(ratings)))
    return all_data
def load_yelp(path, max_lines=None, verbose=True):
    """Load Yelp review ratings from a JSON-lines file.

    Args:
        path: file with one JSON review object per line.
        max_lines: optional cap on the number of reviews loaded.
        verbose: if True, log progress and summary statistics.

    Returns:
        Structured numpy array with fields user_id, item_id, rating,
        timestamp.  String user/business ids are mapped to dense integer
        ids via IdAssigner; stars are scaled to (0, 1] by dividing by 5.
    """
    if verbose:
        print_flush('Loading Yelp ratings...')
    user2id = IdAssigner()
    item2id = IdAssigner()
    with open(path) as f:
        if verbose:
            print_flush('Scanning file...')
        # First pass: count lines (capped at max_lines) to pre-allocate.
        # BUGFIX: num_lines was undefined (NameError) for an empty file.
        num_lines = 0
        for num_lines, _ in enumerate(f, 1):
            if num_lines == max_lines:
                break
        if verbose:
            print_flush('Will load {} ratings'.format(num_lines))
        all_data = np.zeros(num_lines,
                            dtype=[('user_id', np.int32),
                                   ('item_id', np.int32),
                                   ('rating', np.float32),
                                   ('timestamp', np.int64)])
        # Tracks (user, item) pairs to warn about duplicate ratings.
        unique = set()
        f.seek(0)
        for i, line in enumerate(f):
            if i == num_lines:
                break
            if verbose and ((i + 1) % 100000) == 0:
                print_flush(' Loaded {} ratings...'.format(i + 1))
            data = json.loads(line)
            user_id = user2id.get_id(data['user_id'])
            item_id = item2id.get_id(data['business_id'])
            # rating = (data['stars'] - 1) / 4.0
            rating = data['stars'] / 5.0
            # too slow:
            # timestamp = datetime.strptime(data['date'], '%Y-%m-%d').toordinal()
            # Hand-rolled date parse: much faster than strptime here.
            year, month, day = map(int, data['date'].split('-'))
            timestamp = datetime(year=year, month=month, day=day).toordinal()
            all_data[i] = (user_id, item_id, rating, timestamp)
            t = (user_id, item_id)
            if t in unique:
                print_flush('Multiple ratings for user {} on item {}'.format(
                    data['user_id'], data['business_id']))
            unique.add(t)
    if verbose:
        print_flush('Loaded {} ratings'.format(len(all_data)))
        print_flush('Num users: {}'.format(user2id.get_next_id()))
        print_flush('Num items: {}'.format(item2id.get_next_id()))
        ratings = all_data['rating']
        print_flush('Min/mean/max rating: {}/{:.3}/{}'.format(
            np.min(ratings), np.mean(ratings), np.max(ratings)))
    return all_data
data['user_id'], data['business_id'])) unique.add(t) if verbose: print_flush('Loaded {} ratings'.format(len(all_data))) print_flush('Num users: {}'.format(user2id.get_next_id())) print_flush('Num items: {}'.format(item2id.get_next_id())) ratings = all_data['rating'] print_flush('Min/mean/max rating: {}/{:.3}/{}'.format( np.min(ratings), np.mean(ratings), np.max(ratings))) return all_data max_lines = None # max_lines = 500000 yelp_data = load_yelp('/home/tvromen/research/datasets/yelp/review.json', max_lines) yelp_data = RatingsData.remove_top_percentile(yelp_data) # Need to remap user IDs to be sequential print_flush('Reassigning user IDs') user2id = IdAssigner() for i in range(len(yelp_data)): yelp_data[i]['user_id'] = user2id.get_id(yelp_data[i]['user_id']) np.random.seed(1234) ratings = RatingsData.RatingsData.from_data(yelp_data) ratings.output_as_text(ratings.train, 'yelp-take1.train.txt') ratings.output_as_text(ratings.val, 'yelp-take1.val.txt') ratings.output_as_text(ratings.test, 'yelp-take1.test.txt')
def load_amazon_cd(path, max_lines=None, verbose=True):
    """Load Amazon CD ratings from a CSV file.

    Each line is "user,item,rating,timestamp".  String user/item ids are
    mapped to dense integer ids via IdAssigner; ratings are scaled to
    (0, 1] by dividing by 5.

    Args:
        path: path to the CSV ratings file.
        max_lines: optional cap on the number of ratings loaded.
        verbose: if True, log progress and summary statistics.

    Returns:
        Structured numpy array with fields user_id, item_id, rating,
        timestamp.
    """
    if verbose:
        print_flush('Loading Amazon CD ratings...')
    user2id = IdAssigner()
    item2id = IdAssigner()
    with open(path) as f:
        if verbose:
            print_flush('Scanning file...')
        # First pass: count lines (capped at max_lines) to pre-allocate.
        # BUGFIX: num_lines was undefined (NameError) for an empty file.
        num_lines = 0
        for num_lines, _ in enumerate(f, 1):
            if num_lines == max_lines:
                break
        if verbose:
            print_flush('Will load {} ratings'.format(num_lines))
        all_data = np.zeros(num_lines,
                            dtype=[('user_id', np.int32),
                                   ('item_id', np.int32),
                                   ('rating', np.float32),
                                   ('timestamp', np.int64)])
        # Tracks (user, item) pairs to warn about duplicate ratings.
        unique = set()
        f.seek(0)
        for i, line in enumerate(f):
            if i == num_lines:
                break
            if verbose and ((i + 1) % 100000) == 0:
                print_flush(' Loaded {} ratings...'.format(i + 1))
            data = line.split(',')
            user_id = user2id.get_id(data[0])
            item_id = item2id.get_id(data[1])
            rating = float(data[2]) / 5.0
            timestamp = int(data[3])
            all_data[i] = (user_id, item_id, rating, timestamp)
            t = (user_id, item_id)
            if t in unique:
                # BUGFIX: this warning used data['user_id'] and
                # data['business_id'] (copy-paste from the Yelp loader),
                # which raises TypeError because `data` is a list here;
                # index the CSV fields, and use print_flush for consistency.
                print_flush('Multiple ratings for user {} on item {}'.format(
                    data[0], data[1]))
            unique.add(t)
    if verbose:
        print_flush('Loaded {} ratings'.format(len(all_data)))
        print_flush('Num users: {}'.format(user2id.get_next_id()))
        print_flush('Num items: {}'.format(item2id.get_next_id()))
        ratings = all_data['rating']
        print_flush('Min/mean/max rating: {}/{:.3}/{}'.format(
            np.min(ratings), np.mean(ratings), np.max(ratings)))
    return all_data
def calc_wrapper(ratings, U, V, Vb, save_path=None):
    """Score a model given its embedding matrices.

    Samples up to num_batches * batch_size validation users, computes their
    predicted scores (masking items the user already rated in the training
    set), extracts top-k rankings, optionally saves them to `save_path`,
    and prints ranking metrics on both the training and validation sets.

    Args:
        ratings: dataset object with train/val structured arrays and
            num_items.
        U: user embedding matrix (num_users x d).
        V: item embedding matrix (num_items x d).
        Vb: optional item bias column vector (num_items x 1) or None.
        save_path: optional path to dump the top rankings (best effort).
    """
    k = 10
    batch_size = 5000
    num_batches = 4
    per_user_top_rankings = dict()
    # Shuffled validation user ids; a user id may appear more than once
    # (one entry per validation rating).
    user_ids = np.random.permutation(ratings.val['user_id'])
    for i in range(num_batches):
        print_flush('Batch {}'.format(i + 1))
        batch_ids = user_ids[i * batch_size:(i + 1) * batch_size]
        if len(batch_ids) == 0:
            break
        if Vb is None:
            predictions = np.matmul(U[batch_ids], np.transpose(V))
        else:
            predictions = np.matmul(U[batch_ids],
                                    np.transpose(V)) + np.transpose(Vb)
        print_flush('Removing items from training set...')
        for rating in ratings.train:
            # BUGFIX: np.where(cond) returns a 1-tuple of index arrays, so
            # len(pos) was always 1 — the empty check never fired and the
            # old `assert len(pos) == 1` was vacuous.  Take the index array
            # itself and mask every matching row (duplicate ids possible).
            pos = np.where(batch_ids == rating['user_id'])[0]
            if len(pos) == 0:
                continue
            predictions[pos, rating['item_id']] = -10000000
        print_flush('Calculating top items...'.format(i))
        batch_top_rankings = calc_top(batch_ids, predictions, k, verbose=True)
        per_user_top_rankings.update(batch_top_rankings)
        # Free the large score matrix before the next batch allocation.
        del predictions
    if save_path is not None:
        # Best-effort save; scoring proceeds even if the dump fails.
        try:
            with open(save_path, 'w') as f:
                f.write(repr(per_user_top_rankings))
        except Exception as e:
            print_flush(e)
    print_flush('Calculating scores...')
    ndcg, mrr, precision = calc_scores_(ratings.train, ratings.num_items, k,
                                        per_user_top_rankings, verbose=True)
    print_flush(
        'Results on TRAINING set: NDCG@{}={}, MRR@{}={}, P@{}={}'.format(
            k, ndcg, k, mrr, k, precision))
    ndcg, mrr, precision = calc_scores_(ratings.val, ratings.num_items, k,
                                        per_user_top_rankings, verbose=True)
    print_flush(
        'Results on VALIDATION set: NDCG@{}={}, MRR@{}={}, P@{}={}'.format(
            k, ndcg, k, mrr, k, precision))