def train():
    """Run the end-to-end training pipeline and print a validation NDCG.

    Steps, in order:
      1. Preprocess the raw data and wrap it in a ``Dataset``.
      2. Train the ``MF`` model, or load it if it already exists
         (delegated to ``MF.train_or_load_if_exists``).
      3. Build the ``Item2Item`` model from the dataset.
      4. Generate train/validation candidates with ``CandidateGenerator``.
      5. Best-effort dump of the generated candidates to ``puke.pkl`` in the
         current working directory.
      6. Train and save the ``Ranker``, then score the validation set and
         print the result of ``dataset.validate_ndcg``.

    Returns:
        None. Progress and the final metric are written to stdout.
    """
    import pickle

    print('Preprocessing raw data')
    preprocessor = Preprocessor()
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)

    print('Training MF')
    mf = MF(preprocessor, dataset)
    mf.train_or_load_if_exists()

    print('Building I2I')
    i2i = Item2Item(dataset)

    print('Generating candidates')
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    X_train, y_train, q_train, q_train_reader = (
        candidate_generator.generate_train()
    )
    X_val, y_val, q_val, q_val_reader = candidate_generator.generate_val()

    # Caching the candidates is a convenience only: a failure here must not
    # abort training. Catch ``Exception`` (not a bare ``except``) so that
    # Ctrl-C / SystemExit still propagate, and report why the dump failed.
    candidates = (
        X_train, y_train, q_train, q_train_reader,
        X_val, y_val, q_val, q_val_reader,
    )
    try:
        with open('puke.pkl', 'wb') as f:
            pickle.dump(candidates, f)
    except Exception as exc:
        print("Couldn't save puke: {!r}".format(exc))

    print('Training ranker')
    ranker = Ranker()
    ranker.train(X_train, y_train, q_train, X_val, y_val, q_val)
    ranker.save()

    print('Validating ranker')
    rank_scores = ranker.rank(X_val)
    print('ndcg', dataset.validate_ndcg(y_val, q_val, q_val_reader, rank_scores))
def main(args):
    """Seed the RNGs, load the ranking data, train a ``Ranker`` and save it.

    The number of features is taken to be the number of lines in
    ``data/model.features``. Training and dev data are each loaded twice:
    once with ``load_data`` (the original comment describes these as loaded
    "as pairs") and once with ``load_eval_data``. The evaluation set is
    loaded with ``load_eval_data`` only. The trained model is written to
    ``output/ranker.model``.

    Args:
        args: Accepted for the caller's convenience; not read by this
            function.

    Returns:
        None.
    """
    # Fixed seed across torch (CPU and, when enabled, CUDA) and ``random``.
    seed = 333
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)
    random.seed(seed)

    train_data_path = "data/training.dat"
    # NOTE(review): a "data/train-eval.dat" path used to be defined here but
    # was never read; the train-eval instances below come from the training
    # file, mirroring how the dev-eval instances come from the dev file.
    dev_data_path = "data/full/dev.dat"
    eval_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"

    # Count lines without leaking the file handle or building a list.
    with open(feats_path) as feats_file:
        num_feats = sum(1 for _ in feats_file)

    batch_size = 80
    ranker = Ranker(num_feats, 256)

    ## Instances for training - loaded as pairs
    feat_indices = set(range(num_feats))
    train_instances = load_data(train_data_path, num_feats, feat_indices)
    train_eval_instances = load_eval_data(
        train_data_path, num_feats, feat_indices)
    dev_instances = load_data(dev_data_path, num_feats, feat_indices)
    dev_eval_instances = load_eval_data(
        dev_data_path, num_feats, feat_indices)
    tst_instances = load_eval_data(eval_data_path, num_feats, feat_indices)

    logger.info("Loaded {} training instances with {} features".format(
        len(train_instances), num_feats))

    trainer = RankerTrainer(ranker, batch_size, 'output/')
    trainer.train(train_instances, dev_instances, train_eval_instances,
                  dev_eval_instances, tst_instances)
    ranker.save('output/ranker.model')