def main():
    """Run a trained AutoEncoder over the eval set and write predictions.

    Configuration comes from the module-level ``args`` (data paths, network
    shape, ``save_path``, ``predictions_path``) and ``use_gpu``.

    Steps:
      1. Load the training data to build the user/item id maps.
      2. Load the eval data, reusing those id maps.
      3. Build the AutoEncoder and restore weights from ``args.save_path``
         when that file exists.
      4. For every eval example, write one tab-separated line per non-zero
         target entry: ``major_key, item_key, prediction, target``.
    """
    params = {
        'batch_size': 1,
        'data_dir': args.path_to_train_data,
        'major': 'users',
        'itemIdInd': 1,
        'userIdInd': 0,
        'extension': '.csv',
        'delimiter': ',',
        'header': 1,
    }

    print("Loading training data")
    data_layer = input_layer.UserItemRecDataProvider(params=params)
    print("Data loaded")
    # NOTE(review): with major == 'users' the keys of ``data`` are presumably
    # users rather than items, so this label may be misleading -- confirm.
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading eval data")
    eval_params = copy.deepcopy(params)
    # must set eval batch size to 1 to make sure no examples are missed
    eval_params['batch_size'] = 1
    eval_params['data_dir'] = args.path_to_eval_data
    eval_data_layer = input_layer.UserItemRecDataProvider(
        params=eval_params,
        user_id_map=data_layer.userIdMap,
        item_id_map=data_layer.itemIdMap)

    hidden_sizes = [int(size) for size in args.hidden_layers.split(',')]
    rencoder = model.AutoEncoder(
        layer_sizes=[data_layer.vector_dim] + hidden_sizes,
        nl_type=args.non_linearity_type,
        is_constrained=args.constrained,
        dp_drop_prob=args.drop_prob,
        last_layer_activations=not args.skip_last_layer_nl)

    path_to_model = Path(args.save_path)
    if path_to_model.is_file():
        print("Loading model from: {}".format(path_to_model))
        # Always deserialize onto the CPU: a checkpoint saved from GPU tensors
        # cannot otherwise be loaded on a host without CUDA. The model is
        # moved to the GPU below when use_gpu is set.
        state = torch.load(args.save_path,
                           map_location=lambda storage, loc: storage)
        rencoder.load_state_dict(state)
    else:
        # No checkpoint was restored, so the network still holds the weights
        # it was constructed with; make that visible instead of silent.
        print("WARNING: no model file found at {}; predictions will come "
              "from freshly initialized weights".format(path_to_model))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')

    rencoder.eval()
    if use_gpu:
        rencoder = rencoder.cuda()

    # Reverse lookups: internal index -> original id.
    inv_userIdMap = {v: k for k, v in data_layer.userIdMap.items()}
    inv_itemIdMap = {v: k for k, v in data_layer.itemIdMap.items()}

    # The eval provider reads its source vectors from the training data.
    eval_data_layer.src_data = data_layer.data
    with open(args.predictions_path, 'w') as outf:
        eval_iter = eval_data_layer.iterate_one_epoch_eval(for_inf=True)
        for i, ((out, src), majorInd) in enumerate(eval_iter):
            dense_src = src.cuda().to_dense() if use_gpu else src.to_dense()
            inputs = Variable(dense_src)
            # Row 0 is the only row used from each batch.
            targets_np = out.to_dense().numpy()[0, :]
            outputs = rencoder(inputs).cpu().data.numpy()[0, :]
            non_zeros = targets_np.nonzero()[0].tolist()
            major_key = inv_userIdMap[majorInd]
            # Only positions with a non-zero target are written out.
            for ind in non_zeros:
                outf.write("{}\t{}\t{}\t{}\n".format(
                    major_key, inv_itemIdMap[ind],
                    outputs[ind], targets_np[ind]))
            if i % 10000 == 0:
                print("Done: {}".format(i))
def load_train_data(data_dir):
    """Load the training set from ``data_dir`` and build reverse id lookups.

    Progress and basic statistics are reported through ``cherrypy.log``.

    Args:
        data_dir: value passed to the provider as ``params['data_dir']``.

    Returns:
        A tuple ``(data_layer, inv_userIdMap, inv_itemIdMap)`` where the two
        dicts map each value of ``data_layer.userIdMap`` /
        ``data_layer.itemIdMap`` back to its key.
    """
    def _flip(mapping):
        # Swap keys and values of a mapping.
        return dict((index, raw_id) for raw_id, index in mapping.items())

    params = {
        'batch_size': 1,
        'data_dir': data_dir,
        'major': 'users',
        'itemIdInd': 1,
        'userIdInd': 0,
    }

    cherrypy.log("CHERRYPYLOG Loading training data")
    data_layer = input_layer.UserItemRecDataProvider(params=params)
    cherrypy.log("Data loaded")

    entry_count = len(data_layer.data.keys())
    cherrypy.log("Total {} found: {}".format(params['major'], entry_count))
    cherrypy.log("Vector dim: {}".format(data_layer.vector_dim))
    user_map_size = len(data_layer.userIdMap)
    cherrypy.log("data_layer.userIdMap: {}".format(user_map_size))
    item_map_size = len(data_layer.itemIdMap)
    cherrypy.log("data_layer.itemIdMap: {}".format(item_map_size))

    inv_userIdMap = _flip(data_layer.userIdMap)
    inv_itemIdMap = _flip(data_layer.itemIdMap)
    return data_layer, inv_userIdMap, inv_itemIdMap
def _loss_to_float(loss):
    """Return the Python number held by a one-element loss value."""
    # ``.item()`` is the supported accessor on newer PyTorch releases, where
    # indexing a 0-dim tensor with ``[0]`` raises; older releases that lack
    # ``.item()`` fall back to the original indexing form.
    if hasattr(loss, 'item'):
        return loss.item()
    return loss.data[0]


def main():
    """Train the AutoEncoder, log summaries, and write checkpoints.

    Configuration comes from the module-level ``args``. The function:
      1. Loads training data, then eval data reusing the training id maps.
      2. Builds the AutoEncoder and restores ``<logdir>/model`` if present.
      3. Moves the model to the GPU (wrapped in ``nn.DataParallel`` when more
         than one GPU id is given) and creates the requested optimizer.
      4. Runs ``args.num_epochs`` epochs, optionally re-feeding the network's
         own outputs as inputs ``args.aug_step`` times per mini-batch.
      5. Every third epoch and on the last epoch, evaluates and saves
         ``<logdir>/model.epoch_<n>``; finally saves ``<logdir>/model.last``.

    Raises:
        ValueError: if ``args.optimizer`` is not one of ``adam``,
            ``adagrad``, ``momentum`` or ``rmsprop``.
    """
    logger = Logger(args.logdir)
    params = {
        'batch_size': args.batch_size,
        'data_dir': args.path_to_train_data,
        'major': 'users',
        'itemIdInd': 1,
        'userIdInd': 0,
    }
    print("Loading training data")
    data_layer = input_layer.UserItemRecDataProvider(params=params)
    print("Data loaded")
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading eval data")
    eval_params = copy.deepcopy(params)
    # must set eval batch size to 1 to make sure no examples are missed
    # (the original only carried the comment; the assignment was missing)
    eval_params['batch_size'] = 1
    eval_params['data_dir'] = args.path_to_eval_data
    eval_data_layer = input_layer.UserItemRecDataProvider(
        params=eval_params,
        user_id_map=data_layer.userIdMap,  # the mappings are provided
        item_id_map=data_layer.itemIdMap)
    eval_data_layer.src_data = data_layer.data

    hidden_sizes = [int(size) for size in args.hidden_layers.split(',')]
    rencoder = model.AutoEncoder(
        layer_sizes=[data_layer.vector_dim] + hidden_sizes,
        nl_type=args.non_linearity_type,
        is_constrained=args.constrained,
        dp_drop_prob=args.drop_prob,
        last_layer_activations=not args.skip_last_layer_nl)

    # NOTE(review): this function only ever writes "<logdir>/model.epoch_<n>"
    # and "<logdir>/model.last"; a file named exactly "<logdir>/model" has to
    # be put in place by other means for this resume branch to trigger.
    model_checkpoint = args.logdir + "/model"
    path_to_model = Path(model_checkpoint)
    if path_to_model.is_file():
        print("Loading model from: {}".format(model_checkpoint))
        rencoder.load_state_dict(torch.load(model_checkpoint))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')

    gpu_ids = [int(g) for g in args.gpu_ids.split(',')]
    print('Using GPUs: {}'.format(gpu_ids))
    # Keep a handle on the bare model. nn.DataParallel does not expose the
    # wrapped module's own attributes (encode_w, is_constrained, ...) and its
    # state_dict keys gain a "module." prefix that the un-wrapped
    # load_state_dict above would not accept.
    base_model = rencoder
    if len(gpu_ids) > 1:
        rencoder = nn.DataParallel(rencoder, device_ids=gpu_ids)
    rencoder = rencoder.cuda()

    scheduler = None  # only the "momentum" optimizer uses an LR schedule
    if args.optimizer == "adam":
        optimizer = optim.Adam(rencoder.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
    elif args.optimizer == "adagrad":
        optimizer = optim.Adagrad(rencoder.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer == "momentum":
        optimizer = optim.SGD(rencoder.parameters(),
                              lr=args.lr, momentum=0.9,
                              weight_decay=args.weight_decay)
        scheduler = MultiStepLR(optimizer,
                                milestones=[24, 36, 48, 66, 72],
                                gamma=0.5)
    elif args.optimizer == "rmsprop":
        optimizer = optim.RMSprop(rencoder.parameters(),
                                  lr=args.lr, momentum=0.9,
                                  weight_decay=args.weight_decay)
    else:
        raise ValueError('Unknown optimizer kind')

    # Running loss between two summary prints.
    t_loss = 0.0
    t_loss_denom = 0.0
    global_step = 0

    # Dropout applied to re-fed outputs; only needed when noise is requested.
    dp = nn.Dropout(p=args.noise_prob) if args.noise_prob > 0.0 else None

    for epoch in range(args.num_epochs):
        print('Doing epoch {} of {}'.format(epoch, args.num_epochs))
        e_start_time = time.time()
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        if scheduler is not None:
            scheduler.step()
        for i, mb in enumerate(data_layer.iterate_one_epoch()):
            inputs = Variable(mb.cuda().to_dense())
            optimizer.zero_grad()
            outputs = rencoder(inputs)
            loss, num_ratings = model.MSEloss(outputs, inputs)
            loss = loss / num_ratings
            loss.backward()
            optimizer.step()
            global_step += 1
            loss_value = _loss_to_float(loss)
            t_loss += loss_value
            t_loss_denom += 1

            if i % args.summary_frequency == 0:
                running_rmse = sqrt(t_loss / t_loss_denom)
                print('[%d, %5d] RMSE: %.7f' % (epoch, i, running_rmse))
                logger.scalar_summary("Training_RMSE", running_rmse,
                                      global_step)
                t_loss = 0
                t_loss_denom = 0.0
                # Read parameters from the bare model, not the wrapper.
                log_var_and_grad_summaries(logger, base_model.encode_w,
                                           global_step, "Encode_W")
                log_var_and_grad_summaries(logger, base_model.encode_b,
                                           global_step, "Encode_b")
                if not base_model.is_constrained:
                    log_var_and_grad_summaries(logger, base_model.decode_w,
                                               global_step, "Decode_W")
                log_var_and_grad_summaries(logger, base_model.decode_b,
                                           global_step, "Decode_b")

            total_epoch_loss += loss_value
            denom += 1

            if args.aug_step > 0:
                # Magic data augmentation trick happen here: the detached
                # outputs are fed back in as the next inputs and targets.
                for _ in range(args.aug_step):
                    inputs = Variable(outputs.data)
                    if dp is not None:
                        inputs = dp(inputs)
                    optimizer.zero_grad()
                    outputs = rencoder(inputs)
                    loss, num_ratings = model.MSEloss(outputs, inputs)
                    loss = loss / num_ratings
                    loss.backward()
                    optimizer.step()

        e_end_time = time.time()
        epoch_seconds = e_end_time - e_start_time
        epoch_rmse = sqrt(total_epoch_loss / denom)
        print('Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}'
              .format(epoch, epoch_seconds, epoch_rmse))
        logger.scalar_summary("Training_RMSE_per_epoch", epoch_rmse, epoch)
        logger.scalar_summary("Epoch_time", epoch_seconds, epoch)

        if epoch % 3 == 0 or epoch == args.num_epochs - 1:
            eval_loss = do_eval(rencoder, eval_data_layer)
            print('Epoch {} EVALUATION LOSS: {}'.format(epoch, eval_loss))
            logger.scalar_summary("EVALUATION_RMSE", eval_loss, epoch)
            epoch_checkpoint = model_checkpoint + ".epoch_" + str(epoch)
            print("Saving model to {}".format(epoch_checkpoint))
            # Save the bare model so the keys carry no "module." prefix.
            torch.save(base_model.state_dict(), epoch_checkpoint)

    print("Saving model to {}".format(model_checkpoint + ".last"))
    torch.save(base_model.state_dict(), model_checkpoint + ".last")