def train_loop(net, dname, data_dir, epochs=90, workers=4, resume='', savedir='./',
               save_all_epochs=False, q_nograd_its=0, batch_size=256):
    mkdir(savedir)
    global best_err1

    # Load data here:
    _, train_loader, val_loader, _, _, Ntrain = \
        get_image_loader(dname, batch_size, cuda=True, workers=workers, distributed=False, data_dir=data_dir)

    net.N_train = Ntrain

    start_epoch = 0

    marginal_loglike = np.zeros(epochs)
    train_loss = np.zeros(epochs)
    dev_loss = np.zeros(epochs)

    err_train = np.zeros(epochs)
    err_dev = np.zeros(epochs)

    # optionally resume from a checkpoint
    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            start_epoch, best_err1 = net.load(resume)
            print("=> loaded checkpoint '{}' (epoch {})".format(resume, start_epoch))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

        candidate_progress_file = resume.split('/')
        candidate_progress_file = '/'.join(candidate_progress_file[:-1]) + '/stats_array.pkl'

        if os.path.isfile(candidate_progress_file):
            print("=> found progress file at '{}'".format(candidate_progress_file))
            try:
                marginal_loglike, err_train, train_loss, err_dev, dev_loss = \
                    load_object(candidate_progress_file)
                print("=> Loaded progress file at '{}'".format(candidate_progress_file))
            except Exception:
                print("=> Unable to load progress file at '{}'".format(candidate_progress_file))
        else:
            print("=> NOT found progress file at '{}'".format(candidate_progress_file))

    if q_nograd_its > 0:
        net.prob_model.q_logits.requires_grad = False

    for epoch in range(start_epoch, epochs):
        if q_nograd_its > 0 and epoch == q_nograd_its:
            net.prob_model.q_logits.requires_grad = True

        # ---- train
        tic = time.time()
        nb_samples = 0
        for x, y in train_loader:
            marg_loglike_estimate, minus_loglike, err = net.fit(x, y)

            marginal_loglike[epoch] += marg_loglike_estimate * x.shape[0]
            err_train[epoch] += err * x.shape[0]
            train_loss[epoch] += minus_loglike * x.shape[0]

            nb_samples += len(x)

        marginal_loglike[epoch] /= nb_samples
        train_loss[epoch] /= nb_samples
        err_train[epoch] /= nb_samples

        toc = time.time()

        # ---- print
        print('\n depth approx posterior', net.prob_model.current_posterior.data.cpu().numpy())
        print("it %d/%d, ELBO/evidence %.4f, pred minus loglike = %f, err = %f" %
              (epoch, epochs, marginal_loglike[epoch], train_loss[epoch], err_train[epoch]), end="")
        cprint('r', ' time: %f seconds\n' % (toc - tic))

        net.update_lr()

        # ---- dev
        tic = time.time()
        nb_samples = 0
        for x, y in val_loader:
            minus_loglike, err = net.eval(x, y)

            dev_loss[epoch] += minus_loglike * x.shape[0]
            err_dev[epoch] += err * x.shape[0]

            nb_samples += len(x)

        dev_loss[epoch] /= nb_samples
        err_dev[epoch] /= nb_samples

        toc = time.time()

        cprint('g', ' pred minus loglike = %f, err = %f\n' % (dev_loss[epoch], err_dev[epoch]), end="")
        cprint('g', ' time: %f seconds\n' % (toc - tic))

        # ---- save checkpoint and running stats
        filename = 'checkpoint.pth.tar'
        if save_all_epochs:
            filename = str(epoch) + '_' + filename
        net.save(os.path.join(savedir, filename), best_err1)
        if err_dev[epoch] < best_err1:
            best_err1 = err_dev[epoch]
            cprint('b', 'best top1 dev err: %f' % err_dev[epoch])
            shutil.copyfile(os.path.join(savedir, filename), os.path.join(savedir, 'model_best.pth.tar'))

        all_results = [marginal_loglike, err_train, train_loss, err_dev, dev_loss]
        save_object(all_results, os.path.join(savedir, 'stats_array.pkl'))
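# Hypothetical usage sketch (not part of the original module): how the variational
# train_loop above might be driven. The `make_dun_net` builder is an assumption
# standing in for however the repo constructs its network; any object exposing
# fit/eval/save/load/update_lr and a prob_model with q_logits would work.
def _example_variational_training_run(make_dun_net):
    global best_err1
    best_err1 = np.inf  # train_loop reads and updates this module-level global
    net = make_dun_net()
    train_loop(net, dname='CIFAR10', data_dir='./data', epochs=90, workers=4,
               savedir='./saves/dun_cifar10', save_all_epochs=False,
               q_nograd_its=5, batch_size=256)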
def __init__(self, w2v_path, save_dir='models/recommender',
             courses_path='data/processed/grouped_courses.json'):
    """Load, preprocess and precompute word and course vectors.

    Loads precomputed vectors from save_dir when they are available.

    Args:
        w2v_path (str): Google News Word2Vec word embeddings path.
        save_dir (str): Directory path to persist precomputed vectors.
        courses_path (str): Courses json path, grouped by category
            {category: Course}.
    """
    self.logger = logging.getLogger(APP_NAME + ".Recommender")

    self.logger.info("Loading word2vec embeddings")
    self.word2id, self.id2word, self.word_embeddings = load_word2vec(w2v_path, 1000000)
    self.logger.info("Loaded %d word vectors with dim=%d" % self.word_embeddings.shape)
    self.stopwords = stopwords.words("english")

    course2id_path = os.path.join(save_dir, 'course2id.pkl')
    id2course_path = os.path.join(save_dir, 'id2course.pkl')
    category2id_path = os.path.join(save_dir, 'category2id.pkl')
    id2category_path = os.path.join(save_dir, 'id2category.pkl')
    category2courses_path = os.path.join(save_dir, 'category2courses.pkl')
    course_embeddings_path = os.path.join(save_dir, 'course_embeddings.pkl')
    category_embeddings_path = os.path.join(save_dir, 'category_embeddings.pkl')

    if os.path.exists(save_dir):
        self.logger.info("Loading course embeddings")
        self.course2id = load_object(course2id_path)
        self.id2course = load_object(id2course_path)
        self.category2id = load_object(category2id_path)
        self.id2category = load_object(id2category_path)
        self.category2courses = load_object(category2courses_path)
        self.course_embeddings = load_object(course_embeddings_path)
        self.category_embeddings = load_object(category_embeddings_path)
    else:
        os.makedirs(save_dir)
        self.logger.info("Course embeddings not found, building")
        self.course2id, self.id2course, self.category2id, self.id2category, self.category2courses, \
            self.course_embeddings, self.category_embeddings = self._prepare_courses(courses_path)
        save_object(self.course2id, course2id_path)
        save_object(self.id2course, id2course_path)
        save_object(self.category2id, category2id_path)
        save_object(self.id2category, id2category_path)
        save_object(self.category2courses, category2courses_path)
        save_object(self.course_embeddings, course_embeddings_path)
        save_object(self.category_embeddings, category_embeddings_path)
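# Hypothetical usage sketch (not part of the original class): constructing the
# recommender once and reusing the cached vectors on later runs. The enclosing
# class name `Recommender` and the local word2vec path are assumptions.
def _example_build_recommender():
    recommender = Recommender(
        w2v_path='data/external/GoogleNews-vectors-negative300.bin',
        save_dir='models/recommender',
        courses_path='data/processed/grouped_courses.json')
    # On the first call the course/category embeddings are computed and pickled
    # under save_dir; subsequent constructions load the pickles instead of rebuilding.
    return recommender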
def train_machines_multiple_dfs_new(self, _labels, _experiment_output_name='demo', _max_iter=20,
                                    _prediction_path=None, _print=False, _test_data=None,
                                    _test_labels=None, _uniformly=False):
    """
    Train the PFSMs given a set of dataframes and their labels.

    :param _labels: column types labeled by hand, where _labels[i][j] denotes the type of the j^th column in the i^th dataframe.
    :param _experiment_output_name:
    :param _max_iter: the maximum number of iterations the optimization algorithm runs, unless it converges earlier.
    :param _prediction_path:
    :param _print:
    :param _test_data:
    :param _test_labels:
    :param _uniformly: a binary variable used to initialize the PFSMs; True initializes them uniformly rather than with hand-crafted values.
    :return:
    """
    self.print = _print
    self.prediction_path = _prediction_path
    self.experiment_output_name = _experiment_output_name

    if _uniformly:
        self.initialize_params_uniformly()

    # Setup folders and probabilities for all columns
    self.normalize_params()

    # Changing column names
    self.data_frames = [data_frame.rename(columns=lambda n: str(n).replace(' ', ''))
                        for data_frame in self.data_frames]
    self.model.data_frames = self.data_frames

    # find the unique values in all of the columns once
    for i, df in enumerate(self.model.data_frames):
        if i == 0:
            unique_vals = np.unique(df.values)
        else:
            unique_vals = np.concatenate((unique_vals, np.unique(df.values)))
    self.model.unique_vals = unique_vals
    self.PFSMRunner.set_unique_values(unique_vals)

    # Finding unique values and their counts
    self.model.dfs_unique_vals_counts = {}
    for i, df in enumerate(self.data_frames):
        df_unique_vals_counts = {}
        for column_name in list(df.columns):
            temp_x, counts = np.unique([str(int_element) for int_element in df[column_name].tolist()],
                                       return_counts=True)
            counts = {u_data: c for u_data, c in zip(temp_x, counts)}
            temp_counts = list(counts.values())
            counts_array = np.reshape(temp_counts, newshape=(len(temp_counts),))
            df_unique_vals_counts[column_name] = [temp_x, counts_array]
        self.model.dfs_unique_vals_counts[str(i)] = df_unique_vals_counts

    # Setting
    self.model.labels = _labels
    self.model.types = self.types
    self.model.J = len(self.PFSMRunner.machines)  # J: num of data types including missing and anomaly.
    self.model.K = self.model.J - 2  # K: num of possible column data types (excluding missing and anomaly)
    self.model.pi = [self.model.PI for j in range(self.model.K)]  # mixture weights of row types
    self.model.current_runner = self.PFSMRunner

    training_error = []
    training_error.append(self.calculate_error_df(self.data_frames, _labels))
    save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner_initial.pkl')
    print(training_error)

    # Iterates over whole data points
    for it in range(_max_iter):
        print_to_file('iteration = ' + str(it), filename=self.experiment_output_name + '_output.txt')

        # Trains machines using all of the training data frames
        self.PFSMRunner = self.train_all_models_multiple_dfs(self.PFSMRunner)
        self.model.current_runner = self.PFSMRunner

        # Calculate training and validation error at each iteration
        training_error.append(self.calculate_error_df(self.data_frames, _labels))
        print(training_error)

        if it > 0:
            if (training_error[-2] - training_error[-1] < 1e-2):
                print_to_file('converged!', filename=self.experiment_output_name + '_output.txt')
                save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner' + str(it) + '.pkl')
                break

        save_object(self.PFSMRunner, self.experiment_output_name + '_training_runner' + str(it) + '.pkl')

    save_object(training_error, self.experiment_output_name + '_training_error.pkl')
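# Hypothetical usage sketch (not part of the original trainer): fitting the PFSMs on
# a set of hand-labelled dataframes. The `trainer` object is an assumption; it stands
# for an already constructed instance whose data_frames attribute holds the training
# dataframes, and labels[i][j] must give the type of the j-th column of the i-th dataframe.
def _example_pfsm_training(trainer, labels):
    trainer.train_machines_multiple_dfs_new(
        labels,
        _experiment_output_name='demo',
        _max_iter=20,
        _uniformly=False)  # keep the hand-crafted PFSM initialisation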
def save_posteriors(self, filename='all_posteriors.pkl'):
    save_object(self.all_posteriors, filename)
def train_loop(model, dname, data_dir, epochs=90, workers=4, gpu=None, resume='', weight_decay=1e-4,
               savedir='./', milestones=None, MC_samples=1, batch_size=256):
    mkdir(savedir)
    global best_acc1

    if gpu is not None:
        print("Use GPU: {} for training".format(gpu))

    if gpu is not None:  # Check for single GPU
        torch.cuda.set_device(gpu)
        model = model.cuda(gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss(reduction='mean').cuda(gpu)

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    if milestones is None:
        # if milestones are not specified, set to impossible value so LR is never decayed.
        milestones = [epochs + 1]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

    tr_acc1_vec = []
    tr_acc5_vec = []
    tr_loss_vec = []
    acc1_vec = []
    acc5_vec = []
    loss_vec = []

    start_epoch = 0
    # optionally resume from a checkpoint
    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            if gpu is None:
                checkpoint = torch.load(resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(gpu)
                checkpoint = torch.load(resume, map_location=loc)
            start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (epoch {})".format(resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

        candidate_progress_file = resume.split('/')
        candidate_progress_file = '/'.join(candidate_progress_file[:-1]) + '/stats_array.pkl'

        if os.path.isfile(candidate_progress_file):
            print("=> found progress file at '{}'".format(candidate_progress_file))
            try:
                tr_acc1_vec, tr_acc5_vec, tr_loss_vec, acc1_vec, acc5_vec, loss_vec = \
                    load_object(candidate_progress_file)
                print("=> Loaded progress file at '{}'".format(candidate_progress_file))
            except Exception:
                print("=> Unable to load progress file at '{}'".format(candidate_progress_file))
        else:
            print("=> NOT found progress file at '{}'".format(candidate_progress_file))

    cudnn.benchmark = True

    _, train_loader, val_loader, _, _, _ = \
        get_image_loader(dname, batch_size, cuda=True, workers=workers, distributed=False, data_dir=data_dir)

    for epoch in range(start_epoch, epochs):

        # train for one epoch and update lr scheduler setting
        tr_acc1, tr_acc5, tr_loss = train(train_loader, model, criterion, optimizer, epoch, gpu)
        print('used lr: %f' % optimizer.param_groups[0]["lr"])
        scheduler.step()

        tr_acc1_vec.append(tr_acc1)
        tr_acc5_vec.append(tr_acc5)
        tr_loss_vec.append(tr_loss)

        # evaluate on validation set
        acc1, acc5, loss = validate(val_loader, model, criterion, gpu, MC_samples=MC_samples)
        acc1_vec.append(acc1)
        acc5_vec.append(acc5)
        loss_vec.append(loss)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, is_best, savedir=savedir)

        all_results = [tr_acc1_vec, tr_acc5_vec, tr_loss_vec, acc1_vec, acc5_vec, loss_vec]
        save_object(all_results, os.path.join(savedir, 'stats_array.pkl'))
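# Hypothetical usage sketch (not part of the original script): driving the SGD
# train_loop above. Note that `lr`, `momentum` and `best_acc1` are read from module
# scope inside train_loop, so they must be defined before calling it; the ResNet-50
# constructor here is only an example stand-in for the repo's own model builder.
def _example_sgd_training_run():
    import torchvision.models  # assumption: torchvision is installed
    global lr, momentum, best_acc1
    lr, momentum, best_acc1 = 0.1, 0.9, 0
    model = torchvision.models.resnet50()
    train_loop(model, dname='ImageNet', data_dir='./data/imagenet', epochs=90,
               workers=4, gpu=None, weight_decay=1e-4, savedir='./saves/resnet50',
               milestones=[30, 60], MC_samples=1, batch_size=256)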