def train(epoch, model, optimizer, data, config, args, exp_folder,
          checkpoint=None):
    """Train the model for one epoch.

    Args:
        epoch: Index of the current epoch (used for checkpoint file naming).
        model: Model exposing ``__call__`` and ``get_loss``.
        optimizer: Optimizer stepping the model parameters.
        data: Iterable of batches ``(qids, passages, queries, answers, _)``;
            also exposes ``n_samples``.
        config: Configuration dict; reads ``training.batch_size`` (default 32).
        args: Parsed command-line arguments (unused here; kept for interface).
        exp_folder: Experiment folder where checkpoints are written.
        checkpoint: Handle forwarded to ``checkpointing.checkpoint``.
            BUG FIX(review): the original body referenced an undefined global
            ``checkpoint`` (a guaranteed NameError); it is now an explicit
            keyword parameter defaulting to None — confirm what callers
            should pass.
    """
    import shutil  # local import: used only for the per-batch checkpoint copy

    cp_path = os.path.join(exp_folder, 'checkpoint.opt')
    batch_size = config.get('training', {}).get('batch_size', 32)
    for batch_id, (qids, passages, queries, answers, _) in enumerate(data):
        # Progress: approximate sample count processed so far vs. total.
        print("{}-{}".format(batch_id * batch_size, data.n_samples))
        # passages/queries look like (tokens, chars, lengths)-style tuples:
        # the first two entries and the third are forwarded separately —
        # TODO confirm against the model's forward() signature.
        start_log_probs, end_log_probs = model(
            passages[:2], passages[2], queries[:2], queries[2])
        loss = model.get_loss(
            start_log_probs, end_log_probs, answers[:, 0], answers[:, 1])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        checkpointing.checkpoint(model, epoch, optimizer, checkpoint,
                                 exp_folder)
        new_cp_path = os.path.join(
            exp_folder,
            'checkpoint_ep_{}_batch_{}.opt'.format(epoch, batch_id))
        # BUG FIX: the original ran ``os.system("cp ")`` with no operands,
        # which copied nothing. Copy the rolling checkpoint to the
        # per-epoch/per-batch file explicitly (no shell involved).
        shutil.copyfile(cp_path, new_cp_path)
    return
def main():
    """Main training program: parse args, build or resume state, run epochs."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument("exp_folder", help="Experiment folder")
    argparser.add_argument("data", help="Training data")
    argparser.add_argument("--force_restart", action="store_true",
                           default=False,
                           help="Force restart of experiment: "
                           "will ignore checkpoints")
    argparser.add_argument("--word_rep",
                           help="Text file containing pre-trained "
                           "word representations.")
    # BUG FIX: ``type=bool`` makes every non-empty string truthy, so
    # ``--cuda False`` parsed as True. Parse the string explicitly instead;
    # the CLI shape (``--cuda <value>``) is unchanged.
    argparser.add_argument("--cuda",
                           type=lambda s: s.lower() in ('1', 'true', 'yes'),
                           default=torch.cuda.is_available(),
                           help="Use GPU if possible")
    argparser.add_argument("--use_covariance", action="store_true",
                           default=False,
                           help="Do not assume diagonal covariance matrix "
                           "when generating random word representations.")
    args = argparser.parse_args()

    config_filepath = os.path.join(args.exp_folder, 'config.yaml')
    with open(config_filepath) as f:
        # safe_load: plain yaml.load without a Loader is deprecated and can
        # construct arbitrary Python objects from the config file.
        config = yaml.safe_load(f)

    checkpoint, training_state, epoch = try_to_resume(
        args.force_restart, args.exp_folder)

    if checkpoint:
        print('Resuming training...')
        model, id_to_token, id_to_char, optimizer, data = reload_state(
            checkpoint, training_state, config, args)
    else:
        print('Preparing to train...')
        model, id_to_token, id_to_char, optimizer, data = init_state(
            config, args)
        # Explicit append mode: recent h5py versions no longer default to 'a'.
        checkpoint = h5py.File(
            os.path.join(args.exp_folder, 'checkpoint'), 'a')
        checkpointing.save_vocab(checkpoint, 'vocab', id_to_token)
        checkpointing.save_vocab(checkpoint, 'c_vocab', id_to_char)

    if torch.cuda.is_available() and args.cuda:
        data.tensor_type = torch.cuda.LongTensor

    train_for_epochs = config.get('training', {}).get('epochs')
    if train_for_epochs is not None:
        epochs = range(epoch, train_for_epochs)
    else:
        # No epoch limit configured: train until interrupted.
        epochs = itertools.count(epoch)

    for epoch in epochs:
        print('Starting epoch', epoch)
        # NOTE(review): this call passes 5 arguments but the train() defined
        # earlier in this file takes more — confirm which train() variant is
        # the intended callee.
        train(epoch, model, optimizer, data, args)
        checkpointing.checkpoint(model, epoch, optimizer, checkpoint,
                                 args.exp_folder)
    return
def main():
    """Main training program: parse args, build or resume state, run epochs.

    NOTE(review): this is a byte-for-byte duplicate definition of main()
    earlier in the file (Python keeps the last one) — confirm which copy
    should survive and delete the other.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("exp_folder", help="Experiment folder")
    argparser.add_argument("data", help="Training data")
    argparser.add_argument("--force_restart", action="store_true",
                           default=False,
                           help="Force restart of experiment: "
                           "will ignore checkpoints")
    argparser.add_argument("--word_rep",
                           help="Text file containing pre-trained "
                           "word representations.")
    # BUG FIX: ``type=bool`` makes every non-empty string truthy, so
    # ``--cuda False`` parsed as True. Parse the string explicitly instead;
    # the CLI shape (``--cuda <value>``) is unchanged.
    argparser.add_argument("--cuda",
                           type=lambda s: s.lower() in ('1', 'true', 'yes'),
                           default=torch.cuda.is_available(),
                           help="Use GPU if possible")
    argparser.add_argument("--use_covariance", action="store_true",
                           default=False,
                           help="Do not assume diagonal covariance matrix "
                           "when generating random word representations.")
    args = argparser.parse_args()

    config_filepath = os.path.join(args.exp_folder, 'config.yaml')
    with open(config_filepath) as f:
        # safe_load: plain yaml.load without a Loader is deprecated and can
        # construct arbitrary Python objects from the config file.
        config = yaml.safe_load(f)

    checkpoint, training_state, epoch = try_to_resume(
        args.force_restart, args.exp_folder)

    if checkpoint:
        print('Resuming training...')
        model, id_to_token, id_to_char, optimizer, data = reload_state(
            checkpoint, training_state, config, args)
    else:
        print('Preparing to train...')
        model, id_to_token, id_to_char, optimizer, data = init_state(
            config, args)
        # Explicit append mode: recent h5py versions no longer default to 'a'.
        checkpoint = h5py.File(
            os.path.join(args.exp_folder, 'checkpoint'), 'a')
        checkpointing.save_vocab(checkpoint, 'vocab', id_to_token)
        checkpointing.save_vocab(checkpoint, 'c_vocab', id_to_char)

    if torch.cuda.is_available() and args.cuda:
        data.tensor_type = torch.cuda.LongTensor

    train_for_epochs = config.get('training', {}).get('epochs')
    if train_for_epochs is not None:
        epochs = range(epoch, train_for_epochs)
    else:
        # No epoch limit configured: train until interrupted.
        epochs = itertools.count(epoch)

    for epoch in epochs:
        print('Starting epoch', epoch)
        # NOTE(review): this call passes 5 arguments but the train() defined
        # earlier in this file takes more — confirm which train() variant is
        # the intended callee.
        train(epoch, model, optimizer, data, args)
        checkpointing.checkpoint(model, epoch, optimizer, checkpoint,
                                 args.exp_folder)
    return
# NOTE(review): orphaned fragment — this span duplicates the body of main()
# (resume-or-init, vocab save, CUDA tensor type, epoch loop) but has no
# enclosing ``def``, so it sits at module level where ``args`` and ``config``
# are undefined and it cannot run as-is. Presumably left over from a merge or
# paste — confirm and either delete it or fold it back into main().
checkpoint, training_state, epoch = try_to_resume(args.force_restart, args.exp_folder) if checkpoint: print('Resuming training...') model, id_to_token, id_to_char, optimizer, data = reload_state( checkpoint, training_state, config, args) else: print('Preparing to train...') model, id_to_token, id_to_char, optimizer, data = init_state( config, args) checkpoint = h5py.File(os.path.join(args.exp_folder, 'checkpoint')) checkpointing.save_vocab(checkpoint, 'vocab', id_to_token) checkpointing.save_vocab(checkpoint, 'c_vocab', id_to_char) if torch.cuda.is_available() and args.cuda: data.tensor_type = torch.cuda.LongTensor train_for_epochs = config.get('training', {}).get('epochs') if train_for_epochs is not None: epochs = range(epoch, train_for_epochs) else: epochs = itertools.count(epoch) for epoch in epochs: print('Starting epoch', epoch) train(epoch, model, optimizer, data, args) checkpointing.checkpoint(model, epoch, optimizer, checkpoint, args.exp_folder) return
def main():
    """Main training program (Logger / teacher-forcing variant).

    NOTE(review): this is the third definition of main() in the file; being
    last, it is the one that actually runs — confirm the earlier copies can
    be removed.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("exp_folder", help="Experiment folder")
    argparser.add_argument("data", help="Training data")
    argparser.add_argument("--force_restart", action="store_true",
                           default=False,
                           help="Force restart of experiment: "
                           "will ignore checkpoints")
    argparser.add_argument("--word_rep",
                           help="Text file containing pre-trained "
                           "word representations.")
    # BUG FIX: ``type=bool`` makes every non-empty string truthy, so
    # ``--cuda False`` parsed as True. Parse the string explicitly instead;
    # the CLI shape (``--cuda <value>``) is unchanged.
    argparser.add_argument("--cuda",
                           type=lambda s: s.lower() in ('1', 'true', 'yes'),
                           default=torch.cuda.is_available(),
                           help="Use GPU if possible")
    argparser.add_argument("--use_covariance", action="store_true",
                           default=False,
                           help="Do not assume diagonal covariance matrix "
                           "when generating random word representations.")
    args = argparser.parse_args()

    config_filepath = os.path.join(args.exp_folder, 'config.yaml')
    with open(config_filepath) as f:
        # safe_load: plain yaml.load without a Loader is deprecated and can
        # construct arbitrary Python objects from the config file.
        config = yaml.safe_load(f)

    logger = Logger()
    checkpoint, training_state, epoch = try_to_resume(
        logger, args.force_restart, args.exp_folder)

    if checkpoint:
        logger.log('Resuming training...')
        model, id_to_token, id_to_char, optimizer, data = reload_state(
            logger, checkpoint, training_state, config, args)
    else:
        logger.log('Preparing to train...')
        model, id_to_token, id_to_char, optimizer, data = init_state(
            logger, config, args)
        # BUG FIX: the h5py checkpoint creation had been commented out,
        # leaving ``checkpoint`` undefined on this path and crashing with a
        # NameError at the checkpointing call below. Use an explicit None
        # placeholder instead.
        # TODO(review): restore the h5py checkpoint + save_vocab calls if
        # vocab persistence is still wanted.
        checkpoint = None

    if torch.cuda.is_available() and args.cuda:
        data.tensor_type = torch.cuda.LongTensor
        print("TENSOR TYPE:", data.tensor_type)

    train_for_epochs = config.get('training', {}).get('epochs')
    teacher_forcing_ratio = config.get('training', {}).get(
        'teacher_forcing_ratio', 1.0)
    if train_for_epochs is not None:
        epochs = range(epoch, train_for_epochs)
    else:
        # No epoch limit configured: train until interrupted.
        epochs = itertools.count(epoch)

    loss = Loss()
    losses, perplexities = [], []
    # Single initialization (the original assigned best_epoch twice).
    min_loss, min_perplexity, best_epoch = np.nan, np.nan, 0
    for epoch in epochs:
        loss.reset()
        logger.log('\n --- STARTING EPOCH : %d --- \n' % epoch)
        train_epoch(loss, model, optimizer, data, args, logger,
                    teacher_forcing_ratio)
        logger.log('\n --- END OF EPOCH : %d --- \n' % epoch)

        # Compute epoch loss and perplexity; the best epoch is the one with
        # the minimal loss seen so far (nanmin ignores failed epochs).
        epoch_loss = loss.epoch_loss()
        epoch_perplexity = loss.epoch_perplexity()
        losses.append(epoch_loss)
        perplexities.append(epoch_perplexity)
        min_loss, min_perplexity = np.nanmin(losses), np.nanmin(perplexities)
        if min_loss == epoch_loss:
            best_epoch = epoch

        mode = 'train'.upper()
        logger.log('\n\tEpoch [%s] Loss = %.4f, Min [%s] Loss = %.4f'
                   % (mode, epoch_loss, mode, min_loss))
        logger.log('\tEpoch [%s] Perplexity = %.2f, Min [%s] Perplexity = %.2f'
                   % (mode, epoch_perplexity, mode, min_perplexity))
        logger.log('\tBest Epoch = %d' % (best_epoch))

        checkpointing.checkpoint(
            model, epoch, best_epoch, optimizer, checkpoint, args.exp_folder)
    return