def main(args):
    assert os.path.exists(args.src_emb)
    assert os.path.exists(args.tgt_emb)

    src_emb, tgt_emb, mapping, _ = build_model(args, False)
    # get the mapped word embeddings as tensors of shape
    # [max_vocab_size, embedding_size]
    src_emb = mapping(src_emb.weight).data
    tgt_emb = tgt_emb.weight.data
    id2word1 = {id_: word for word, id_ in args.src_dico.word2id.items()}
    id2word2 = {id_: word for word, id_ in args.tgt_dico.word2id.items()}
    top_k_match_ids = get_word_translations(src_emb, tgt_emb, args.knn)

    output_file = '%s-%s.txt' % (args.src_lang, args.tgt_lang)
    print('Writing to %s...' % output_file)
    with open(output_file, 'w', encoding='utf-8') as f:
        for src_id, (tgt_ids, tgt_scores) in enumerate(top_k_match_ids):
            for tgt_id, score in zip(tgt_ids, tgt_scores):
                if args.cuda:
                    tgt_id, score = tgt_id.cpu(), score.cpu()
                if args.output_scores:
                    f.write('%s %s %.4f\n' % (id2word1[src_id],
                                              id2word2[int(tgt_id.numpy())],
                                              float(score.numpy())))
                else:
                    f.write('%s %s\n' % (id2word1[src_id],
                                         id2word2[int(tgt_id.numpy())]))
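# A minimal sketch of how a top-k nearest-neighbour lookup such as
# `get_word_translations` above could be implemented with cosine similarity.
# This is an illustrative assumption, not the repository's actual
# implementation (which may use CSLS or another metric); the function name
# `knn_word_translations` is hypothetical.
import torch
import torch.nn.functional as F


def knn_word_translations(src_emb, tgt_emb, knn):
    """Return (indices, scores) of the `knn` most similar target rows
    for every source row, under cosine similarity."""
    src = F.normalize(src_emb, dim=1)
    tgt = F.normalize(tgt_emb, dim=1)
    scores = src @ tgt.t()                      # [n_src, n_tgt]
    top_scores, top_ids = scores.topk(knn, dim=1)
    return list(zip(top_ids, top_scores))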
def main(cl_arguments):
    ''' Run REPL for a CoLA model '''
    # Arguments handling #
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)
    check_arg_name(args)
    assert args.target_tasks == "cola", \
        "Currently only supporting CoLA. ({})".format(args.target_tasks)

    if args.cuda >= 0:
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected"
                                       " by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
        except Exception:
            log.warning("GPU access failed. You might be using a CPU-only"
                        " installation of PyTorch. Falling back to CPU.")
            args.cuda = -1

    # Prepare data #
    _, target_tasks, vocab, word_embs = build_tasks(args)
    tasks = sorted(set(target_tasks), key=lambda x: x.name)

    # Build or load model #
    model = build_model(args, vocab, word_embs, tasks)
    log.info("Loading existing model from %s...", cl_args.model_file_path)
    load_model_state(model, cl_args.model_file_path,
                     args.cuda, [], strict=False)

    # Inference Setup #
    model.eval()
    vocab = Vocabulary.from_files(os.path.join(args.exp_dir, 'vocab'))
    indexers = build_indexers(args)
    task = take_one(tasks)

    # Run Inference #
    if cl_args.inference_mode == "repl":
        assert cl_args.input_path is None
        assert cl_args.output_path is None
        print("Running REPL for task: {}".format(task.name))
        run_repl(model, vocab, indexers, task, args)
    elif cl_args.inference_mode == "corpus":
        run_corpus_inference(model, vocab, indexers, task, args,
                             cl_args.input_path, cl_args.input_format,
                             cl_args.output_path, cl_args.eval_output_path)
    else:
        raise KeyError(cl_args.inference_mode)
def eval_model(cfg: CfgNode) -> Dict[str, float]:
    model = build_model(cfg)
    chainer.serializers.load_npz(cfg.TEST.CHECKPOINT, model)
    converter = get_converter(data_name=cfg.DATASET.NAME,
                              use_iou=cfg.MODEL.USE_IOU)
    val_iterator = build_dataloader("val", cfg)[0]
    test_iterator = build_dataloader("test", cfg)[0]
    device_id = cfg.TEST.DEVICE

    # Pick the decision threshold that maximizes F1 on the validation set.
    val_pred_scores = get_predicted_scores(model, val_iterator, converter,
                                           device_id)
    label = val_iterator.dataset.label
    precision, recall, thresholds = precision_recall_curve(
        label, val_pred_scores)
    f1 = 2 * (precision * recall) / (precision + recall)
    best_ind = np.nanargmax(f1)
    best_threshold = thresholds[best_ind]

    # Evaluate on the test set with the selected threshold.
    pred_scores = get_predicted_scores(model, test_iterator, converter,
                                       device_id)
    label = test_iterator.dataset.label
    pred_label = pred_scores > best_threshold
    f1 = f1_score(label, pred_label)
    prec = precision_score(label, pred_label)
    recall = recall_score(label, pred_label)
    return {"f1": f1, "precision": prec, "recall": recall}
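# Standalone illustration of the threshold-selection step above: sweep the
# thresholds returned by sklearn's precision_recall_curve and keep the one
# with the best F1. The dummy data and the helper name are for illustration
# only.
import numpy as np
from sklearn.metrics import precision_recall_curve


def best_f1_threshold(y_true, scores):
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    # The last (precision, recall) pair has no associated threshold; drop it.
    f1 = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1])
    return thresholds[np.nanargmax(f1)]


# Usage with toy labels and scores:
# best_f1_threshold(np.array([0, 0, 1, 1, 1]),
#                   np.array([0.1, 0.4, 0.35, 0.8, 0.9]))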
def load_model(model_filename, use_cuda):
    state_dict = torch.load(model_filename)
    model, discriminator = build_model(
        rnn_size=state_dict['rnn_size'],
        output_size=state_dict['output_size'],
        encoder_n_layers=state_dict['encoder_n_layers'],
        decoder_n_layers=state_dict['decoder_n_layers'],
        dropout=state_dict['dropout'],
        discriminator_hidden_size=state_dict['discriminator_hidden_size'],
        max_length=state_dict['max_length'],
        enable_embedding_training=state_dict['enable_embedding_training'],
        use_cuda=use_cuda,
        bidirectional=state_dict['bidirectional'],
        use_attention=state_dict['attention'])
    model.load_state_dict(state_dict['model'])
    discriminator.load_state_dict(state_dict['discriminator'])
    model = model.cuda() if use_cuda else model
    discriminator = discriminator.cuda() if use_cuda else discriminator

    main_optimizer, discriminator_optimizer = init_optimizers(
        model, discriminator)
    main_optimizer.load_state_dict(state_dict['main_optimizer'])
    discriminator_optimizer.load_state_dict(
        state_dict['discriminator_optimizer'])
    return model, discriminator, main_optimizer, discriminator_optimizer
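# A checkpoint consumed by load_model above must bundle hyperparameters,
# model/discriminator weights, and optimizer states in a single dict. A
# matching save routine might look like the following sketch; it is an
# assumed counterpart, not the repository's actual code, and
# `save_model_sketch` is a hypothetical name.
def save_model_sketch(model, discriminator, main_optimizer,
                      discriminator_optimizer, hyperparams, model_filename):
    state_dict = dict(hyperparams)  # e.g. rnn_size, output_size, dropout, ...
    state_dict['model'] = model.state_dict()
    state_dict['discriminator'] = discriminator.state_dict()
    state_dict['main_optimizer'] = main_optimizer.state_dict()
    state_dict['discriminator_optimizer'] = \
        discriminator_optimizer.state_dict()
    torch.save(state_dict, model_filename)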
def main():
    torch.manual_seed(12345)
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    _logger = init_logger(cfg.work_dir, 'INFO')
    _logger.info(cfg)

    print('before init_process')
    init_process(cfg.dist_config)
    print('after init_process')

    print('before build_model')
    model = build_model(cfg.model)
    print('after build_model')

    print('before train_dataloader')
    train_dataloader = get_dataloader(cfg.data.train_data,
                                      cfg.data.train_dataloader)
    print('after train_dataloader')
    val_dataloader = train_dataloader
    dataloaders = {'train': train_dataloader, 'val': val_dataloader}

    try:
        train_model(model, dataloaders, cfg)
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
    dist.destroy_process_group()
def __init__(self, config_path, model_path, model_type):
    print(config_path)
    print(model_path)
    print(model_type)
    self.model_type = model_type

    configs = prepare_configs(config_path)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            **model_configs)
    params = load_model_parameters(model_path, map_location="cpu")
    nmt_model.load_state_dict(params)
    nmt_model.cuda()
    nmt_model.eval()

    self.model = nmt_model
    self.data_configs = data_configs
    self.model_configs = model_configs
    self.vocab_src = vocab_src
    self.vocab_tgt = vocab_tgt
def build_translate_model(victim_config,
                          victim_model_path,
                          vocab_src,
                          vocab_trg,
                          device):
    """Build the translation environment.

    :param victim_config: victim configs
    :param victim_model_path: path to the victim model checkpoint
    :param vocab_src: source vocabulary
    :param vocab_trg: target vocabulary
    :param device: map location (cpu or cuda:*)
    :return: NMT model used in the beam search
    """
    translate_model_configs = victim_config["model_configs"]

    # build model for translation
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_trg.max_n_words,
                            **translate_model_configs)
    nmt_model.to(device)
    INFO("load embedding params to device %s" % device)
    params = load_translate_model(victim_model_path, map_location=device)
    nmt_model.load_state_dict(params)
    INFO("finished building translation model for environment on %s" % device)
    return nmt_model
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # set cuda
    cfg.cuda = not args.no_cuda and torch.cuda.is_available()

    # set cudnn_benchmark & cudnn_deterministic
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    if cfg.get('cudnn_deterministic', False):
        torch.backends.cudnn.deterministic = True

    # update configs according to args
    if not hasattr(cfg, 'work_dir'):
        if args.work_dir is not None:
            cfg.work_dir = args.work_dir
        else:
            cfg_name = rm_suffix(os.path.basename(args.config))
            cfg.work_dir = os.path.join('./data/work_dir', cfg_name)
    mkdir_if_no_exists(cfg.work_dir, is_folder=True)

    cfg.load_from = args.load_from
    cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus
    cfg.distributed = args.distributed
    cfg.random_conns = args.random_conns
    cfg.eval_interim = args.eval_interim
    cfg.save_output = args.save_output
    cfg.force = args.force

    for data in ['train_data', 'test_data']:
        if not hasattr(cfg, data):
            continue
        cfg[data].eval_interim = cfg.eval_interim
        if not hasattr(cfg[data], 'knn_graph_path') or not os.path.isfile(
                cfg[data].knn_graph_path):
            cfg[data].prefix = cfg.prefix
            cfg[data].knn = cfg.knn
            cfg[data].knn_method = cfg.knn_method
            name = 'train_name' if data == 'train_data' else 'test_name'
            cfg[data].name = cfg[name]

    logger = create_logger()

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_model(cfg.model['type'], **cfg.model['kwargs'])
    handler = build_handler(args.phase, cfg.model['type'])
    handler(model, cfg, logger)
def init_zero_supervised(vocabulary, save_file, use_cuda):
    model, discriminator = build_model(
        max_length=opt.max_length,
        output_size=vocabulary.size(),
        rnn_size=opt.rnn_size,
        encoder_n_layers=opt.layers,
        decoder_n_layers=opt.layers,
        dropout=opt.dropout,
        use_cuda=use_cuda,
        enable_embedding_training=bool(opt.sv_embedding_training),
        discriminator_hidden_size=opt.discriminator_hidden_size,
        bidirectional=bool(opt.bidirectional),
        use_attention=bool(opt.attention))
    if opt.src_embeddings is not None:
        load_embeddings(model,
                        src_embeddings_filename=opt.src_embeddings,
                        tgt_embeddings_filename=opt.tgt_embeddings,
                        vocabulary=vocabulary)
    model = model.cuda() if use_cuda else model
    discriminator = discriminator.cuda() if use_cuda else discriminator
    print_summary(model)

    trainer = Trainer(vocabulary,
                      max_length=opt.max_length,
                      use_cuda=use_cuda,
                      discriminator_lr=opt.discriminator_lr,
                      main_lr=opt.sv_learning_rate,
                      main_betas=(opt.adam_beta1, 0.999))
    if opt.sv_load_from:
        model, discriminator, main_optimizer, discriminator_optimizer = \
            load_model(opt.sv_load_from, use_cuda)
        trainer.main_optimizer = main_optimizer
        trainer.discriminator_optimizer = discriminator_optimizer
    else:
        pair_file_names = [(opt.train_src_bi, opt.train_tgt_bi), ]
        trainer.train_supervised(
            model, discriminator, pair_file_names, vocabulary,
            num_words_in_batch=opt.sv_num_words_in_batch,
            max_length=opt.max_length,
            save_file=save_file,
            big_epochs=opt.supervised_epochs,
            print_every=opt.print_every,
            save_every=opt.save_every,
            max_batch_count=opt.n_supervised_batches)

    # Freeze the bootstrap model before using it for translation.
    for param in model.parameters():
        param.requires_grad = False
    return Translator(model, vocabulary, use_cuda)
def main():
    torch.manual_seed(0)
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    _logger = init_logger(cfg.work_dir, 'INFO')
    _logger.info(cfg)

    print('before init_process')
    init_process(cfg.dist_config)
    rank = dist.get_rank()
    print('rank={}'.format(rank))
    print('world_size={}'.format(dist.get_world_size()))
    print('after init_process')

    print('before build_model')
    model = build_model(cfg.model)
    print('after build_model')

    print('before train_dataloader')
    if rank in cfg.base_model_ranks:
        train_dataloader = get_dataloader(cfg.data.train_data,
                                          cfg.data.train_dataloader)
        val_dataloader = train_dataloader
        dataloaders = {'train': train_dataloader, 'val': val_dataloader}
    else:
        dataloaders = {'train': None, 'val': None}

    if cfg.data.train_num_samples:
        cfg.data.dataloader_lens = (cfg.data.train_num_samples //
                                    len(cfg.base_model_ranks) //
                                    cfg.data.batch_size)
    else:
        cfg.data.dataloader_lens = (5822653 //
                                    len(cfg.base_model_ranks) //
                                    cfg.data.batch_size)

    if not cfg.load_top:
        if rank in cfg.top_model_ranks:
            cfg.load_from = None

    try:
        train_nbase_mtop_model(model, dataloaders, cfg)
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
    dist.destroy_process_group()
def main(config: DictConfig, ckpt: dict, show_dir: Optional[str] = None):
    # seed
    if config.SEED is not None:
        make_deterministic(seed=config.SEED)

    # data
    test_loader = get_test_loader(config)

    # model
    model = build_model(config, model_state_dict=ckpt['model_state_dict'])

    # test
    test(model=model,
         data_loader=test_loader,
         device=config.DEVICE,
         threshold_edge=config.TEST.THRESHOLD_EDGE,
         show_dir=show_dir)
def get_models(params):
    assert not params.cuda or torch.cuda.is_available()
    assert 0 <= params.dis_dropout < 1
    assert 0 <= params.dis_input_dropout < 1
    assert 0 <= params.dis_smooth < 0.5
    assert params.dis_lambda > 0 and params.dis_steps > 0
    assert 0 < params.lr_shrink <= 1
    assert os.path.isfile(params.src_emb)
    assert os.path.isfile(params.tgt_emb)
    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
    assert params.export in ["", "txt", "pth"]

    # build model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
    trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
    trainer.reload_best()
    evaluator = Evaluator(trainer)
    return evaluator, trainer
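# CSLS retrieval ('csls_knn_10') appears throughout these pipelines. As a
# rough, hedged sketch of the CSLS score from Conneau et al. (2018): cosine
# similarity penalized by each word's average similarity to its k nearest
# neighbours in the other space. Illustrative only; not the repository's
# implementation.
import torch
import torch.nn.functional as F


def csls_scores(src, tgt, k=10):
    src = F.normalize(src, dim=1)
    tgt = F.normalize(tgt, dim=1)
    cos = src @ tgt.t()                               # [n_src, n_tgt]
    r_src = cos.topk(k, dim=1).values.mean(dim=1)     # mean sim of kNN
    r_tgt = cos.topk(k, dim=0).values.mean(dim=0)
    return 2 * cos - r_src.unsqueeze(1) - r_tgt.unsqueeze(0)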
def run_model(params, runid):
    params.exp_name = (params.src_lang + params.tgt_lang
                       if params.exp_name is None else params.exp_name)
    seed = np.random.randint(10000, 20000)
    params.seed = seed
    params.exp_id = str(runid)
    params.exp_path = ''

    # build model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
    trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
    evaluator = Evaluator(trainer)

    base_nn, base_csls = _adversarial(params, logger, trainer, evaluator)
    outputs = {
        "run": runid,
        "seed": seed,
        "base_nn": base_nn,
        "base_csls": base_csls
    }
    return logger, trainer, evaluator, outputs
def get_teacher_model(training_configs, model_configs, vocab_src, vocab_tgt,
                      flags):
    # build teacher model
    if training_configs['use_odc']:
        INFO('Building teacher model...')
        teacher_model = build_model(n_src_vocab=vocab_src.max_n_words,
                                    n_tgt_vocab=vocab_tgt.max_n_words,
                                    padding_idx=vocab_src.pad,
                                    vocab_src=vocab_src,
                                    **model_configs)
        if Constants.USE_GPU:
            teacher_model.cuda()
        if training_configs.get('teacher_model_path', '') != '':
            teacher_model_path = training_configs['teacher_model_path']
            teacher_model.load_state_dict(
                torch.load(teacher_model_path,
                           map_location=Constants.CURRENT_DEVICE),
                strict=False)
        else:
            teacher_model_path = os.path.join(
                flags.saveto, flags.model_name + '.teacher.pth')
        INFO('Done.')
    else:
        teacher_model = None
        teacher_model_path = ''
    return teacher_model, teacher_model_path
def main():
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("unmt")
    logger.propagate = False
    fh = logging.FileHandler(opt.log_file)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    logger.addHandler(fh)
    logger.addHandler(ch)

    use_cuda = torch.cuda.is_available()
    logger.info("Use CUDA: " + str(use_cuda))

    _, _, vocabulary = collect_vocabularies(
        src_vocabulary_path=opt.src_vocabulary,
        tgt_vocabulary_path=opt.tgt_vocabulary,
        all_vocabulary_path=opt.all_vocabulary,
        src_file_names=(opt.train_src_mono, ),
        tgt_file_names=(opt.train_tgt_mono, ),
        src_max_words=opt.src_vocab_size,
        tgt_max_words=opt.tgt_vocab_size,
        reset=bool(opt.reset_vocabularies))

    # Initialize the zero (bootstrap) translation model.
    if opt.src_to_tgt_dict is not None and opt.tgt_to_src_dict is not None:
        zero_model = WordByWordModel(opt.src_to_tgt_dict, opt.tgt_to_src_dict,
                                     vocabulary, opt.max_length)
    elif opt.bootstrapped_model is not None:
        model, discriminator, _, _ = load_model(opt.bootstrapped_model,
                                                use_cuda)
        for param in model.parameters():
            param.requires_grad = False
        zero_model = Translator(model, vocabulary, use_cuda)
    elif opt.train_src_bi is not None and opt.train_tgt_bi is not None:
        zero_model = init_zero_supervised(vocabulary, opt.save_model,
                                          use_cuda)
    else:
        assert False, "Zero model was not initialized"

    trainer = Trainer(vocabulary,
                      max_length=opt.max_length,
                      use_cuda=use_cuda,
                      discriminator_lr=opt.discriminator_lr,
                      main_lr=opt.learning_rate,
                      main_betas=(opt.adam_beta1, 0.999))
    trainer.current_translation_model = zero_model

    model, discriminator = build_model(
        max_length=opt.max_length,
        output_size=vocabulary.size(),
        rnn_size=opt.rnn_size,
        encoder_n_layers=opt.layers,
        decoder_n_layers=opt.layers,
        dropout=opt.dropout,
        use_cuda=use_cuda,
        enable_embedding_training=bool(opt.usv_embedding_training),
        discriminator_hidden_size=opt.discriminator_hidden_size,
        bidirectional=bool(opt.bidirectional),
        use_attention=bool(opt.attention))
    if opt.src_embeddings is not None:
        load_embeddings(model,
                        src_embeddings_filename=opt.src_embeddings,
                        tgt_embeddings_filename=opt.tgt_embeddings,
                        vocabulary=vocabulary)
    model = model.cuda() if use_cuda else model
    print_summary(model)
    print_summary(discriminator)
    discriminator = discriminator.cuda() if use_cuda else discriminator

    if opt.usv_load_from:
        model, discriminator, main_optimizer, discriminator_optimizer = \
            load_model(opt.usv_load_from, use_cuda)
        trainer.main_optimizer = main_optimizer
        trainer.discriminator_optimizer = discriminator_optimizer

    trainer.train(
        model, discriminator,
        src_file_names=[opt.train_src_mono, ],
        tgt_file_names=[opt.train_tgt_mono, ],
        unsupervised_big_epochs=opt.unsupervised_epochs,
        num_words_in_batch=opt.usv_num_words_in_batch,
        print_every=opt.print_every,
        save_every=opt.save_every,
        save_file=opt.save_model,
        n_unsupervised_batches=opt.n_unsupervised_batches,
        enable_unsupervised_backtranslation=opt.enable_unsupervised_backtranslation,
        teacher_forcing=bool(opt.teacher_forcing),
        max_length=opt.max_length)
help="Normalize embeddings before training") # inference parameters parser.add_argument("--multilingual_inference_method", nargs='+', help="which inference methods to use", default=['BI', 'NT', 'CNT', 'CAT']) # parse parameters params = parser.parse_args() # check parameters assert not params.cuda or torch.cuda.is_available() assert all(os.path.isfile(emb) for emb in params.embs) assert len(params.langs) == len(params.embs) assert all([ inf_met in ['BI', 'NT', 'CNT', 'CAT'] or inf_met.startswith('CAT') for inf_met in params.multilingual_inference_method ]) # build logger / model / trainer / evaluator logger = initialize_exp(params) embs, mapping = build_model(params) trainer = Trainer(embs, mapping, params) evaluator = Evaluator(trainer) """ Inference with MWT (Multilingual Word Translation) """ logger.info('Starting inference...') # embeddings evaluation evaluator.word_translation() logger.info('End of inference.\n\n')
params = parser.parse_args()

# check parameters
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):
        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
def ensemble_translate(FLAGS):
    GlobalNames.USE_GPU = FLAGS.use_gpu

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    valid_dataset = TextLineDataset(data_path=FLAGS.source_path,
                                    vocabulary=vocab_src)
    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')
    timer.tic()
    nmt_models = []
    model_path = FLAGS.model_path
    for ii in range(len(model_path)):
        nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                                n_tgt_vocab=vocab_tgt.max_n_words,
                                **model_configs)
        nmt_model.eval()
        INFO('Done. Elapsed time {0}'.format(timer.toc()))

        INFO('Reloading model parameters...')
        timer.tic()
        params = load_model_parameters(model_path[ii], map_location="cpu")
        nmt_model.load_state_dict(params)
        if GlobalNames.USE_GPU:
            nmt_model.cuda()
        nmt_models.append(nmt_model)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin...')
    result_numbers = []
    result = []
    n_words = 0
    timer.tic()

    infer_progress_bar = tqdm(total=len(valid_iterator),
                              desc=' - (Infer)  ',
                              unit="sents")
    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:
        numbers, seqs_x = batch
        batch_size_t = len(seqs_x)
        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)
        with torch.no_grad():
            word_ids = ensemble_beam_search(nmt_models=nmt_models,
                                            beam_size=FLAGS.beam_size,
                                            max_steps=FLAGS.max_steps,
                                            src_seqs=x,
                                            alpha=FLAGS.alpha)
        word_ids = word_ids.cpu().numpy().tolist()

        # Keep the batch numbering so the original order can be restored.
        result_numbers += numbers

        # Append result
        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)
            n_words += len(sent_t[0])
        infer_progress_bar.update(batch_size_t)

    infer_progress_bar.close()
    INFO('Done. Speed: {0:.2f} words/sec'.format(
        n_words / (timer.toc(return_seconds=True))))

    translation = []
    for sent in result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(vocab_tgt.id2token(w))
            samples.append(vocab_tgt.tokenizer.detokenize(sample))
        translation.append(samples)

    # resume the ordering
    origin_order = np.argsort(result_numbers).tolist()
    translation = [translation[ii] for ii in origin_order]

    keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(FLAGS.beam_size,
                                                           FLAGS.keep_n)
    outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]
    with batch_open(outputs, 'w') as handles:
        for trans in translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    handles[i].write('%s\n' % 'eos')
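# Tiny illustration of the argsort-based order restoration used above: when
# items arrive tagged with their original indices (the `numbering=True`
# iterator), sorting the tags recovers the input order. Toy data only.
import numpy as np

tags = [2, 0, 1]         # order in which sentences were emitted
outs = ['c', 'a', 'b']   # outputs in emission order
restored = [outs[i] for i in np.argsort(tags).tolist()]
assert restored == ['a', 'b', 'c']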
def train(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu
    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging;
    # else write log of training to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    train_bitext_dataset = TextLineDataset(
        data_path=data_configs['train_data'][0],
        vocabulary=vocab_src,
        max_len=data_configs['max_len'][0],
        is_train_dataset=True)
    valid_bitext_dataset = TextLineDataset(
        data_path=data_configs['valid_data'][0],
        vocabulary=vocab_src,
        is_train_dataset=False)

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)
    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True,
        shuffle=False,
        world_size=world_size,
        rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to gpu if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(vocab_size=vocab_src.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    # Loss function
    critic = torch.nn.CrossEntropyLoss(ignore_index=Constants.PAD)
    INFO(critic)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optimizer = torch.optim.Adam(nmt_model.parameters(),
                                 lr=optimizer_configs['learning_rate'])
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop",
                                                     [False, ])[-1]

    train_loss_meter = AverageMeter()
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:
        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        if rank == 0:
            training_progress_bar = tqdm(
                desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                total=len(training_iterator),
                unit="sents")
        else:
            training_progress_bar = None

        for batch in training_iter:
            seqs_x = batch
            batch_size = len(seqs_x)
            cum_n_words = 0.0
            train_loss = 0.0

            try:
                # Prepare data
                grad_denom += batch_size
                x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU)

                nmt_model.train()
                critic.train()
                critic.zero_grad()
                with torch.enable_grad():
                    logits = nmt_model(x[:-1])
                    logits = logits.view(-1, vocab_src.max_n_words)
                    trg = x[1:].view(-1)
                    loss = critic(logits, trg)
                    loss.backward()
                optimizer.step()

                valid_token = (trg != Constants.PAD).long().sum().item()
                cum_n_words += valid_token
                train_loss += loss.item() * valid_token
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # End of one step: update the progress bar and meters, then
            # reset the per-step accumulators and advance uidx.
            if training_progress_bar is not None:
                training_progress_bar.update(grad_denom)
                training_progress_bar.set_description(
                    ' - (Epc {}, Upd {}) '.format(eidx, uidx))
                postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f})'.format(
                    train_loss / cum_n_words, valid_loss, best_valid_loss)
                training_progress_bar.set_postfix_str(postfix_str)

            # update meters
            train_loss_meter.update(train_loss, cum_n_words)
            sent_per_sec_meter.update(grad_denom)
            tok_per_sec_meter.update(cum_n_words)

            # reset accumulated variables, update uidx
            grad_denom = 0
            uidx += 1
            cum_n_words = 0.0
            train_loss = 0.0

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):
                if summary_writer is not None:
                    summary_writer.add_scalar(
                        "Speed(sents/sec)",
                        scalar_value=sent_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "Speed(words/sec)",
                        scalar_value=tok_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "train_loss",
                        scalar_value=train_loss_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)

                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    min_step=training_configs['bleu_valid_warmup'],
                    debug=flags.debug):
                valid_iter = valid_iterator.build_generator()
                valid_loss = 0
                total_tokens = 0
                for batch in valid_iter:
                    seq_number, seqs_x = batch
                    x = prepare_data(seqs_x, seqs_y=None,
                                     cuda=Constants.USE_GPU)
                    nmt_model.eval()
                    critic.eval()
                    with torch.no_grad():
                        logits = nmt_model(x[:-1])
                        logits = logits.view(-1, vocab_src.max_n_words)
                        trg = x[1:].view(-1)
                        loss = critic(logits, trg)
                    valid_token = (trg != Constants.PAD).long().sum().item()
                    total_tokens += valid_token
                    valid_loss += loss.item() * valid_token
                valid_loss = valid_loss / total_tokens

                model_collections.add_to_collection("history_losses",
                                                    valid_loss)
                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()
                best_valid_loss = min_history_loss

                if summary_writer is not None:
                    summary_writer.add_scalar("loss", valid_loss,
                                              global_step=uidx)
                    summary_writer.add_scalar("best_loss", min_history_loss,
                                              global_step=uidx)

                # If the model gets a new best valid loss
                if valid_loss <= best_valid_loss:
                    bad_count = 0
                    if is_early_stop is False:
                        if rank == 0:
                            # 1. save the best model
                            torch.save(nmt_model.state_dict(),
                                       best_model_prefix + ".final")
                            # 2. record the several best models
                            best_model_saver.save(
                                global_step=uidx,
                                model=nmt_model,
                                optimizer=optimizer,
                                collections=model_collections)
                else:
                    bad_count += 1
                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] \
                            and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")
                        exit(0)

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} patience: {2}".format(
                    uidx, valid_loss, bad_count))

            # ================================================================================== #
            # # Saving checkpoints
            # if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=flags.debug):
            #     model_collections.add_to_collection("uidx", uidx)
            #     model_collections.add_to_collection("eidx", eidx)
            #     model_collections.add_to_collection("bad_count", bad_count)
            #
            #     if not is_early_stop:
            #         if rank == 0:
            #             checkpoint_saver.save(global_step=uidx,
            #                                   model=nmt_model,
            #                                   optim=optimizer,
            #                                   collections=model_collections)

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
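# Minimal self-contained illustration of the shifted-target language-model
# loss used in the training loop above: logits for positions 0..T-2 are
# scored against tokens 1..T-1, with PAD ignored. The shapes, PAD id, and
# random inputs are assumptions for this sketch.
import torch

PAD = 0
vocab, T, B = 11, 7, 3
x = torch.randint(1, vocab, (T, B))      # [time, batch] token ids
logits = torch.randn(T - 1, B, vocab)    # stand-in for model outputs on x[:-1]
critic = torch.nn.CrossEntropyLoss(ignore_index=PAD)
loss = critic(logits.view(-1, vocab), x[1:].reshape(-1))
print(float(loss))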
def main():
    params = load_args()
    logger = create_logger(
        os.path.join(params.exp_path, "lnmap-experiment.log"))
    logger.info("{}".format(
        jsbeautifier.beautify(json.dumps(params.__dict__), opts)))
    set_seed(params)

    src_emb, tgt_emb, mapping_G, mapping_F, encoder_A, decoder_A, \
        encoder_B, decoder_B = build_model(params)
    trainer = Trainer(src_emb, tgt_emb, mapping_G, mapping_F, encoder_A,
                      decoder_A, encoder_B, decoder_B, params)
    evaluator = Evaluator(trainer)

    trainer.load_training_dico(logger)
    trainer.load_training_dico(logger, src2tgt=False)
    logger.info("Seed dictionary size: {}".format(trainer.dico_AB.shape[0]))
    trainer.dico_AB_original = trainer.dico_AB.clone()
    trainer.dico_BA_original = trainer.dico_BA.clone()

    if params.load_autoenc_weights:
        load_autoenc_weights(params, trainer, logger)
    else:
        trainer.train_autoencoder_A(logger)
        trainer.train_autoencoder_B(logger)
        if params.save_autoenc_weights:
            save_autoenc_weights(params, trainer, logger)

    # Source to Target Training
    logger.info("\n \n Training for {} to {}".format(params.src_lang,
                                                     params.tgt_lang))
    for i in range(params.iteration):
        trainer.train_A2B()
        emb1 = (trainer.mapping_G(
            trainer.encoder_A(
                trainer.src_emb.weight.data)).data)[0:params.dico_max_rank]
        emb2 = (trainer.encoder_B(
            trainer.tgt_emb.weight.data).data)[0:params.dico_max_rank]
        emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
        emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)
        all_pairs, all_scores = generate_new_dictionary_bidirectional(
            emb1, emb2)
        add_size = params.induced_dico_c * (i + 1)
        trainer.dico_AB = torch.cat(
            (trainer.dico_AB_original, all_pairs[:add_size].cuda()), 0)
        if i == 0:
            logger.info(
                "After first iteration train dictionary size: {}".format(
                    trainer.dico_AB.shape[0]))
    logger.info("Final iteration train dictionary size: {}".format(
        trainer.dico_AB.shape[0]))

    trainer.set_eval()
    precision_at_1 = get_word_translation_accuracy(
        params,
        trainer.mapping_G(trainer.encoder_A(
            trainer.src_emb.weight.data).data).data,
        trainer.encoder_B(trainer.tgt_emb.weight.data).data,
        src2tgt=True)
    if params.save_model_weights:
        save_model_weights(params, trainer, src2tgt=True)

    # Target to Source Training
    logger.info("\n \n Training for {} to {}".format(params.tgt_lang,
                                                     params.src_lang))
    for i in range(params.iteration):
        trainer.train_B2A()
        emb1 = ((trainer.encoder_A(
            trainer.src_emb.weight.data)).data)[0:params.dico_max_rank]
        emb2 = (trainer.mapping_F(
            trainer.encoder_B(
                trainer.tgt_emb.weight.data)).data)[0:params.dico_max_rank]
        emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
        emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)
        all_pairs, all_scores = generate_new_dictionary_bidirectional(
            emb2, emb1)
        add_size = params.induced_dico_c * (i + 1)
        trainer.dico_BA = torch.cat(
            (trainer.dico_BA_original, all_pairs[:add_size].cuda()), 0)
        if i == 0:
            logger.info(
                "After first iteration train dictionary size: {}".format(
                    trainer.dico_BA.shape[0]))
    logger.info("Final iteration train dictionary size: {}".format(
        trainer.dico_BA.shape[0]))

    trainer.set_eval()
    precision_at_1 = get_word_translation_accuracy(
        params,
        trainer.mapping_F(trainer.encoder_B(
            trainer.tgt_emb.weight.data).data).data,
        trainer.encoder_A(trainer.src_emb.weight.data).data,
        src2tgt=False)
    if params.save_model_weights:
        save_model_weights(params, trainer, src2tgt=False)
# check parameters
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert (params.dico_eval == 'default' or params.dico_eval == 'vecmap'
        or os.path.isfile(params.dico_eval))
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping_G, mapping_F, discriminator_A, discriminator_B, \
    encoder_A, decoder_A, encoder_B, decoder_B = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping_G, mapping_F, discriminator_A,
                  discriminator_B, encoder_A, decoder_A, encoder_B,
                  decoder_B, params)
evaluator = Evaluator(trainer)

"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    # first train the autoencoders until they are mature
    trainer.train_autoencoder_A()
    trainer.train_autoencoder_B()

    logger.info('----> ADVERSARIAL TRAINING <----\n\n')
# parse parameters
params = parser.parse_args()

# check parameters
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):
        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}
def train(FLAGS):
    """
    FLAGS:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # write log of training to file.
    write_log_to_file(
        os.path.join(FLAGS.log_path,
                     "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    GlobalNames.USE_GPU = FLAGS.use_gpu
    if GlobalNames.USE_GPU:
        CURRENT_DEVICE = "cuda:0"
    else:
        CURRENT_DEVICE = "cpu"

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']
    set_seed(GlobalNames.SEED)

    best_model_prefix = os.path.join(
        FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][0])

    train_batch_size = training_configs["batch_size"] * max(
        1, training_configs["update_cycle"])
    train_buffer_size = training_configs["buffer_size"] * max(
        1, training_configs["update_cycle"])

    train_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['train_data'][0],
            vocabulary=vocab_tgt,
            max_len=data_configs['max_len'][0],
        ),
        shuffle=training_configs['shuffle'])
    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['valid_data'][0],
            vocabulary=vocab_tgt,
        ))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=train_batch_size,
        use_bucket=training_configs['use_bucket'],
        buffer_size=train_buffer_size,
        batching_func=training_configs['batching_key'])
    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to gpu if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(FLAGS.saveto, FLAGS.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    lm_model = build_model(n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(lm_model)

    params_total = sum([p.numel() for n, p in lm_model.named_parameters()])
    params_with_embedding = sum([
        p.numel() for n, p in lm_model.named_parameters()
        if n.find('embedding') == -1
    ])
    INFO('Total parameters: {}'.format(params_total))
    INFO('Total parameters (excluding word embeddings): {}'.format(
        params_with_embedding))

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])
    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        lm_model = lm_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    lm_model.init_parameters(FLAGS.pretrain_path, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=lm_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params'])

    # 5. Build scheduler for optimizer if needed
    if optimizer_configs['schedule_method'] is not None:
        if optimizer_configs['schedule_method'] == "loss":
            scheduler = ReduceOnPlateauScheduler(
                optimizer=optim, **optimizer_configs["scheduler_configs"])
        elif optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(
                optimizer=optim, **optimizer_configs['scheduler_configs'])
        else:
            WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(
                optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    # 6. build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(
            moving_average_method=training_configs['moving_average_method'],
            named_params=lm_model.named_parameters(),
            alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=lm_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma)

    # ================================================================================== #
    # Prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]

    summary_writer = SummaryWriter(log_dir=FLAGS.log_path)

    cum_samples = 0
    cum_words = 0
    valid_loss = best_valid_loss = float('inf')  # Max Float
    saving_files = []

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:
        summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        training_progress_bar = tqdm(
            desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
            total=len(training_iterator),
            unit="sents")

        for batch in training_iter:
            uidx += 1

            if optimizer_configs["schedule_method"] is not None \
                    and optimizer_configs["schedule_method"] != "loss":
                scheduler.step(global_step=uidx)

            seqs_y = batch
            n_samples_t = len(seqs_y)
            n_words_t = sum(len(s) for s in seqs_y)
            cum_samples += n_samples_t
            cum_words += n_words_t
            train_loss = 0.

            optim.zero_grad()
            try:
                # Prepare data
                for (seqs_y_t, ) in split_shard(
                        seqs_y, split_size=training_configs['update_cycle']):
                    y = prepare_data(seqs_y_t, cuda=GlobalNames.USE_GPU)
                    loss = compute_forward(
                        model=lm_model,
                        critic=critic,
                        # seqs_x=x,
                        seqs_y=y,
                        eval=False,
                        normalization=n_samples_t,
                        norm_by_words=training_configs["norm_by_words"])
                    train_loss += loss / y.size(1) \
                        if not training_configs["norm_by_words"] else loss
                optim.step()
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                    optim.zero_grad()
                else:
                    raise e

            if ma is not None and eidx >= training_configs[
                    'moving_average_start_epoch']:
                ma.step()

            training_progress_bar.update(n_samples_t)
            training_progress_bar.set_description(
                ' - (Epc {}, Upd {}) '.format(eidx, uidx))
            training_progress_bar.set_postfix_str(
                'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f})'.format(
                    train_loss, valid_loss, best_valid_loss))
            summary_writer.add_scalar("train_loss",
                                      scalar_value=train_loss,
                                      global_step=uidx)

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):
                # words per second and sents per second
                words_per_sec = cum_words / (timer.toc(return_seconds=True))
                sents_per_sec = cum_samples / (timer.toc(return_seconds=True))
                lrate = list(optim.get_lrate())[0]

                summary_writer.add_scalar("Speed(words/sec)",
                                          scalar_value=words_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("Speed(sents/sec)",
                                          scalar_value=sents_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("lrate",
                                          scalar_value=lrate,
                                          global_step=uidx)
                summary_writer.add_scalar("oom_count",
                                          scalar_value=oom_count,
                                          global_step=uidx)

                # Reset timer
                timer.tic()
                cum_words = 0
                cum_samples = 0

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(
                    uidx, eidx,
                    every_n_step=training_configs['save_freq'],
                    debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)

                if not is_early_stop:
                    checkpoint_saver.save(global_step=uidx,
                                          model=lm_model,
                                          optim=optim,
                                          lr_scheduler=scheduler,
                                          collections=model_collections,
                                          ma=ma)

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    debug=FLAGS.debug):
                if ma is not None:
                    origin_state_dict = deepcopy(lm_model.state_dict())
                    lm_model.load_state_dict(ma.export_ma_params(),
                                             strict=False)

                valid_loss = loss_validation(
                    model=lm_model,
                    critic=critic,
                    valid_iterator=valid_iterator,
                    norm_by_words=training_configs["norm_by_words"])

                model_collections.add_to_collection("history_losses",
                                                    valid_loss)
                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()

                summary_writer.add_scalar("loss", valid_loss,
                                          global_step=uidx)
                summary_writer.add_scalar("best_loss", min_history_loss,
                                          global_step=uidx)

                if ma is not None:
                    lm_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

                if optimizer_configs["schedule_method"] == "loss":
                    scheduler.step(metric=best_valid_loss)

                # If the model gets a new best valid loss
                if valid_loss < best_valid_loss:
                    bad_count = 0
                    if is_early_stop is False:
                        # 1. save the best model
                        torch.save(lm_model.state_dict(),
                                   best_model_prefix + ".final")
                        # 2. record the several best models
                        best_model_saver.save(global_step=uidx,
                                              model=lm_model)
                else:
                    bad_count += 1
                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] \
                            and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                best_valid_loss = min_history_loss

                summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} lrate: {2:6f} patience: {3}".format(
                    uidx, valid_loss, lrate, bad_count))

        training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
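# The MovingAverage helper used above is repository-specific. As a rough,
# hedged sketch, exponential moving averaging of parameters is commonly
# implemented like this (assumed form and signatures, not the repo's actual
# class):
import torch


class EMASketch:
    def __init__(self, named_params, alpha=0.999):
        self.alpha = alpha
        self.shadow = {n: p.detach().clone() for n, p in named_params}

    def step(self, named_params):
        # shadow <- alpha * shadow + (1 - alpha) * param
        with torch.no_grad():
            for n, p in named_params:
                self.shadow[n].mul_(self.alpha).add_(p, alpha=1 - self.alpha)

    def export_ma_params(self):
        return self.shadow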
def main():
    VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10'
    VALIDATION_METRIC_UNSUP = 'mean_cosine-csls_knn_10-S2T-10000'

    # main
    parser = argparse.ArgumentParser(description='Supervised training')
    parser.add_argument("--seed", type=int, default=-1,
                        help="Initialization seed")
    parser.add_argument("--verbose", type=int, default=2,
                        help="Verbose level (2:debug, 1:info, 0:warning)")
    parser.add_argument("--exp_path", type=str, default="",
                        help="Where to store experiment logs and models")
    parser.add_argument("--exp_name", type=str, default="debug",
                        help="Experiment name")
    parser.add_argument("--exp_id", type=str, default="",
                        help="Experiment ID")
    parser.add_argument("--cuda", type=bool_flag, default=True,
                        help="Run on GPU")
    parser.add_argument("--export", type=str, default="txt",
                        help="Export embeddings after training (txt / pth)")

    # data
    parser.add_argument("--src_lang", type=str, default='en',
                        help="Source language")
    parser.add_argument("--tgt_lang", type=str, default='es',
                        help="Target language")
    parser.add_argument("--aux_lang", type=str, default='',
                        help="Auxiliary language")
    parser.add_argument("--emb_dim", type=int, default=300,
                        help="Embedding dimension")
    parser.add_argument("--max_vocab", type=int, default=200000,
                        help="Maximum vocabulary size (-1 to disable)")

    # training refinement
    parser.add_argument("--n_refinement", type=int, default=5,
                        help="Number of refinement iterations (0 to disable "
                             "the refinement procedure)")

    # dictionary creation parameters (for refinement)
    parser.add_argument("--dico_train", type=str, default="default",
                        help="Path to training dictionary (default: use "
                             "identical character strings)")
    parser.add_argument("--dico_eval", type=str, default="default",
                        help="Path to evaluation dictionary")
    parser.add_argument("--dico_method", type=str, default='csls_knn_10',
                        help="Method used for dictionary generation "
                             "(nn/invsm_beta_30/csls_knn_10)")
    parser.add_argument("--dico_build", type=str, default='S2T&T2S',
                        help="S2T,T2S,S2T|T2S,S2T&T2S")
    parser.add_argument("--dico_threshold", type=float, default=0,
                        help="Threshold confidence for dictionary generation")
    parser.add_argument("--dico_max_rank", type=int, default=10000,
                        help="Maximum dictionary words rank (0 to disable)")
    parser.add_argument("--dico_min_size", type=int, default=0,
                        help="Minimum generated dictionary size (0 to disable)")
    parser.add_argument("--dico_max_size", type=int, default=0,
                        help="Maximum generated dictionary size (0 to disable)")

    # reload pre-trained embeddings
    parser.add_argument("--src_emb", type=str, default='',
                        help="Reload source embeddings")
    parser.add_argument("--tgt_emb", type=str, default='',
                        help="Reload target embeddings")
    parser.add_argument("--aux_emb", type=str, default='',
                        help="Reload auxiliary embeddings")
    parser.add_argument("--normalize_embeddings", type=str, default="",
                        help="Normalize embeddings before training")
    parser.add_argument("--fitting_method", type=str, default="non_iterative",
                        help="Method of fitting, one of [non_iterative, em, "
                             "gauss_seidel, gradient_based]")

    # parse parameters
    params = parser.parse_args()

    # check parameters
    assert not params.cuda or torch.cuda.is_available()
    assert params.dico_train in ["identical_char", "default"] \
        or os.path.isfile(params.dico_train)
    assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
    assert params.dico_max_size == 0 \
        or params.dico_max_size < params.dico_max_rank
    assert params.dico_max_size == 0 \
        or params.dico_max_size > params.dico_min_size
    print(params.src_emb, params.tgt_emb, params.aux_emb)
    assert os.path.isfile(params.src_emb)
    assert os.path.isfile(params.tgt_emb)
    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
    assert params.export in ["", "txt", "pth"]

    # build logger / model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, aux_emb, mapping, _ = build_model(params, False)
    trainer = Trainer(src_emb, tgt_emb, aux_emb, mapping, None, params)

    # load a training dictionary. if a dictionary path is not provided, use a
    # default one ("default") or create one based on identical character
    # strings ("identical_char")
    trainer.load_training_dico(params.dico_train)

    # define the validation metric
    VALIDATION_METRIC = (VALIDATION_METRIC_UNSUP
                         if params.dico_train == 'identical_char'
                         else VALIDATION_METRIC_SUP)
    logger.info("Validation metric: %s" % VALIDATION_METRIC)

    # apply the PCCA solution
    trainer.fit(fitting_method=params.fitting_method)

    # IMPORTANT: EVALUATOR SHOULD BE CREATED AFTER TRAINER HAS BEEN FITTED
    evaluator = Evaluator(trainer)

    # embeddings evaluation
    to_log = OrderedDict({})
    evaluator.all_eval(to_log)
    logger.info("__log__:%s" % json.dumps(to_log))
def main(cl_arguments):
    ''' Train a model for multitask-training. '''
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)
    # Check for deprecated arg names
    check_arg_name(args)
    args, seed = initial_setup(args, cl_args)

    # Load tasks
    log.info("Loading tasks...")
    start_time = time.time()
    pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args)
    tasks = sorted(set(pretrain_tasks + target_tasks), key=lambda x: x.name)
    log.info('\tFinished loading tasks in %.3fs', time.time() - start_time)
    log.info('\t Tasks: {}'.format([task.name for task in tasks]))

    # Build model
    log.info('Building model...')
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks)
    log.info('\tFinished building model in %.3fs', time.time() - start_time)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard")
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    check_configurations(args, pretrain_tasks, target_tasks)

    if args.do_pretrain:
        # Train on pretrain tasks
        log.info("Training...")
        stop_metric = pretrain_tasks[0].val_metric \
            if len(pretrain_tasks) == 1 else 'macro_avg'
        should_decrease = pretrain_tasks[0].val_metric_decreases \
            if len(pretrain_tasks) == 1 else False
        trainer, _, opt_params, schd_params = build_trainer(
            args, [], model, args.run_dir, should_decrease, phase="pretrain")
        to_train = [(n, p) for n, p in model.named_parameters()
                    if p.requires_grad]
        _ = trainer.train(pretrain_tasks, stop_metric, args.batch_size,
                          args.weighting_method, args.scaling_method,
                          to_train, opt_params, schd_params,
                          args.shared_optimizer, args.load_model,
                          phase="pretrain")

    # For checkpointing logic
    if not args.do_target_task_training:
        log.info("In strict mode because do_target_task_training is off. "
                 "Will crash if any tasks are missing from the checkpoint.")
        strict = True
    else:
        strict = False

    if args.do_target_task_training:
        # Train on target tasks
        task_names_to_avoid_loading = setup_target_task_training(
            args, target_tasks, model, strict)
        if args.transfer_paradigm == "frozen":
            # might be empty if elmo = 0. scalar_mix_0 should always be
            # pretrain scalars
            elmo_scalars = [(n, p) for n, p in model.named_parameters()
                            if "scalar_mix" in n and "scalar_mix_0" not in n]
            # Fails when sep_embs_for_skip is 0 and elmo_scalars has nonzero
            # length.
            assert_for_log(
                not elmo_scalars or args.sep_embs_for_skip,
                "Error: ELMo scalars loaded and will be updated in "
                "do_target_task_training but they should not be updated! "
                "Check sep_embs_for_skip flag or make an issue.")
        for task in target_tasks:
            # Skip mnli-diagnostic. It has to be handled differently from
            # probing tasks, which require "is_probing_task" to be True; for
            # mnli-diagnostic the flag is False because it is part of GLUE,
            # and "is_probing_task" is a global flag specific to a run, not
            # to a task.
            if task.name == 'mnli-diagnostic':
                continue

            if args.transfer_paradigm == "finetune":
                # Train both the task-specific modules and the sentence
                # encoder.
                to_train = [(n, p) for n, p in model.named_parameters()
                            if p.requires_grad]
            else:  # args.transfer_paradigm == "frozen"
                # Only train the task-specific module.
                pred_module = getattr(model, "%s_mdl" % task.name)
                to_train = [(n, p) for n, p in pred_module.named_parameters()
                            if p.requires_grad]
                to_train += elmo_scalars

            trainer, _, opt_params, schd_params = build_trainer(
                args, [task.name, 'target_train'], model, args.run_dir,
                task.val_metric_decreases, phase="target_train")
            _ = trainer.train(tasks=[task],
                              stop_metric=task.val_metric,
                              batch_size=args.batch_size,
                              weighting_method=args.weighting_method,
                              scaling_method=args.scaling_method,
                              train_params=to_train,
                              optimizer_params=opt_params,
                              scheduler_params=schd_params,
                              shared_optimizer=args.shared_optimizer,
                              load_model=False,
                              phase="target_train")

            # Now that we've trained a model, revert to the normal checkpoint
            # logic for this task.
            if task.name in task_names_to_avoid_loading:
                task_names_to_avoid_loading.remove(task.name)

            # The best checkpoint will accumulate the best parameters for
            # each task.
            layer_path = os.path.join(args.run_dir,
                                      "model_state_target_train_best.th")

            if args.transfer_paradigm == "finetune":
                # Save this fine-tuned model with a task-specific name.
                finetune_path = os.path.join(
                    args.run_dir, "model_state_%s_best.th" % task.name)
                os.rename(layer_path, finetune_path)

                # Reload the original best model from before target-task
                # training.
                pre_finetune_path = get_best_checkpoint_path(args.run_dir)
                load_model_state(model, pre_finetune_path, args.cuda,
                                 skip_task_models=[], strict=strict)
            else:  # args.transfer_paradigm == "frozen"
                # Load the current overall best model. Save the best
                # checkpoint from that target-task training to be specific
                # to that target task.
                load_model_state(model, layer_path, args.cuda, strict=strict,
                                 skip_task_models=task_names_to_avoid_loading)

    if args.do_full_eval:
        # Evaluate
        log.info("Evaluating...")
        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        if args.transfer_paradigm == "finetune":
            for task in target_tasks:
                if task.name == 'mnli-diagnostic':
                    # we'll load mnli-diagnostic during mnli
                    continue
                # Special checkpointing logic here since we train the
                # sentence encoder and have a best set of sentence-encoder
                # weights per task.
                finetune_path = os.path.join(
                    args.run_dir, "model_state_%s_best.th" % task.name)
                if os.path.exists(finetune_path):
                    ckpt_path = finetune_path
                else:
                    ckpt_path = get_best_checkpoint_path(args.run_dir)
                load_model_state(model, ckpt_path, args.cuda,
                                 skip_task_models=[], strict=strict)
                tasks = [task]
                if task.name == 'mnli':
                    tasks += [t for t in target_tasks
                              if t.name == 'mnli-diagnostic']
                evaluate_and_write(args, model, tasks, splits_to_write)
        elif args.transfer_paradigm == "frozen":
            # No special checkpointing logic here, since the model already
            # has all the trained task-specific modules.
            evaluate_and_write(args, model, target_tasks, splits_to_write)

    log.info("Done!")
# reload pre-trained embeddings
parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings")
parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings")
parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size")
parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
parser.add_argument("--normalize_embeddings", type=str, default="",
                    help="Normalize embeddings before training")

# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations
to_log = OrderedDict({'n_iter': 0})
evaluator.monolingual_wordsim(to_log)
if params.tgt_lang:
    evaluator.crosslingual_wordsim(to_log)
    evaluator.word_translation(to_log)
    evaluator.sent_translation(to_log)
    # evaluator.dist_mean_cosine(to_log)
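
# Hedged sketch of what a "--normalize_embeddings" spec like the one above
# might apply. The function name and op names are illustrative assumptions,
# not this library's confirmed API; a comma-separated spec such as
# "center,renorm" is applied in order:
def normalize(emb, ops):
    # emb: FloatTensor [vocab_size, emb_dim]; ops: e.g. "center,renorm" or ""
    for op in filter(None, ops.split(',')):
        if op == 'center':
            emb = emb - emb.mean(0, keepdim=True)  # zero-mean each dimension
        elif op == 'renorm':
            emb = emb / emb.norm(2, 1, keepdim=True).clamp(min=1e-8)  # unit L2 rows
        else:
            raise ValueError('Unknown normalization: %s' % op)
    return emb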
def main(cl_arguments):
    ''' Train or load a model. Evaluate on some tasks. '''
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)

    # Logistics #
    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)      # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)      # e.g. <project_dir>/jiant-demo/sst
    log.getLogger().addHandler(log.FileHandler(args.local_log_path))

    if cl_args.remote_log:
        gcp.configure_remote_logging(args.remote_log_name)

    if cl_args.notify:
        from src import emails
        global EMAIL_NOTIFIER
        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)

    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")

    _try_logging_git_info()

    log.info("Parsed args: \n%s", args)

    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)
    log.info("Saved config to %s", config_file)

    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if args.cuda >= 0:
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected"
                                       " by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            log.warning("GPU access failed. You might be using a CPU-only"
                        " installation of PyTorch. Falling back to CPU.")
            args.cuda = -1

    # Prepare data #
    log.info("Loading tasks...")
    start_time = time.time()
    train_tasks, eval_tasks, vocab, word_embs = build_tasks(args)
    if any(t.val_metric_decreases for t in train_tasks) and \
            any(not t.val_metric_decreases for t in train_tasks):
        log.warning("\tMixing training tasks with increasing and decreasing val metrics!")
    tasks = sorted(set(train_tasks + eval_tasks), key=lambda x: x.name)
    log.info('\tFinished loading tasks in %.3fs', time.time() - start_time)
    log.info('\t Tasks: {}'.format([task.name for task in tasks]))

    # Build or load model #
    log.info('Building model...')
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks)
    log.info('\tFinished building model in %.3fs', time.time() - start_time)

    # Check that necessary parameters are set for each step.
    # Exit with error if not.
    steps_log = []

    if not args.load_eval_checkpoint == 'none':
        assert_for_log(os.path.exists(args.load_eval_checkpoint),
                       "Error: Attempting to load model from non-existent path: [%s]" %
                       args.load_eval_checkpoint)
        assert_for_log(not args.do_train,
                       "Error: Attempting to train a model and then replace that model"
                       " with one from a checkpoint.")
        steps_log.append("Loading model from path: %s" % args.load_eval_checkpoint)

    if args.do_train:
        assert_for_log(args.train_tasks != "none",
                       "Error: Must specify at least one training task: [%s]" %
                       args.train_tasks)
        assert_for_log(args.val_interval % args.bpp_base == 0,
                       "Error: val_interval [%d] must be divisible by bpp_base [%d]" %
                       (args.val_interval, args.bpp_base))
        steps_log.append("Training model on tasks: %s" % args.train_tasks)

    if args.train_for_eval:
        steps_log.append("Re-training model for individual eval tasks")
        assert_for_log(args.eval_val_interval % args.bpp_base == 0,
                       "Error: eval_val_interval [%d] must be divisible by bpp_base [%d]" %
                       (args.eval_val_interval, args.bpp_base))
        assert_for_log(len(set(train_tasks).intersection(eval_tasks)) == 0 or
                       args.allow_reuse_of_pretraining_parameters or
                       args.do_train == 0,
                       "If you're pretraining on a task you plan to reuse as a target task, set\n"
                       "allow_reuse_of_pretraining_parameters = 1 (risky), or train in two steps:\n"
                       " train with do_train = 1, train_for_eval = 0, stop, and restart with\n"
                       " do_train = 0 and train_for_eval = 1.")

    if args.do_eval:
        assert_for_log(args.eval_tasks != "none",
                       "Error: Must specify at least one eval task: [%s]" % args.eval_tasks)
        steps_log.append("Evaluating model on tasks: %s" % args.eval_tasks)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard")
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    log.info("Will run the following steps:\n%s", '\n'.join(steps_log))

    if args.do_train:
        # Train on train tasks #
        log.info("Training...")
        params = build_trainer_params(args, task_names=[])
        stop_metric = train_tasks[0].val_metric if len(train_tasks) == 1 else 'macro_avg'
        should_decrease = train_tasks[0].val_metric_decreases if len(train_tasks) == 1 else False
        trainer, _, opt_params, schd_params = build_trainer(params, model,
                                                            args.run_dir, should_decrease)
        to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        best_epochs = trainer.train(train_tasks, stop_metric, args.batch_size,
                                    args.bpp_base, args.weighting_method,
                                    args.scaling_method, to_train, opt_params,
                                    schd_params, args.shared_optimizer,
                                    args.load_model, phase="main")

    # Select model checkpoint from main training run to load
    if not args.train_for_eval:
        log.info("In strict mode because train_for_eval is off. "
                 "Will crash if any tasks are missing from the checkpoint.")
        strict = True
    else:
        strict = False

    if args.train_for_eval and not args.allow_reuse_of_pretraining_parameters:
        # If we're training models for evaluation, which is always done from scratch
        # with a fresh optimizer, we shouldn't load parameters for those models.
        # Usually there won't be trained parameters to skip, but this can happen
        # if a run is killed during the train_for_eval phase.
        task_names_to_avoid_loading = [task.name for task in eval_tasks]
    else:
        task_names_to_avoid_loading = []

    if not args.load_eval_checkpoint == "none":
        log.info("Loading existing model from %s...", args.load_eval_checkpoint)
        load_model_state(model, args.load_eval_checkpoint, args.cuda,
                         task_names_to_avoid_loading, strict=strict)
    else:
        # Look for eval checkpoints (available only if we're restoring from a run
        # that already finished), then look for training checkpoints.
        eval_best = glob.glob(os.path.join(args.run_dir, "model_state_eval_best.th"))
        if len(eval_best) > 0:
            load_model_state(model, eval_best[0], args.cuda,
                             task_names_to_avoid_loading, strict=strict)
        else:
            macro_best = glob.glob(os.path.join(args.run_dir,
                                                "model_state_main_epoch_*.best_macro.th"))
            if len(macro_best) > 0:
                assert_for_log(len(macro_best) == 1,
                               "Too many best checkpoints. Something is wrong.")
                load_model_state(model, macro_best[0], args.cuda,
                                 task_names_to_avoid_loading, strict=strict)
            else:
                assert_for_log(args.allow_untrained_encoder_parameters,
                               "No best checkpoint found to evaluate.")
                log.warning("Evaluating untrained encoder parameters!")

    # Train just the task-specific components for eval tasks.
    if args.train_for_eval:
        # might be empty if no elmo. scalar_mix_0 should always be pretrain scalars
        elmo_scalars = [(n, p) for n, p in model.named_parameters()
                        if "scalar_mix" in n and "scalar_mix_0" not in n]
        # fails when sep_embs_for_skip is 0 and elmo_scalars has nonzero length
        assert_for_log(not elmo_scalars or args.sep_embs_for_skip,
                       "Error: ELMo scalars loaded and will be updated in train_for_eval but "
                       "they should not be updated! Check sep_embs_for_skip flag or make an issue.")
        for task in eval_tasks:
            # Skip mnli-diagnostic. It has to be handled differently from probing
            # tasks, which require "is_probing_task" to be True; for mnli-diagnostic
            # the flag is False because it is part of GLUE, and "is_probing_task"
            # is a global flag specific to a run, not to a task.
            if task.name == 'mnli-diagnostic':
                continue
            pred_module = getattr(model, "%s_mdl" % task.name)
            to_train = elmo_scalars + [(n, p) for n, p in pred_module.named_parameters()
                                       if p.requires_grad]
            # Look for <task_name>_<param_name>, then eval_<param_name>
            params = build_trainer_params(args, task_names=[task.name, 'eval'])
            trainer, _, opt_params, schd_params = build_trainer(params, model,
                                                                args.run_dir,
                                                                task.val_metric_decreases)
            best_epoch = trainer.train([task], task.val_metric, args.batch_size, 1,
                                       args.weighting_method, args.scaling_method,
                                       to_train, opt_params, schd_params,
                                       args.shared_optimizer, load_model=False,
                                       phase="eval")

            # Now that we've trained a model, revert to the normal checkpoint
            # logic for this task.
            task_names_to_avoid_loading.remove(task.name)

            # The best checkpoint will accumulate the best parameters for each task.
            # This logic looks strange. We think it works.
            best_epoch = best_epoch[task.name]
            layer_path = os.path.join(args.run_dir, "model_state_eval_best.th")
            load_model_state(model, layer_path, args.cuda,
                             skip_task_models=task_names_to_avoid_loading,
                             strict=strict)

    if args.do_eval:
        # Evaluate #
        log.info("Evaluating...")
        val_results, val_preds = evaluate.evaluate(model, eval_tasks,
                                                   args.batch_size, args.cuda, "val")
        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        if 'val' in splits_to_write:
            evaluate.write_preds(eval_tasks, val_preds, args.run_dir, 'val',
                                 strict_glue_format=args.write_strict_glue_format)
        if 'test' in splits_to_write:
            _, te_preds = evaluate.evaluate(model, eval_tasks,
                                            args.batch_size, args.cuda, "test")
            evaluate.write_preds(tasks, te_preds, args.run_dir, 'test',
                                 strict_glue_format=args.write_strict_glue_format)

        run_name = args.get("run_name", os.path.basename(args.run_dir))
        results_tsv = os.path.join(args.exp_dir, "results.tsv")
        log.info("Writing results for split 'val' to %s", results_tsv)
        evaluate.write_results(val_results, results_tsv, run_name=run_name)

    log.info("Done!")
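
# Hedged sketch (helper name assumed, not from the original source): the
# 'macro_avg' stop metric used above is, conceptually, an unweighted mean
# of each task's validation score.
def macro_avg(val_metrics):
    # val_metrics: dict mapping task name -> scalar validation score
    return sum(val_metrics.values()) / len(val_metrics)

# e.g. macro_avg({'sst': 0.91, 'mrpc': 0.84}) == 0.875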
def main(cl_arguments):
    ''' Train or load a model. Evaluate on some tasks. '''
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)
    # Raise error if obsolete arg names are present
    check_arg_name(args)

    # Logistics #
    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)      # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)      # e.g. <project_dir>/jiant-demo/sst
    log.getLogger().addHandler(log.FileHandler(args.local_log_path))

    if cl_args.remote_log:
        from src.utils import gcp
        gcp.configure_remote_logging(args.remote_log_name)

    if cl_args.notify:
        from src.utils import emails
        global EMAIL_NOTIFIER
        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)

    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")

    _try_logging_git_info()

    log.info("Parsed args: \n%s", args)

    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)
    log.info("Saved config to %s", config_file)

    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if args.cuda >= 0:
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected"
                                       " by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            log.warning("GPU access failed. You might be using a CPU-only"
                        " installation of PyTorch. Falling back to CPU.")
            args.cuda = -1

    # Prepare data #
    log.info("Loading tasks...")
    start_time = time.time()
    pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args)
    if any(t.val_metric_decreases for t in pretrain_tasks) and \
            any(not t.val_metric_decreases for t in pretrain_tasks):
        log.warning("\tMixing training tasks with increasing and decreasing val metrics!")
    tasks = sorted(set(pretrain_tasks + target_tasks), key=lambda x: x.name)
    log.info('\tFinished loading tasks in %.3fs', time.time() - start_time)
    log.info('\t Tasks: {}'.format([task.name for task in tasks]))

    # Build model #
    log.info('Building model...')
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks)
    log.info('\tFinished building model in %.3fs', time.time() - start_time)

    # Check that necessary parameters are set for each step.
    # Exit with error if not.
    steps_log = []

    if not args.load_eval_checkpoint == 'none':
        assert_for_log(os.path.exists(args.load_eval_checkpoint),
                       "Error: Attempting to load model from non-existent path: [%s]" %
                       args.load_eval_checkpoint)
        assert_for_log(not args.do_pretrain,
                       "Error: Attempting to train a model and then replace that model"
                       " with one from a checkpoint.")
        steps_log.append("Loading model from path: %s" % args.load_eval_checkpoint)

    assert_for_log(args.transfer_paradigm in ["finetune", "frozen"],
                   "Transfer paradigm %s not supported!" % args.transfer_paradigm)

    if args.do_pretrain:
        assert_for_log(args.pretrain_tasks != "none",
                       "Error: Must specify at least one training task: [%s]" %
                       args.pretrain_tasks)
        assert_for_log(args.val_interval % args.bpp_base == 0,
                       "Error: val_interval [%d] must be divisible by bpp_base [%d]" %
                       (args.val_interval, args.bpp_base))
        steps_log.append("Training model on tasks: %s" % args.pretrain_tasks)

    if args.do_target_task_training:
        steps_log.append("Re-training model for individual eval tasks")
        assert_for_log(args.eval_val_interval % args.bpp_base == 0,
                       "Error: eval_val_interval [%d] must be divisible by bpp_base [%d]" %
                       (args.eval_val_interval, args.bpp_base))
        assert_for_log(len(set(pretrain_tasks).intersection(target_tasks)) == 0 or
                       args.allow_reuse_of_pretraining_parameters or
                       args.do_pretrain == 0,
                       "If you're pretraining on a task you plan to reuse as a target task, set\n"
                       "allow_reuse_of_pretraining_parameters = 1 (risky), or train in two steps:\n"
                       " train with do_pretrain = 1, do_target_task_training = 0, stop, and restart with\n"
                       " do_pretrain = 0 and do_target_task_training = 1.")

    if args.do_full_eval:
        assert_for_log(args.target_tasks != "none",
                       "Error: Must specify at least one eval task: [%s]" % args.target_tasks)
        steps_log.append("Evaluating model on tasks: %s" % args.target_tasks)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard")
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    log.info("Will run the following steps:\n%s", '\n'.join(steps_log))

    if args.do_pretrain:
        # Train on train tasks #
        log.info("Training...")
        stop_metric = pretrain_tasks[0].val_metric if len(pretrain_tasks) == 1 else 'macro_avg'
        should_decrease = pretrain_tasks[0].val_metric_decreases if len(pretrain_tasks) == 1 else False
        trainer, _, opt_params, schd_params = build_trainer(args, [], model,
                                                            args.run_dir, should_decrease)
        to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        _ = trainer.train(pretrain_tasks, stop_metric, args.batch_size,
                          args.bpp_base, args.weighting_method, args.scaling_method,
                          to_train, opt_params, schd_params, args.shared_optimizer,
                          args.load_model, phase="main")

    # Select model checkpoint from main training run to load
    if not args.do_target_task_training:
        log.info("In strict mode because do_target_task_training is off. "
                 "Will crash if any tasks are missing from the checkpoint.")
        strict = True
    else:
        strict = False

    if args.do_target_task_training and not args.allow_reuse_of_pretraining_parameters:
        # If we're training models for evaluation, which is always done from scratch
        # with a fresh optimizer, we shouldn't load parameters for those models.
        # Usually there won't be trained parameters to skip, but this can happen
        # if a run is killed during the do_target_task_training phase.
        task_names_to_avoid_loading = [task.name for task in target_tasks]
    else:
        task_names_to_avoid_loading = []

    if not args.load_eval_checkpoint == "none":
        # This is to load a particular eval checkpoint.
        log.info("Loading existing model from %s...", args.load_eval_checkpoint)
        load_model_state(model, args.load_eval_checkpoint, args.cuda,
                         task_names_to_avoid_loading, strict=strict)
    else:
        # Look for eval checkpoints (available only if we're restoring from a run
        # that already finished), then look for training checkpoints.
        if args.transfer_paradigm == "finetune":
            # Save the model so we have a checkpoint to go back to after each
            # task-specific finetune.
            model_state = model.state_dict()
            model_path = os.path.join(args.run_dir, "model_state_untrained_prefinetune.th")
            torch.save(model_state, model_path)

        best_path = get_best_checkpoint_path(args.run_dir)
        if best_path:
            load_model_state(model, best_path, args.cuda,
                             task_names_to_avoid_loading, strict=strict)
        else:
            assert_for_log(args.allow_untrained_encoder_parameters,
                           "No best checkpoint found to evaluate.")
            log.warning("Evaluating untrained encoder parameters!")

    # Train just the task-specific components for eval tasks.
    if args.do_target_task_training:
        if args.transfer_paradigm == "frozen":
            # might be empty if elmo = 0. scalar_mix_0 should always be pretrain scalars
            elmo_scalars = [(n, p) for n, p in model.named_parameters()
                            if "scalar_mix" in n and "scalar_mix_0" not in n]
            # Fails when sep_embs_for_skip is 0 and elmo_scalars has nonzero length.
            assert_for_log(not elmo_scalars or args.sep_embs_for_skip,
                           "Error: ELMo scalars loaded and will be updated in do_target_task_training but "
                           "they should not be updated! Check sep_embs_for_skip flag or make an issue.")
        for task in target_tasks:
            # Skip mnli-diagnostic. It has to be handled differently from probing
            # tasks, which require "is_probing_task" to be True; for mnli-diagnostic
            # the flag is False because it is part of GLUE, and "is_probing_task"
            # is a global flag specific to a run, not to a task.
            if task.name == 'mnli-diagnostic':
                continue

            if args.transfer_paradigm == "finetune":
                # Train both the task-specific models and the sentence encoder.
                to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
            else:  # args.transfer_paradigm == "frozen"
                # Only train the task-specific module.
                pred_module = getattr(model, "%s_mdl" % task.name)
                to_train = [(n, p) for n, p in pred_module.named_parameters()
                            if p.requires_grad]
                to_train += elmo_scalars

            # Look for <task_name>_<param_name>, then eval_<param_name>
            trainer, _, opt_params, schd_params = build_trainer(args, [task.name, 'eval'],
                                                                model, args.run_dir,
                                                                task.val_metric_decreases)
            _ = trainer.train(tasks=[task], stop_metric=task.val_metric,
                              batch_size=args.batch_size, n_batches_per_pass=1,
                              weighting_method=args.weighting_method,
                              scaling_method=args.scaling_method,
                              train_params=to_train, optimizer_params=opt_params,
                              scheduler_params=schd_params,
                              shared_optimizer=args.shared_optimizer,
                              load_model=False, phase="eval")

            # Now that we've trained a model, revert to the normal checkpoint
            # logic for this task.
            if task.name in task_names_to_avoid_loading:
                task_names_to_avoid_loading.remove(task.name)

            # The best checkpoint will accumulate the best parameters for each task.
            # This logic looks strange. We think it works.
            layer_path = os.path.join(args.run_dir, "model_state_eval_best.th")
            if args.transfer_paradigm == "finetune":
                # If we fine-tune, save this model with a task-specific name.
                finetune_path = os.path.join(args.run_dir,
                                             "model_state_%s_best.th" % task.name)
                os.rename(layer_path, finetune_path)

                # Reload the original best model from before target-task training.
                pre_finetune_path = get_best_checkpoint_path(args.run_dir)
                load_model_state(model, pre_finetune_path, args.cuda,
                                 skip_task_models=[], strict=strict)
            else:  # args.transfer_paradigm == "frozen"
                # Load the current overall best model, and save the best
                # checkpoint from this target-task training as specific to
                # that target task.
                load_model_state(model, layer_path, args.cuda, strict=strict,
                                 skip_task_models=task_names_to_avoid_loading)

    if args.do_full_eval:
        # Evaluate #
        log.info("Evaluating...")
        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        if args.transfer_paradigm == "finetune":
            for task in target_tasks:
                if task.name == 'mnli-diagnostic':
                    # we'll load mnli-diagnostic during mnli
                    continue
                finetune_path = os.path.join(args.run_dir,
                                             "model_state_%s_best.th" % task.name)
                if os.path.exists(finetune_path):
                    ckpt_path = finetune_path
                else:
                    ckpt_path = get_best_checkpoint_path(args.run_dir)
                load_model_state(model, ckpt_path, args.cuda,
                                 skip_task_models=[], strict=strict)
                tasks = [task]
                if task.name == 'mnli':
                    tasks += [t for t in target_tasks if t.name == 'mnli-diagnostic']
                evaluate_and_write(args, model, tasks, splits_to_write)
        elif args.transfer_paradigm == "frozen":
            evaluate_and_write(args, model, target_tasks, splits_to_write)

    log.info("Done!")
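
# Hedged illustration (helper name and state-dict key layout are assumptions,
# not from the original source): the "best checkpoint accumulates the best
# parameters for each task" pattern above amounts to merging each newly
# trained task head, keyed by its "<task>_mdl." prefix, into a running best
# state dict while leaving all other parameters untouched.
def merge_task_head(best_state, task_state, task_name):
    prefix = "%s_mdl." % task_name
    for key, tensor in task_state.items():
        if key.startswith(prefix):
            best_state[key] = tensor  # overwrite only this task's parameters
    return best_state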
# check parameters
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, generator, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, generator, discriminator, params)
evaluator = Evaluator(trainer)

# Learning loop for Adversarial Training
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...', n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': [], 'MAP_COSTS': []}
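
# Hedged sketch of how such an adversarial epoch typically continues (the
# trainer method names below are assumptions, not confirmed by this snippet):
# alternate `dis_steps` discriminator updates with one mapping update,
# accumulating both costs in the `stats` dict initialized above.
#
#     for n_iter in range(0, params.epoch_size, params.batch_size):
#         for _ in range(params.dis_steps):
#             trainer.dis_step(stats)                  # discriminator update
#         n_words_proc += trainer.mapping_step(stats)  # mapping (generator) update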
def train(config_path, model_path, model_type, src_filename, trg_filename):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = True

    print(config_path)
    print(model_path)
    print(model_type)

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(config_path)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=src_filename,
                        vocabulary=vocab_src,
                        max_len=100,
                        is_train_dataset=False),
        TextLineDataset(data_path=trg_filename,
                        vocabulary=vocab_tgt,
                        max_len=100,
                        is_train_dataset=False))

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=20,
                                  use_bucket=training_configs['use_bucket'],
                                  buffer_size=training_configs['buffer_size'],
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to GPU if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 0. Initial

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, model_path, device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Prepare training
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    eidx = 0
    uidx = 0
    score_result = dict()

    # Build iterator and progress bar
    training_iter = valid_iterator.build_generator()
    training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                                 total=len(valid_iterator),
                                 unit="sents")

    for batch in training_iter:
        seqs_numbers, seqs_x, seqs_y = batch
        batch_size = len(seqs_x)
        cum_n_words += sum(len(s) for s in seqs_y)

        try:
            # Prepare data
            x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)
            y_inp = y[:, :-1].contiguous()
            y_label = y[:, 1:].contiguous()  # [batch_size, seq_len]

            log_probs = nmt_model(x, y_inp, log_probs=True)  # [batch_size, seq_len, vocab_size]
            _, seq_len = y_label.shape

            # Per-token NLL, summed per sentence and normalized by the number
            # of non-pad tokens. (reduction='none' replaces the deprecated
            # reduce=False spelling.)
            log_probs = log_probs.view(-1, vocab_tgt.max_n_words)
            y_label = y_label.view(-1)
            loss = F.nll_loss(log_probs, y_label, reduction='none',
                              ignore_index=vocab_tgt.pad)
            loss = loss.view(batch_size, seq_len)
            loss = loss.sum(-1)

            y_label = y_label.view(batch_size, seq_len)
            valid_token = (y_label != vocab_tgt.pad).sum(-1)
            loss = loss.double().div(valid_token.double())

            for seq_num, l in zip(seqs_numbers, loss):
                assert seq_num not in score_result
                score_result.update({seq_num: l.item()})

            uidx += 1
            grad_denom += batch_size
        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
            else:
                raise e

        if training_progress_bar is not None:
            training_progress_bar.update(batch_size)
            training_progress_bar.set_description(' - (Epc {}, Upd {}) '.format(eidx, uidx))
            postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                train_loss, valid_loss, best_valid_loss)
            training_progress_bar.set_postfix_str(postfix_str)

    training_progress_bar.close()

    return score_result
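
# Hedged, self-contained illustration of the per-sentence scoring done in the
# loop above (the function name and shapes are illustrative assumptions):
# sum token-level NLL over each sequence, then divide by its non-pad length.
import torch
import torch.nn.functional as F

def per_sentence_nll(log_probs, y_label, pad_idx):
    # log_probs: [batch, seq_len, vocab] log-probabilities; y_label: [batch, seq_len]
    batch, seq_len, vocab = log_probs.shape
    loss = F.nll_loss(log_probs.view(-1, vocab), y_label.view(-1),
                      reduction='none', ignore_index=pad_idx)
    loss = loss.view(batch, seq_len).sum(-1)  # total NLL per sentence
    n_valid = (y_label != pad_idx).sum(-1)    # non-pad token count per sentence
    return loss.double() / n_valid.double()   # length-normalized NLL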
params = parser.parse_args()

# check parameters
assert not params.cuda or torch.cuda.is_available()
assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train)
assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.export in ["", "txt", "pth"]

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_training_dico(params.dico_train)

"""
Learning loop for Procrustes Iterative Learning
"""
for n_iter in range(params.n_refinement + 1):

    logger.info('Starting iteration %i...' % n_iter)

    # build a dictionary from aligned embeddings (unless
    # it is the first iteration and we use the init one)
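
# Hedged sketch of the orthogonal-Procrustes step that drives such a
# refinement loop (illustrative, not the Trainer's exact code): given paired
# source/target embeddings A, B of shape [n, dim] from the current
# dictionary, the orthogonal map W minimizing ||A W^T - B|| is U V^T,
# where U S V^T is the SVD of B^T A.
#
#     M = B.t().mm(A)          # [dim, dim]
#     U, S, V = torch.svd(M)
#     W = U.mm(V.t())          # orthogonal mapping applied to source vectors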
params = parser.parse_args()

# check parameters
assert not params.cuda or torch.cuda.is_available()
assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train)
assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping = build_model(params)
trainer = Trainer(src_emb, tgt_emb, mapping, params)
evaluator = Evaluator(trainer)

# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_training_dico(params.dico_train)

# define the validation metric
VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' \
    else VALIDATION_METRIC_SUP
logger.info("Validation metric: %s" % VALIDATION_METRIC)

"""
Learning loop for crosslingual training
"""
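
# Hedged sketch of the "S2T" direction of dico_build referenced above
# (illustrative only; the function name is an assumption, and real
# implementations often use CSLS-style rescoring rather than raw cosine):
# pair each mapped source vector with its cosine nearest target vector.
import torch

def build_s2t_pairs(src_emb, tgt_emb):
    src = src_emb / src_emb.norm(2, 1, keepdim=True).clamp(min=1e-8)
    tgt = tgt_emb / tgt_emb.norm(2, 1, keepdim=True).clamp(min=1e-8)
    scores = src.mm(tgt.t())      # [n_src, n_tgt] cosine similarities
    best = scores.max(1)[1]       # nearest target id for each source word
    return torch.stack([torch.arange(len(src)), best], 1)  # [n_src, 2] id pairs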