def train(self, config, src_file, tgt_file, src_vocab_info, tgt_vocab_info,
          align_file=None, model_path=None, gpuid=0):
    """Run a training session and return the produced model files.

    Remaps the checkpoint first when either vocabulary changed, then builds
    an OpenNMT-tf run configuration from ``config['options']`` and trains.
    If training wrote its final checkpoint to a different directory (e.g.
    after checkpoint averaging), the model description is copied there so
    the returned files are self-contained.
    """
    options = config['options']
    if src_vocab_info['changed'] or tgt_vocab_info['changed']:
        # Remap the existing checkpoint onto the updated vocabularies
        # before training starts.
        model_path = checkpoint.update_vocab(
            model_path,
            os.path.join(self._output_dir, 'new_vocab_checkpoint'),
            src_vocab_info['model'],
            tgt_vocab_info['model'],
            new_src_vocab=src_vocab_info['current'] if src_vocab_info['changed'] else None,
            new_tgt_vocab=tgt_vocab_info['current'] if tgt_vocab_info['changed'] else None,
            mode='replace',
            # The vocabulary update is forced onto CPU.
            session_config=tf.ConfigProto(device_count={'GPU': 0}))
    model_dir, model = self._load_model(
        model_type=options.get('model_type'),
        model_file=options.get('model'),
        model_path=model_path)
    run_config = copy.deepcopy(options.get('config', {}))
    run_config['model_dir'] = model_dir
    data_config = run_config.setdefault('data', {})
    train_options = run_config.setdefault('train', {})
    data_config['source_words_vocabulary'] = src_vocab_info['current']
    data_config['target_words_vocabulary'] = tgt_vocab_info['current']
    data_config['train_features_file'] = src_file
    data_config['train_labels_file'] = tgt_file
    if align_file is not None and os.path.exists(align_file):
        data_config['train_alignments'] = align_file
        # Alignments enable guided alignment; default to cross-entropy.
        params = run_config.setdefault('params', {})
        params.setdefault('guided_alignment_type', 'ce')
    if 'train_steps' not in train_options:
        # No explicit step budget: make a single pass over the data.
        train_options['single_pass'] = True
        train_options['train_steps'] = None
    train_options.setdefault('sample_buffer_size', -1)
    train_options.setdefault('average_last_checkpoints', 0)
    runner = onmt.Runner(
        model,
        run_config,
        num_devices=utils.count_devices(gpuid),
        auto_config=options.get('auto_config', False))
    output_dir = runner.train()
    if output_dir != model_dir:
        # Keep the model description next to the checkpoint we return.
        shutil.copy(os.path.join(model_dir, "model_description.py"), output_dir)
    return self._list_model_files(output_dir)
def _make_predict_runner(self, config, model_path):
    """Load the checkpoint and return an OpenNMT-tf runner for inference."""
    model_dir, model = self._load_model(model_path=model_path)
    run_config = copy.deepcopy(config['options']['config'])
    run_config['model_dir'] = model_dir
    # Every value of the data section is passed through vocabulary
    # conversion before handing the config to the runner.
    run_config['data'] = {
        name: self._convert_vocab(value)
        for name, value in six.iteritems(run_config['data'])
    }
    return onmt.Runner(model, run_config)
def trans(self, config, model_path, input, output, gpuid=0):
    """Translate the `input` file and write predictions to `output`."""
    model_dir, model = self._load_model(model_path=model_path)
    run_config = copy.deepcopy(config['options']['config'])
    run_config['model_dir'] = model_dir
    # Convert every vocabulary entry of the data section in one pass.
    run_config['data'] = {
        name: self._convert_vocab(value)
        for name, value in six.iteritems(run_config['data'])
    }
    # NOTE(review): `gpuid` is accepted for interface compatibility but is
    # not used by this variant.
    onmt.Runner(model, run_config).infer(input, predictions_file=output)
def init_model_and_runner(config, d, **kwargs):
    """Build a Transformer with `d`-dimensional word embeddings on both
    sides and wrap it in an auto-configured OpenNMT-tf runner.
    """
    source_inputter = opennmt.inputters.WordEmbedder(embedding_size=d)
    target_inputter = opennmt.inputters.WordEmbedder(embedding_size=d)
    model = opennmt.models.Transformer(
        source_inputter, target_inputter, **kwargs)
    return opennmt.Runner(model, config, auto_config=True)
def minimal_transformer_training_example(run_type='train'):
    """Minimal OpenNMT-tf example: train or translate a toy en-de model.

    Args:
        run_type: Either 'train' or 'translate'. Defaults to 'train',
            which was previously hard-coded inside the function, so
            existing callers are unaffected.
    """
    train_features_filepath = './toy-ende/src-train.txt'     # Source training file.
    train_labels_filepath = './toy-ende/tgt-train.txt'       # Target training file.
    eval_features_filepath = './toy-ende/src-val.txt'        # Source validation file.
    eval_labels_filepath = './toy-ende/tgt-val.txt'          # Target validation file.
    source_vocabulary_filepath = './toy-ende/src-vocab.txt'  # Source vocabulary.
    target_vocabulary_filepath = './toy-ende/tgt-vocab.txt'  # Target vocabulary.
    model_dir_path = './checkpoint'  # Directory where checkpoints are written.
    # See http://opennmt.net/OpenNMT-tf/configuration.html for a complete
    # specification of the configuration.
    config = {
        'model_dir': model_dir_path,
        'data': {
            'source_vocabulary': source_vocabulary_filepath,
            'target_vocabulary': target_vocabulary_filepath,
            'train_features_file': train_features_filepath,
            'train_labels_file': train_labels_filepath,
            'eval_features_file': eval_features_filepath,
            'eval_labels_file': eval_labels_filepath,
        }
    }
    model = onmt.models.TransformerBase()
    runner = onmt.Runner(model, config, auto_config=True)
    if run_type == 'train':
        runner.train()
    elif run_type == 'translate':
        runner.infer(eval_features_filepath)
def _make_predict_runner(self, config, model_path):
    """Load the checkpoint and return a runner ready for inference."""
    options = config['options']
    model_dir, model = self._load_model(
        model_type=options.get('model_type'),
        model_file=options.get('model'),
        model_path=model_path)
    run_config = copy.deepcopy(options['config'])
    run_config['model_dir'] = model_dir
    # Make sure a data section exists, then let vocabulary registration
    # rewrite it.
    run_config['data'] = self._register_vocab(
        config, run_config.setdefault('data', {}))
    return onmt.Runner(model, run_config)
def _build_runner(
    self,
    config,
    src_vocab=None,
    tgt_vocab=None,
    src_file=None,
    tgt_file=None,
    align_file=None,
    example_weights_file=None,
    model_path=None,
):
    """Assemble a fresh model directory and return a configured runner.

    The temporary model directory is recreated from scratch, any existing
    checkpoint files are copied in, vocabularies are converted on demand,
    and the run configuration is derived from ``config["options"]``.
    """
    model_dir = os.path.join(self._output_dir, "model")
    # Always start from an empty model directory.
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)
    # Copy checkpoint files into the temporary model dir.
    if model_path is not None:
        for filename, path in _list_checkpoint_files(model_path).items():
            shutil.copy(path, os.path.join(model_dir, filename))
    # Prepare vocabularies if not already done.
    if src_vocab is None:
        src_vocab = self._convert_vocab(config["vocabulary"]["source"]["path"])
    if tgt_vocab is None:
        tgt_vocab = self._convert_vocab(config["vocabulary"]["target"]["path"])
    options = config["options"]
    run_config = _build_run_config(
        options.get("config"),
        model_dir,
        src_vocab,
        tgt_vocab,
        src_file=src_file,
        tgt_file=tgt_file,
        align_file=align_file,
        example_weights_file=example_weights_file,
    )
    model = opennmt.load_model(
        model_dir,
        model_file=options.get("model"),
        model_name=options.get("model_type"),
        as_builder=True,
    )
    return opennmt.Runner(
        model,
        run_config,
        auto_config=options.get("auto_config", False),
        mixed_precision=options.get("mixed_precision", False),
    )
def _build_runner(self, config, src_vocab=None, tgt_vocab=None, src_file=None,
                  tgt_file=None, align_file=None, example_weights_file=None,
                  model_path=None):
    """Recreate the model directory and return a configured runner.

    Same flow as the double-quoted variant, with GPU memory growth enabled
    via an environment variable before any TensorFlow work happens.
    """
    model_dir = os.path.join(self._output_dir, 'model')
    # Let TensorFlow grow GPU memory on demand instead of pre-allocating.
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    # Always start from an empty model directory.
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)
    # Copy checkpoint files into the temporary model dir.
    if model_path is not None:
        for filename, path in _list_checkpoint_files(model_path).items():
            shutil.copy(path, os.path.join(model_dir, filename))
    # Prepare vocabularies if not already done.
    if src_vocab is None:
        src_vocab = self._convert_vocab(config['vocabulary']['source']['path'])
    if tgt_vocab is None:
        tgt_vocab = self._convert_vocab(config['vocabulary']['target']['path'])
    options = config['options']
    run_config = _build_run_config(
        options.get('config'),
        model_dir,
        src_vocab,
        tgt_vocab,
        src_file=src_file,
        tgt_file=tgt_file,
        align_file=align_file,
        example_weights_file=example_weights_file)
    model = opennmt.load_model(
        model_dir,
        model_file=options.get('model'),
        model_name=options.get('model_type'),
        as_builder=True,
    )
    return opennmt.Runner(
        model,
        run_config,
        auto_config=options.get('auto_config', False),
        mixed_precision=options.get('mixed_precision', False),
    )
def train(self, config, src_file, tgt_file, src_vocab_info, tgt_vocab_info,
          model_path=None, gpuid=0):
    """Train on the given bitext and return the resulting model files.

    When a vocabulary changed, the checkpoint is remapped first; the run
    configuration is then derived from ``config['options']`` and training
    is launched. Files are listed from the model directory afterwards.
    """
    options = config['options']
    if src_vocab_info['changed'] or tgt_vocab_info['changed']:
        # Remap the checkpoint weights onto the updated vocabularies.
        model_path = checkpoint.update_vocab(
            model_path,
            os.path.join(self._output_dir, 'new_vocab_checkpoint'),
            src_vocab_info['model'],
            tgt_vocab_info['model'],
            new_src_vocab=src_vocab_info['current'] if src_vocab_info['changed'] else None,
            new_tgt_vocab=tgt_vocab_info['current'] if tgt_vocab_info['changed'] else None,
            mode='replace',
            # The vocabulary update is forced onto CPU.
            session_config=tf.ConfigProto(device_count={'GPU': 0}))
    model_dir, model = self._load_model(
        model_type=options.get('model_type'),
        model_file=options.get('model'),
        model_path=model_path)
    run_config = copy.deepcopy(options.get('config', {}))
    run_config['model_dir'] = model_dir
    data_config = run_config.setdefault('data', {})
    train_options = run_config.setdefault('train', {})
    data_config['source_words_vocabulary'] = src_vocab_info['current']
    data_config['target_words_vocabulary'] = tgt_vocab_info['current']
    data_config['train_features_file'] = src_file
    data_config['train_labels_file'] = tgt_file
    if 'train_steps' not in train_options:
        # No step budget given: do a single pass over the training data.
        train_options['single_pass'] = True
        train_options['train_steps'] = None
    train_options.setdefault('sample_buffer_size', -1)
    runner = onmt.Runner(
        model,
        run_config,
        num_devices=utils.count_devices(gpuid),
        auto_config=options.get('auto_config', False))
    runner.train()
    return self._list_model_files(model_dir)
def train(self, config, src_file, tgt_file, model_path=None, gpuid=0):
    """Train on the given source/target files and return the model files."""
    options = config['options']
    model_dir, model = self._load_model(
        model_type=options.get('model_type'),
        model_file=options.get('model'),
        model_path=model_path)
    run_config = copy.deepcopy(options['config'])
    run_config['model_dir'] = model_dir
    # Convert every vocabulary entry of the data section, then point the
    # training files at the inputs for this session.
    run_config['data'] = {
        name: self._convert_vocab(value)
        for name, value in six.iteritems(run_config['data'])
    }
    run_config['data']['train_features_file'] = src_file
    run_config['data']['train_labels_file'] = tgt_file
    train_options = run_config['train']
    if 'train_steps' not in train_options:
        # No explicit step budget: make a single pass over the data.
        train_options['single_pass'] = True
        train_options['train_steps'] = None
    train_options.setdefault('sample_buffer_size', -1)
    runner = onmt.Runner(
        model, run_config, num_devices=utils.count_devices(gpuid))
    runner.train()
    return self._list_model_files(model_dir)
def train(self, config, src_file, tgt_file, model_path=None, gpuid=0):
    """Train on the given source/target files and return the model files.

    NOTE(review): `gpuid` is accepted for interface compatibility but is
    not used by this variant.
    """
    model_dir, model = self._load_model(
        model_file=config['options']['model'],
        model_path=model_path)
    run_config = copy.deepcopy(config['options']['config'])
    run_config['model_dir'] = model_dir
    # Convert every vocabulary entry, then set this session's inputs.
    run_config['data'] = {
        name: self._convert_vocab(value)
        for name, value in six.iteritems(run_config['data'])
    }
    run_config['data']['train_features_file'] = src_file
    run_config['data']['train_labels_file'] = tgt_file
    if 'train_steps' not in run_config['train']:
        # No explicit step budget: make a single pass over the data.
        run_config['train']['single_pass'] = True
        run_config['train']['train_steps'] = None
    onmt.Runner(model, run_config).train()
    return self._list_model_files(model_dir)
def _build_runner(self, config, src_vocab=None, tgt_vocab=None, src_file=None,
                  tgt_file=None, align_file=None, model_path=None):
    """Recreate the model directory and return a configured runner.

    Vocabularies come from the tokenization section of the configuration
    when not supplied explicitly by the caller.
    """
    model_dir = os.path.join(self._output_dir, 'model')
    # Always start from an empty model directory.
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)
    # Copy checkpoint files into the temporary model dir.
    if model_path is not None:
        for filename, path in _list_checkpoint_files(model_path).items():
            shutil.copy(path, os.path.join(model_dir, filename))
    # Prepare vocabularies if not already done.
    if src_vocab is None:
        src_vocab = self._convert_vocab(
            config['tokenization']['source']['vocabulary'])
    if tgt_vocab is None:
        tgt_vocab = self._convert_vocab(
            config['tokenization']['target']['vocabulary'])
    options = config['options']
    run_config = _build_run_config(
        options.get('config'),
        model_dir,
        src_vocab,
        tgt_vocab,
        src_file=src_file,
        tgt_file=tgt_file,
        align_file=align_file)
    model = opennmt.load_model(
        model_dir,
        model_file=options.get('model'),
        model_name=options.get('model_type'),
        serialize_model=False)
    return opennmt.Runner(
        model, run_config, auto_config=options.get('auto_config', False))
def main():
    """Command-line entry point: train or translate with a base Transformer.

    Raises:
        SystemExit: via ``parser.error`` when ``--tgt`` is missing for a
            training run (previously this silently produced a ``None``
            label file and failed later inside the training loop).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("run", choices=["train", "translate"],
                        help="Run type.")
    parser.add_argument("--src", required=True,
                        help="Path to the source file.")
    parser.add_argument("--tgt",
                        help="Path to the target file.")
    parser.add_argument("--src_vocab", required=True,
                        help="Path to the source vocabulary.")
    parser.add_argument("--tgt_vocab", required=True,
                        help="Path to the target vocabulary.")
    parser.add_argument("--model_dir", default="checkpoint",
                        help="Directory where checkpoints are written.")
    args = parser.parse_args()
    # Training needs target labels; fail fast with a clear message instead
    # of passing None into the runner.
    if args.run == "train" and args.tgt is None:
        parser.error("--tgt is required when the run type is 'train'")
    # See http://opennmt.net/OpenNMT-tf/configuration.html for a complete
    # specification of the configuration.
    config = {
        "model_dir": args.model_dir,
        "data": {
            "source_vocabulary": args.src_vocab,
            "target_vocabulary": args.tgt_vocab,
            "train_features_file": args.src,
            "train_labels_file": args.tgt,
        }
    }
    model = onmt.models.TransformerBase()
    runner = onmt.Runner(model, config, auto_config=True)
    if args.run == "train":
        runner.train()
    elif args.run == "translate":
        runner.infer(args.src)
import opennmt
import seq_tagger_updated as SequenceTagger  # Used only by the disabled variant below.

# Run configuration for a two-source bitext sequence-tagging experiment.
config = {
    "model_dir": "run/",
    "data": {
        "train_features_file": "train_words_bitext.txt",
        "train_labels_file": "train_tags_bitext.txt",
        "eval_features_file": "valid_words_bitext.txt",
        "eval_labels_file": "valid_tags_bitext.txt",
        "source_1_vocabulary": "src-train-vocab.txt",
        "source_2_vocabulary": "src-train-tkt-vocab.txt",
        "target_vocabulary": "tgt-train-vocab.txt",
    },
}

# Alternative model definitions kept for reference (disabled):
# model = SequenceTagger.model()
# model = opennmt.models.SequenceTagger()
model = opennmt.models.catalog.LstmCnnCrfTagger()

runner = opennmt.Runner(model, config, auto_config=True)
runner.train(num_devices=2, with_eval=True)