def process(self, corpora, output_path, test_data_path=None, dev_data_path=None, log=None):
    """Run the Java preprocessing main over *corpora*, writing results under *output_path*.

    Optionally passes dev/test corpus paths through to the Java tool. Returns
    the list of bilingual corpora found in *output_path* after processing.
    """
    log = osutils.DEVNULL if log is None else log

    cli_args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
    # De-duplicate corpus root folders before listing them as inputs.
    cli_args.extend({corpus.get_folder() for corpus in corpora})

    if dev_data_path is not None:
        cli_args.extend(['--dev', dev_data_path])
    if test_data_path is not None:
        cli_args.extend(['--test', test_data_path])

    command = mmt_javamain(self._java_main, cli_args)
    osutils.shell_exec(command, stdout=log, stderr=log)

    return BilingualCorpus.list(self._source_lang, self._target_lang, output_path)
def build(self, corpora, log=None):
    """Build the model from *corpora* by invoking the external build binary.

    The model directory is recreated from scratch on every call. All corpora
    must live in a single shared folder.

    :param corpora: corpus objects exposing ``get_folder()``
    :param log: writable stream for the tool's stdout/stderr (defaults to devnull)
    :raises ValueError: if the corpora do not share exactly one source folder
    """
    if log is None:
        log = osutils.DEVNULL

    # Start from a clean model directory.
    shutil.rmtree(self._model, ignore_errors=True)
    osutils.makedirs(self._model, exist_ok=True)

    source_paths = {corpus.get_folder() for corpus in corpora}
    # Was 'assert len(...) == 1': asserts are stripped under 'python -O',
    # which would silently pop an arbitrary folder — validate explicitly.
    if len(source_paths) != 1:
        raise ValueError('corpora must share a single source folder, found %d: %s'
                         % (len(source_paths), sorted(source_paths)))
    source_path = source_paths.pop()

    command = [self._build_bin,
               '-s', self._source_lang, '-t', self._target_lang,
               '-i', source_path, '-m', self._model, '-I', '4']
    osutils.shell_exec(command, stdout=log, stderr=log)
def calculate(self, document, reference):
    """Score *document* against *reference* with charcut.py; return 1 - CharCut.

    The candidate document is streamed to the script via stdin ('/dev/stdin').
    """
    charcut_script = os.path.join(cli.PYOPT_DIR, 'charcut.py')
    cmd = ['python', charcut_script, '-c', '/dev/stdin', '-r', reference]

    with open(document) as candidate_stream:
        output, _ = osutils.shell_exec(cmd, stdin=candidate_stream)

    return 1.0 - float(output)
def calculate(self, document, reference):
    """Score *document* against *reference* with mmt-bleu.perl; return the BLEU value.

    The candidate document is fed to the Perl script on stdin.
    """
    bleu_script = os.path.join(cli.PYOPT_DIR, 'mmt-bleu.perl')
    cmd = ['perl', bleu_script, reference]

    with open(document) as candidate_stream:
        output, _ = osutils.shell_exec(cmd, stdin=candidate_stream)

    return float(output)
def train_model(self, train_dir, output_dir, batch_size=1024, n_train_steps=None, n_eval_steps=1000,
                hparams='transformer_base', log=None, fromModel=None):
    """Launch a t2t-trainer run on the data prepared in *train_dir*, writing to *output_dir*.

    :param train_dir: directory containing a 'data' sub-folder produced by data preparation
    :param output_dir: checkpoint/output directory for the trainer (created if missing)
    :param batch_size: passed to the trainer as the 'batch_size' hparam
    :param n_train_steps: max training steps; None means effectively unbounded (100000000)
    :param n_eval_steps: trainer's local_eval_frequency
    :param hparams: t2t hparams set name
    :param log: writable stream for trainer stdout/stderr (defaults to devnull)
    :param fromModel: optional path to an existing checkpoint to warm-start from
    :raises ShellError: if the trainer exits with a non-zero return code
    """
    if log is None:
        log = osutils.DEVNULL
    if not os.path.isdir(output_dir):
        osutils.makedirs(output_dir)

    # if an existing checkpoint is loaded for starting the training (i.e fromModel != None)
    # copy the checkpoint files into the right location
    if fromModel is not None:
        self._copy_and_fix_model(fromModel, output_dir, gpus=self._gpus)

    data_dir = os.path.join(train_dir, 'data')
    src_model_vocab = os.path.join(data_dir, 'model.vcb')
    tgt_model_vocab = os.path.join(output_dir, 'model.vcb')
    # Expose the subtoken vocabulary inside output_dir via a symlink so the
    # trainer finds it next to the checkpoints; skipped if already present.
    if not os.path.isfile(tgt_model_vocab):
        os.symlink(src_model_vocab, tgt_model_vocab)

    env = self._get_env()
    hparams_p = 'batch_size=%d' % batch_size

    command = ['t2t-trainer', '--t2t_usr_dir', self._t2t_dir, '--data_dir=%s' % data_dir,
               '--problem=translate_mmt', '--model=transformer', '--hparams_set=%s' % hparams,
               '--output_dir=%s' % output_dir, '--local_eval_frequency=%d' % n_eval_steps,
               '--train_steps=%d' % (n_train_steps if n_train_steps is not None else 100000000),
               '--worker_gpu=%d' % len(self._gpus), '--hparams', hparams_p]

    # Run in the background so we can kill the trainer on Ctrl-C.
    process = osutils.shell_exec(command, stdout=log, stderr=log, env=env, background=True)

    try:
        return_code = process.wait()
        if return_code != 0:
            raise ShellError(' '.join(command), return_code, None)
    except KeyboardInterrupt:
        # NOTE(review): the interrupt is swallowed here — the child is killed
        # but the method returns normally, so callers cannot distinguish an
        # interrupted run from a completed one. Confirm this is intentional.
        process.kill()
def process_file(self, input_path, output_path, lang):
    """Preprocess *input_path* in the given language, writing the result to *output_path*.

    *lang* selects the translation direction and must be either the configured
    source or target language.

    :raises ValueError: if *lang* is neither the source nor the target language
    """
    if lang == self._source_lang:
        src, tgt = self._source_lang, self._target_lang
    elif lang == self._target_lang:
        src, tgt = self._target_lang, self._source_lang
    else:
        raise ValueError('Unsupported language "%s"' % lang)

    cli_args = ['-s', src, '-t', tgt]
    if not self._print_tags:
        cli_args.append('--no-tags')
    if self._print_placeholders:
        cli_args.append('--print-placeholders')

    command = mmt_javamain(self._java_main, args=cli_args)

    with open(input_path) as input_stream, open(output_path, 'w') as output_stream:
        osutils.shell_exec(command, stdin=input_stream, stdout=output_stream)
def prepare_data(self, train_corpora, eval_corpora, output_path, log=None, bpe_symbols=2**15, fromModel=None):
    """Run t2t-datagen to turn the given corpora into training data under *output_path*.

    Creates 'data' and 'tmp' sub-folders ('data' is wiped first). When
    *fromModel* is given, the existing checkpoint's subtoken vocabulary is
    reused instead of building a new one from the training data.
    """
    log = osutils.DEVNULL if log is None else log

    data_dir = os.path.join(output_path, 'data')
    tmp_dir = os.path.join(output_path, 'tmp')

    train_path = self._get_common_root(train_corpora)
    eval_path = self._get_common_root(eval_corpora)

    shutil.rmtree(data_dir, ignore_errors=True)
    osutils.makedirs(data_dir)

    # Warm-start case: copy the checkpoint's subtoken vocabulary into data_dir
    # so it is not re-created from the new training data and is only used to
    # bpe-fy the new corpora. Assumes the vocabulary file is named 'model.vcb'
    # and sits in the same directory as the checkpoint.
    if fromModel is not None:
        shutil.copyfile(os.path.join(fromModel, 'model.vcb'),
                        os.path.join(data_dir, 'model.vcb'))

    if not os.path.isdir(tmp_dir):
        osutils.makedirs(tmp_dir)

    env = self._get_env(train_path, eval_path, bpe=bpe_symbols)

    datagen_cmd = ['t2t-datagen', '--t2t_usr_dir', self._t2t_dir,
                   '--data_dir=%s' % data_dir, '--tmp_dir=%s' % tmp_dir,
                   '--problem=translate_mmt']
    osutils.shell_exec(datagen_cmd, stdout=log, stderr=log, env=env)
def clean(self, corpora, output_path, log=None):
    """Run the Java cleaning main over *corpora*, writing cleaned output to *output_path*.

    Returns the list of bilingual corpora found in *output_path* afterwards.
    """
    log = osutils.DEVNULL if log is None else log

    cli_args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
    # De-duplicate corpus root folders before listing them as inputs.
    cli_args.extend({corpus.get_folder() for corpus in corpora})

    # Give the JVM up to 90% of the machine's memory for this step.
    extended_heap_mb = int(osutils.mem_size() * 90 / 100)

    command = mmt_javamain(self._java_main, args=cli_args, max_heap_mb=extended_heap_mb)
    osutils.shell_exec(command, stdout=log, stderr=log)

    return BilingualCorpus.list(self._source_lang, self._target_lang, output_path)