Ejemplo n.º 1
0
    def process(self,
                corpora,
                output_path,
                test_data_path=None,
                dev_data_path=None,
                log=None):
        """Run the Java preprocessing main over the given corpora.

        :param corpora: iterable of corpus objects exposing get_folder()
        :param output_path: destination folder for the processed corpora
        :param test_data_path: optional folder passed as --test
        :param dev_data_path: optional folder passed as --dev
        :param log: file object for stdout/stderr of the subprocess;
                    defaults to osutils.DEVNULL (output discarded)
        :return: the processed corpora, as listed from output_path
        """
        if log is None:
            log = osutils.DEVNULL

        args = [
            '-s', self._source_lang, '-t', self._target_lang, '--output',
            output_path, '--input'
        ]

        # Pass each distinct parent folder once: several corpora may
        # share the same folder.
        args.extend({corpus.get_folder() for corpus in corpora})

        if dev_data_path is not None:
            args.extend(['--dev', dev_data_path])
        if test_data_path is not None:
            args.extend(['--test', test_data_path])

        command = mmt_javamain(self._java_main, args)
        osutils.shell_exec(command, stdout=log, stderr=log)

        return BilingualCorpus.list(self._source_lang, self._target_lang,
                                    output_path)
Ejemplo n.º 2
0
    def build(self, corpora, log=None):
        """Train the model binary from corpora sharing a single folder.

        The model directory is wiped and recreated before training.

        :param corpora: iterable of corpus objects exposing get_folder();
                        they must all live in the same folder
        :param log: file object for stdout/stderr of the subprocess;
                    defaults to osutils.DEVNULL (output discarded)
        :raises ValueError: if the corpora span more than one folder
        """
        if log is None:
            log = osutils.DEVNULL

        # Start from a clean model directory.
        shutil.rmtree(self._model, ignore_errors=True)
        osutils.makedirs(self._model, exist_ok=True)

        source_paths = {corpus.get_folder() for corpus in corpora}
        # Explicit check instead of "assert": asserts are stripped when
        # Python runs with -O, silently skipping this validation.
        if len(source_paths) != 1:
            raise ValueError(
                'all corpora must be in the same folder, found %d' %
                len(source_paths))
        source_path = source_paths.pop()

        command = [
            self._build_bin, '-s', self._source_lang, '-t', self._target_lang,
            '-i', source_path, '-m', self._model, '-I', '4'
        ]
        osutils.shell_exec(command, stdout=log, stderr=log)
Ejemplo n.º 3
0
    def calculate(self, document, reference):
        """Score *document* against *reference* with the CharCut metric.

        The charcut.py script reads the candidate from /dev/stdin (fed
        from *document*) and prints a distance; the returned score is
        1.0 minus that distance.
        """
        charcut_script = os.path.join(cli.PYOPT_DIR, 'charcut.py')
        charcut_cmd = ['python', charcut_script,
                       '-c', '/dev/stdin', '-r', reference]

        with open(document) as document_stream:
            output, _ = osutils.shell_exec(charcut_cmd,
                                           stdin=document_stream)

        return 1.0 - float(output)
Ejemplo n.º 4
0
    def calculate(self, document, reference):
        """Score *document* against *reference* with the mmt-bleu script.

        The perl script reads the candidate from stdin (fed from
        *document*) and prints the BLEU score, returned as a float.
        """
        bleu_script = os.path.join(cli.PYOPT_DIR, 'mmt-bleu.perl')
        bleu_cmd = ['perl', bleu_script, reference]

        with open(document) as document_stream:
            output, _ = osutils.shell_exec(bleu_cmd, stdin=document_stream)

        return float(output)
Ejemplo n.º 5
0
    def train_model(self,
                    train_dir,
                    output_dir,
                    batch_size=1024,
                    n_train_steps=None,
                    n_eval_steps=1000,
                    hparams='transformer_base',
                    log=None,
                    fromModel=None):
        """Launch a t2t-trainer run and wait for it to finish.

        :param train_dir: folder containing the prepared 'data' sub-dir
        :param output_dir: destination for checkpoints (created if missing)
        :param batch_size: value passed via --hparams batch_size
        :param n_train_steps: total train steps; None means effectively
                              unbounded (100000000)
        :param n_eval_steps: evaluation frequency (--local_eval_frequency)
        :param hparams: t2t hparams set name
        :param log: file object for trainer output; defaults to
                    osutils.DEVNULL (output discarded)
        :param fromModel: optional checkpoint dir to warm-start from
        :raises ShellError: if the trainer exits with a non-zero code
        """
        if log is None:
            log = osutils.DEVNULL

        if not os.path.isdir(output_dir):
            osutils.makedirs(output_dir)

        # Warm start: copy the existing checkpoint files into place first.
        if fromModel is not None:
            self._copy_and_fix_model(fromModel, output_dir, gpus=self._gpus)

        data_dir = os.path.join(train_dir, 'data')

        vocab_source = os.path.join(data_dir, 'model.vcb')
        vocab_target = os.path.join(output_dir, 'model.vcb')

        # Expose the training vocabulary inside the output dir via symlink.
        if not os.path.isfile(vocab_target):
            os.symlink(vocab_source, vocab_target)

        train_steps = 100000000 if n_train_steps is None else n_train_steps

        trainer_cmd = [
            't2t-trainer', '--t2t_usr_dir', self._t2t_dir,
            '--data_dir=%s' % data_dir, '--problem=translate_mmt',
            '--model=transformer',
            '--hparams_set=%s' % hparams,
            '--output_dir=%s' % output_dir,
            '--local_eval_frequency=%d' % n_eval_steps,
            '--train_steps=%d' % train_steps,
            '--worker_gpu=%d' % len(self._gpus),
            '--hparams', 'batch_size=%d' % batch_size
        ]

        trainer = osutils.shell_exec(trainer_cmd,
                                     stdout=log,
                                     stderr=log,
                                     env=self._get_env(),
                                     background=True)

        try:
            exit_code = trainer.wait()
            if exit_code != 0:
                raise ShellError(' '.join(trainer_cmd), exit_code, None)
        except KeyboardInterrupt:
            # On Ctrl-C make sure the background trainer does not outlive us.
            trainer.kill()
Ejemplo n.º 6
0
    def process_file(self, input_path, output_path, lang):
        """Run the Java main over a single file in the given language.

        *lang* must be either the source or the target language of this
        processor; it selects the (-s, -t) direction accordingly.

        :raises ValueError: if *lang* is neither language of the pair
        """
        if lang == self._source_lang:
            from_lang, to_lang = self._source_lang, self._target_lang
        elif lang == self._target_lang:
            from_lang, to_lang = self._target_lang, self._source_lang
        else:
            raise ValueError('Unsupported language "%s"' % lang)

        args = ['-s', from_lang, '-t', to_lang]

        if not self._print_tags:
            args.append('--no-tags')
        if self._print_placeholders:
            args.append('--print-placeholders')

        command = mmt_javamain(self._java_main, args=args)

        with open(input_path) as source_stream, \
                open(output_path, 'w') as target_stream:
            osutils.shell_exec(command,
                               stdin=source_stream,
                               stdout=target_stream)
Ejemplo n.º 7
0
    def prepare_data(self,
                     train_corpora,
                     eval_corpora,
                     output_path,
                     log=None,
                     bpe_symbols=2**15,
                     fromModel=None):
        """Run t2t-datagen to build the training data under *output_path*.

        :param train_corpora: corpora used for training (must share a root)
        :param eval_corpora: corpora used for evaluation (must share a root)
        :param output_path: folder receiving the 'data' and 'tmp' sub-dirs
        :param log: file object for datagen output; defaults to
                    osutils.DEVNULL (output discarded)
        :param bpe_symbols: BPE vocabulary size (default 2**15)
        :param fromModel: optional checkpoint dir whose vocabulary is reused
        """
        if log is None:
            log = osutils.DEVNULL

        data_dir = os.path.join(output_path, 'data')
        tmp_dir = os.path.join(output_path, 'tmp')

        train_path = self._get_common_root(train_corpora)
        eval_path = self._get_common_root(eval_corpora)

        # Always rebuild the data directory from scratch.
        shutil.rmtree(data_dir, ignore_errors=True)
        osutils.makedirs(data_dir)

        # When warm-starting from an existing checkpoint (fromModel is not
        # None), reuse its subtoken vocabulary instead of rebuilding it from
        # the new training data; it is then only used to BPE-encode the new
        # corpora. The vocabulary is assumed to be named "model.vcb" and to
        # sit in the same directory as the checkpoint.
        if fromModel is not None:
            shutil.copyfile(os.path.join(fromModel, 'model.vcb'),
                            os.path.join(data_dir, 'model.vcb'))

        if not os.path.isdir(tmp_dir):
            osutils.makedirs(tmp_dir)

        datagen_env = self._get_env(train_path, eval_path, bpe=bpe_symbols)
        datagen_cmd = [
            't2t-datagen', '--t2t_usr_dir', self._t2t_dir,
            '--data_dir=%s' % data_dir,
            '--tmp_dir=%s' % tmp_dir, '--problem=translate_mmt'
        ]

        osutils.shell_exec(datagen_cmd, stdout=log, stderr=log,
                           env=datagen_env)
Ejemplo n.º 8
0
    def clean(self, corpora, output_path, log=None):
        """Run the Java cleaning main over the given corpora.

        :param corpora: iterable of corpus objects exposing get_folder()
        :param output_path: destination folder for the cleaned corpora
        :param log: file object for stdout/stderr of the subprocess;
                    defaults to osutils.DEVNULL (output discarded)
        :return: the cleaned corpora, as listed from output_path
        """
        if log is None:
            log = osutils.DEVNULL

        args = [
            '-s', self._source_lang, '-t', self._target_lang, '--output',
            output_path, '--input'
        ]

        # Pass each distinct parent folder once: several corpora may
        # share the same folder.
        args.extend({corpus.get_folder() for corpus in corpora})

        # Grant the cleaner an extended heap: 90% of total system memory.
        extended_heap_mb = int(osutils.mem_size() * 90 / 100)

        command = mmt_javamain(self._java_main,
                               args=args,
                               max_heap_mb=extended_heap_mb)
        osutils.shell_exec(command, stdout=log, stderr=log)

        return BilingualCorpus.list(self._source_lang, self._target_lang,
                                    output_path)