def mmt_dedup(src_lang, tgt_lang, in_path, out_path, length_threshold=None):
    """Run the ``eu.modernmt.cli.DeduplicationMain`` Java entry point.

    The command is built with ``mmt_java`` and executed through
    ``osutils.shell_exec`` using the environment returned by ``__mmt_env()``.

    :param src_lang: value passed as ``-s``
    :param tgt_lang: value passed as ``-t``
    :param in_path: value passed as ``--input``
    :param out_path: value passed as ``--output``
    :param length_threshold: optional number; when it is not None and
        greater than zero it is forwarded as ``-l``
    """
    args = ['-s', src_lang, '-t', tgt_lang, '--input', in_path, '--output', out_path]

    if length_threshold is not None and length_threshold > 0:
        # The threshold is numeric (it is compared with 0 above), but every
        # element of a command line must be a string.
        args += ['-l', str(length_threshold)]

    command = mmt_java('eu.modernmt.cli.DeduplicationMain', args)
    osutils.shell_exec(command, env=__mmt_env())
def __test(self, src, tgt, filename):
    """Check tag projection for one dataset of the tag projection resources.

    Looks up ``<filename>.<src>``, ``<filename>.<tgt>`` and ``<filename>.alg``
    under ``TEST_RESOURCES/tag_projection_dataset``; the test is skipped when
    any of them is missing. The Java ``XMLProjectorTestMain`` class is run on
    the three files with its stdout captured in a temporary file, and every
    output line is compared with the matching source line: both must carry the
    same set of tags, and the projected tags must pass ``__validate_tags``.
    """
    dataset_dir = os.path.join(TEST_RESOURCES, 'tag_projection_dataset')
    src_file = os.path.join(dataset_dir, filename + '.' + src)
    tgt_file = os.path.join(dataset_dir, filename + '.' + tgt)
    alg_file = os.path.join(dataset_dir, filename + '.alg')

    if not all(os.path.isfile(path) for path in (src_file, tgt_file, alg_file)):
        self.skipTest("external resource not available")

    java_cmd = mmt_java('eu.modernmt.processing.tags.cli.XMLProjectorTestMain',
                        [src_file, tgt_file, alg_file])

    with tempfile.NamedTemporaryFile() as projected:
        osutils.shell_exec(java_cmd, stdout=projected)
        projected.flush()

        # The projector output replaces the reference target file as the
        # second stream handed to the reader.
        with _Reader(src_file, projected.name, alg_file) as reader:
            for src_line, tgt_line, alg_line in reader:
                src_line = src_line.rstrip()
                tgt_line = tgt_line.rstrip()

                src_tags = self._extract_tags(src_line)
                tgt_tags = self._extract_tags(tgt_line)
                context = (src_line, tgt_line, alg_line)

                if set(src_tags) != set(tgt_tags):
                    self.fail('Not all tags were projected:\n\t%s\n\t%s\n\t%s' % context)

                if not self.__validate_tags(tgt_tags):
                    self.fail('Invalid tag projection:\n\t%s\n\t%s\n\t%s' % context)
def datagen(self):
    """Run ``fairseq-preprocess`` on the encoded train/dev corpora.

    The output directory ``self.args.output_path`` is created if missing and
    used as ``--destdir``. Both stdout and stderr of the command are sent to
    ``self.log_fobj``.
    """
    os.makedirs(self.args.output_path, exist_ok=True)

    train_pref = os.path.join(self.state.encoded_corpora, 'train')
    valid_pref = os.path.join(self.state.encoded_corpora, 'dev')
    workers = str(multiprocessing.cpu_count())

    cmd = ['fairseq-preprocess']
    cmd.extend(['--source-lang', 'sl', '--target-lang', 'tl'])
    cmd.extend(['--user-dir', MMT_FAIRSEQ_USER_DIR, '--task', 'mmt_translation'])
    cmd.extend(['--trainpref', train_pref, '--validpref', valid_pref])
    cmd.extend(['--destdir', self.args.output_path, '--workers', workers])
    cmd.extend(['--srcdict', self.state.vocab, '--joined-dictionary'])
    cmd.extend(['--dataset-impl', 'mmap'])

    osutils.shell_exec(cmd, stdout=self.log_fobj, stderr=self.log_fobj)
def mmt_tmsclean(src_lang, tgt_lang, in_path, out_path, out_format=None, filters=None):
    """Run the ``eu.modernmt.cli.CleaningPipelineMain`` Java entry point.

    :param src_lang: value passed as ``-s``
    :param tgt_lang: value passed as ``-t``
    :param in_path: value passed as ``--input``
    :param out_path: value passed as ``--output``
    :param out_format: optional value passed as ``--output-format``
    :param filters: optional list appended after ``--filters``; ignored when
        None or empty
    """
    cli_args = ['-s', src_lang, '-t', tgt_lang, '--input', in_path, '--output', out_path]

    if out_format is not None:
        cli_args.extend(['--output-format', out_format])
    if filters is not None and len(filters) > 0:
        cli_args.extend(['--filters'] + filters)

    # JVM system properties that set the XML entity limits to 0.
    jvm_options = [
        '-DentityExpansionLimit=0',
        '-DtotalEntitySizeLimit=0',
        '-Djdk.xml.totalEntitySizeLimit=0',
    ]
    # 90% of whatever osutils.mem_size() reports (presumably MB - confirm).
    heap_mb = int(osutils.mem_size() * 90 / 100)

    command = mmt_java('eu.modernmt.cli.CleaningPipelineMain', cli_args,
                       max_heap_mb=heap_mb, java_ops=jvm_options)
    osutils.shell_exec(command, env=__mmt_env())
def mmt_preprocess(src_lang, tgt_lang, in_paths, out_path, dev_path=None, test_path=None, partition_size=None):
    """Run the ``eu.modernmt.cli.TrainingPipelineMain`` Java entry point.

    :param src_lang: value passed as ``-s``
    :param tgt_lang: value passed as ``-t``
    :param in_paths: a single path string or an iterable of paths, listed
        after ``--input``
    :param out_path: value passed as ``--output``
    :param dev_path: optional value passed as ``--dev``
    :param test_path: optional value passed as ``--test``
    :param partition_size: optional value, stringified and passed as ``--size``
    """
    # A lone string is treated as a one-element list of inputs.
    inputs = [in_paths] if isinstance(in_paths, str) else in_paths

    cli_args = ['-s', src_lang, '-t', tgt_lang, '--output', out_path, '--input']
    cli_args.extend(inputs)

    if partition_size is not None:
        cli_args.extend(['--size', str(partition_size)])
    if dev_path is not None:
        cli_args.extend(['--dev', dev_path])
    if test_path is not None:
        cli_args.extend(['--test', test_path])

    osutils.shell_exec(mmt_java('eu.modernmt.cli.TrainingPipelineMain', cli_args),
                       env=__mmt_env())
def cli(self, *args, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """Invoke the script at ``self._mmt_script`` with the given arguments.

    :param args: positional arguments appended after the script path
    :param stdin: forwarded to ``osutils.shell_exec``
    :param stdout: forwarded to ``osutils.shell_exec``
    :param stderr: forwarded to ``osutils.shell_exec``
    :return: whatever ``osutils.shell_exec`` returns
    """
    command = [self._mmt_script, *args]
    return osutils.shell_exec(command, stdin=stdin, stdout=stdout, stderr=stderr)
def fastalign_build(src_lang, tgt_lang, in_path, out_model, iterations=None, case_sensitive=True,
                    favor_diagonal=True, log=None):
    """Run the ``fa_build`` binary from ``MMT_BIN_DIR``.

    The model is written to ``<out_model>/<src_lang>__<tgt_lang>.fam``; the
    ``out_model`` directory is created if it does not exist.

    :param src_lang: value passed as ``-s``
    :param tgt_lang: value passed as ``-t``
    :param in_path: value passed as ``-i``
    :param out_model: directory that will contain the ``.fam`` model file
    :param iterations: optional value, stringified and passed as ``-I``
    :param case_sensitive: when False, ``--case-insensitive`` is added
    :param favor_diagonal: when False, ``--no-favor-diagonal`` is added
    :param log: destination for stdout and stderr; ``osutils.DEVNULL`` when None
    """
    os.makedirs(out_model, exist_ok=True)
    model_file = os.path.join(out_model, '%s__%s.fam' % (src_lang, tgt_lang))
    sink = osutils.DEVNULL if log is None else log

    command = [os.path.join(MMT_BIN_DIR, 'fa_build'),
               '-s', src_lang, '-t', tgt_lang, '-i', in_path, '-m', model_file]

    if iterations is not None:
        command += ['-I', str(iterations)]
    if not case_sensitive:
        command += ['--case-insensitive']
    if not favor_diagonal:
        command += ['--no-favor-diagonal']

    osutils.shell_exec(command, stdout=sink, stderr=sink, env=__mmt_env())
def __init__(self, model_path) -> None:
    """Load the dump of ``<model_path>/storage`` grouped by memory id.

    Runs the Java class ``eu.modernmt.context.lucene.storage.utils.Dump`` and
    parses each output line as ``memory<TAB>src_lang<TAB>tgt_lang<TAB>text``.
    The result is stored in ``self._content_by_memory``: a mapping from the
    integer memory id to a set of ``'src_lang\\ttgt_lang\\ttext'`` strings.
    """
    dump_cmd = ['java', '-cp', mmt.MMT_JAR,
                'eu.modernmt.context.lucene.storage.utils.Dump',
                os.path.join(model_path, 'storage')]
    std_out, _ = osutils.shell_exec(dump_cmd)

    self._content_by_memory = defaultdict(set)

    for record in std_out.splitlines(keepends=False):
        # At most three splits: any further tab stays inside the text field.
        memory, src_lang, tgt_lang, text = record.strip().split('\t', maxsplit=3)
        entry = '%s\t%s\t%s' % (src_lang, tgt_lang, text)
        self._content_by_memory[int(memory)].add(entry)
def __init__(self, model_path, main_class=None) -> None:
    """Load the dump of the model at ``model_path`` grouped by memory id.

    Runs a Java dump class on ``model_path`` and parses each output line as
    five tab-separated fields: memory id, source language, target language,
    source line and target line. The result is stored in
    ``self._content_by_memory``: a mapping from the integer memory id to a
    set of ``self.Entry`` objects.

    :param model_path: path handed to the dump class as its only argument
    :param main_class: Java class to run; defaults to
        ``eu.modernmt.decoder.neural.memory.lucene.utils.Dump``
    """
    dump_class = main_class
    if dump_class is None:
        dump_class = 'eu.modernmt.decoder.neural.memory.lucene.utils.Dump'

    std_out, _ = osutils.shell_exec(['java', '-cp', mmt.MMT_JAR, dump_class, model_path])

    self._content_by_memory = defaultdict(set)

    for record in std_out.splitlines(keepends=False):
        memory, src_lang, tgt_lang, src_line, tgt_line = record.strip().split('\t')
        entry = self.Entry(src_lang, tgt_lang, src_line, tgt_line)
        self._content_by_memory[int(memory)].add(entry)
def fastalign_score(src_lang, tgt_lang, model_path, in_path, out_path=None):
    """Run the ``fa_score`` binary from ``MMT_BIN_DIR`` and return its statistics.

    The model file is ``<model_path>/<src_lang>__<tgt_lang>.fam``. The
    command's stdout is parsed as ``key=value`` lines with float values.

    :param src_lang: value passed as ``-s``
    :param tgt_lang: value passed as ``-t``
    :param model_path: directory containing the ``.fam`` model file
    :param in_path: value passed as ``-i``
    :param out_path: value passed as ``-o``; ``in_path`` is used when falsy
    :return: tuple ``(good_avg, good_std_dev, bad_avg, bad_std_dev)``
    """
    model_file = os.path.join(model_path, '%s__%s.fam' % (src_lang, tgt_lang))
    binary = os.path.join(MMT_BIN_DIR, 'fa_score')

    command = [binary, '-s', src_lang, '-t', tgt_lang, '-m', model_file,
               '-i', in_path, '-o', out_path or in_path]
    stdout, _ = osutils.shell_exec(command, env=__mmt_env())

    scores = {}
    for entry in stdout.splitlines(keepends=False):
        name, raw_value = entry.split('=', maxsplit=1)
        scores[name] = float(raw_value)

    wanted = ('good_avg', 'good_std_dev', 'bad_avg', 'bad_std_dev')
    return tuple(scores[name] for name in wanted)
def __get_java_version():
    """Return the major Java version reported by ``java -version``.

    Both stdout and stderr of the command are scanned for a line containing
    the token ``version``; the token that follows it (with surrounding double
    quotes removed) is taken as the version string. A leading ``1.`` is
    dropped, so ``1.8.0_201`` yields ``8`` and ``11.0.2`` yields ``11``.

    :return: the major version as an int, or None when the command raises
        OSError or no parsable version string is found
    """
    try:
        stdout, stderr = osutils.shell_exec(['java', '-version'])
    except OSError:
        return None

    java_output = stdout + '\n' + stderr

    for line in java_output.split('\n'):
        tokens = line.split()
        if 'version' not in tokens:
            continue

        value_index = tokens.index('version') + 1
        if value_index >= len(tokens):
            # 'version' is the last word on the line: nothing to parse here.
            continue

        version = tokens[value_index].strip('"')
        if version.startswith('1.'):
            version = version[2:]

        match = re.match('^[0-9]+', version)
        if match is not None:
            return int(match.group())
        # Not a numeric version string: keep looking on the following lines.

    return None
def start(self):
    """Launch ``eu.modernmt.cli.BackupDaemonMain`` as a background process.

    The daemon is started for ``self.engine.name`` with the current
    environment plus ``MMT_Q_HOST`` set to ``network.get_ip()``. The value
    returned by ``osutils.shell_exec`` is kept in ``self._process``.
    """
    daemon_args = ['-e', self.engine.name, '-i', '3600', '-l', '1']
    command = mmt_java('eu.modernmt.cli.BackupDaemonMain', args=daemon_args)

    daemon_env = {**os.environ, 'MMT_Q_HOST': network.get_ip()}

    self._process = osutils.shell_exec(command, background=True, env=daemon_env)
def train_nn(self):
    """Run ``fairseq-train`` and wait for it to finish or to be stopped.

    The model directory is ``self.wdir('nn_model')``. If it has no
    ``checkpoint_last.pt`` and ``self.args.init_model`` is set, that file is
    copied in as the starting checkpoint. When ``self.args.tensorboard_port``
    is set, a ``tensorboard`` server is started alongside the training and
    terminated when this method returns or raises.

    When ``self.args.train_steps`` is set, it is passed as ``--max-update``
    and the method blocks until the process exits. Otherwise the process is
    polled every 5 minutes: whenever the most recent checkpoint has changed
    since the previous poll, ``self._training_should_stop()`` is consulted
    and the process is terminated if it returns a true value.

    :raises ShellError: if the training process exits with a non-zero code
    """
    self.state.nn_path = self.wdir('nn_model')

    last_ckpt_path = os.path.join(self.state.nn_path, 'checkpoint_last.pt')
    if not os.path.isfile(last_ckpt_path) and self.args.init_model is not None:
        shutil.copy(self.args.init_model, last_ckpt_path)

    # Create command
    tensorboard_logdir = self.state.tensorboard_logdir = self.wdir('tensorboard_logdir')

    cmd = ['fairseq-train', self.args.data_path, '--save-dir', self.state.nn_path,
           '--task', 'mmt_translation', '--user-dir', MMT_FAIRSEQ_USER_DIR,
           '--share-all-embeddings', '--no-progress-bar',
           '--tensorboard-logdir', tensorboard_logdir, '--dataset-impl', 'mmap']

    if self.args.train_steps is not None:
        cmd.extend(['--max-update', str(self.args.train_steps)])

    cmd += self.extra_argv

    # Create environment
    env = None
    if self.args.gpus is not None:
        env = os.environ.copy()
        env['CUDA_VISIBLE_DEVICES'] = ','.join([str(gpu) for gpu in self.args.gpus])

    # Without a fixed number of steps, wake up periodically to evaluate the
    # termination policy; otherwise wait indefinitely for the process to exit.
    process_timeout = None
    if self.args.train_steps is None:
        process_timeout = 5 * 60  # 5 minutes

    tensorboard = None
    tensorboard_log = None

    # Both processes are started inside the try block so that tensorboard and
    # its log file are always released, even if launching the training fails.
    try:
        # Start process
        if self.args.tensorboard_port is not None:
            tensorboard_env = os.environ.copy()
            tensorboard_env['CUDA_VISIBLE_DEVICES'] = ''

            tensorboard_log = open(os.path.join(self.state.tensorboard_logdir, 'server.log'), 'wb')
            tensorboard_cmd = ['tensorboard', '--logdir', tensorboard_logdir,
                               '--port', str(self.args.tensorboard_port)]
            tensorboard = osutils.shell_exec(tensorboard_cmd, stderr=tensorboard_log, stdout=tensorboard_log,
                                             env=tensorboard_env, background=True)

        process = osutils.shell_exec(cmd, stderr=self.log_fobj, stdout=self.log_fobj,
                                     background=True, env=env)

        last_checkpoint = None

        while True:
            try:
                return_code = process.wait(process_timeout)

                if return_code != 0:
                    raise ShellError(' '.join(cmd), return_code)

                break
            except KeyboardInterrupt:
                process.terminate()
                self._logger.info('Training manually interrupted by user')
                break
            except TimeoutExpired:
                checkpoints = _last_n_checkpoints(self.state.nn_path, 1)
                checkpoint = checkpoints[0] if len(checkpoints) > 0 else None

                # Only consult the policy when a new checkpoint has appeared.
                if last_checkpoint != checkpoint and self._training_should_stop():
                    process.terminate()
                    self._logger.info('Training interrupted by termination policy: '
                                      'validation loss has reached its plateau')
                    break

                last_checkpoint = checkpoint
    finally:
        try:
            if tensorboard is not None:
                tensorboard.terminate()
        finally:
            if tensorboard_log is not None:
                tensorboard_log.close()
def pip_install():
    """Install ``requirements.txt`` from ``mmt.MMT_HOME_DIR`` with ``pip3``.

    The command's stdout and stderr are forwarded to this process's own
    ``sys.stdout`` and ``sys.stderr``.
    """
    requirements_txt = os.path.join(mmt.MMT_HOME_DIR, 'requirements.txt')
    pip_cmd = ['pip3', 'install', '-r', requirements_txt]
    osutils.shell_exec(pip_cmd, stdout=sys.stdout, stderr=sys.stderr)