def generate(self, bilingual_corpora, monolingual_corpora, output, log_file=None):
    """Run the Java domain-mapping tool and build the training folder.

    The distinct folders of ``bilingual_corpora`` are passed to the Java main
    class after ``-c``. Each line of its standard output is parsed as a
    tab-separated ``domain<TAB>name`` pair and collected into a
    ``name -> domain`` dictionary, which is handed to
    ``self._make_training_folder`` together with the corpora and ``output``.

    :param bilingual_corpora: corpora exposing ``get_folder()``
    :param monolingual_corpora: forwarded to ``_make_training_folder``
    :param output: forwarded to ``_make_training_folder``
    :param log_file: optional path; when given, the command's stderr is
        written to this file, otherwise it goes to ``shell.DEVNULL``
    :return: whatever ``self._make_training_folder`` returns
    :raises ValueError: if an output line does not split into exactly two
        tab-separated fields
    """
    fileutils.makedirs(self._model, exist_ok=True)

    args = ['--db', os.path.join(self._model, 'domains.db'), '-l', self._source_lang, '-c']
    args.extend({corpus.get_folder() for corpus in bilingual_corpora})

    command = cli.mmt_javamain(self._java_mainclass, args)

    # Open the log before entering the try block: if open() fails there is
    # nothing to clean up, and shell.DEVNULL (which this method does not own)
    # is never closed by mistake.
    log_stream = open(log_file, 'w') if log_file is not None else None
    try:
        stderr = log_stream if log_stream is not None else shell.DEVNULL
        stdout, _ = shell.execute(command, stderr=stderr)

        domains = {}
        for line in stdout.splitlines():
            domain, name = line.rstrip('\n').split('\t', 2)
            domains[name] = domain

        return self._make_training_folder(bilingual_corpora, monolingual_corpora, domains, output)
    finally:
        if log_stream is not None:
            log_stream.close()
def process(self, corpora, output_path, data_path=None):
    """Run the Java preprocessing main class over the folders of ``corpora``.

    The distinct folders returned by ``corpus.get_folder()`` follow the
    ``--input`` option. When ``data_path`` is given, ``--dev`` and ``--test``
    are added, pointing at the ``TrainingPreprocessor.DEV_FOLDER_NAME`` and
    ``TrainingPreprocessor.TEST_FOLDER_NAME`` sub-folders of ``data_path``.

    :return: the result of ``BilingualCorpus.splitlist`` on ``output_path``
    """
    args = [
        '-s', self._source_lang,
        '-t', self._target_lang,
        '-v', self._vocabulary_path,
        '--output', output_path,
        '--input',
    ]
    args.extend(set(corpus.get_folder() for corpus in corpora))

    if data_path is not None:
        args += ['--dev', os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME)]
        args += ['--test', os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)]

    # All three standard streams of the child are bound to DEVNULL.
    devnull = shell.DEVNULL
    shell.execute(mmt_javamain(self._java_mainclass, args),
                  stdin=devnull, stdout=devnull, stderr=devnull)

    return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
def _start_process(self):
    """Spawn the ``eu.modernmt.cli.ClusterNodeMain`` Java process.

    Ensures the engine runtime folder exists, resolves the node log file,
    builds the command line from the instance configuration (cluster ports,
    status file and, when set, REST API port, verbosity and sibling member),
    removes any existing status file and starts the process.

    The child's stderr is written to the log file and its stdout is discarded.

    :return: the ``subprocess.Popen`` object of the started process
    """
    runtime_path = self.engine.get_runtime_path()
    if not os.path.isdir(runtime_path):
        fileutils.makedirs(runtime_path, exist_ok=True)

    self._log_file = self.engine.get_logfile(ClusterNode.__LOG_FILENAME, ensure=True)

    args = ['-e', self.engine.name,
            '-p', str(self._cluster_ports[0]), str(self._cluster_ports[1]),
            '--status-file', self._status_file]

    if self._start_rest_server:
        args += ['-a', str(self._api_port)]
    if self._verbosity is not None:
        args += ['-v', str(self._verbosity)]
    if self._sibling is not None:
        args += ['--member', str(self._sibling)]

    # hserr_path is the folder that contains the log file.
    logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))
    command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args, hserr_path=logs_folder)

    # 'w' replaces the invalid 'wa' mode (rejected by Python 3, and treated as
    # plain 'w' by glibc on Python 2). The null device is opened for writing
    # so the child's stdout is a writable descriptor. The child inherits its
    # own copies of both descriptors, so the parent's handles can be closed
    # as soon as Popen returns.
    with open(self._log_file, 'w') as log, open(os.devnull, 'w') as devnull:
        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        return subprocess.Popen(command, stdout=devnull, stderr=log, shell=False)
def process(self, corpora, output_path, test_data_path=None, dev_data_path=None, log=None):
    """Run the Java preprocessing main class over the folders of ``corpora``.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``--input`` option
    :param output_path: value of the ``--output`` option
    :param test_data_path: optional value of the ``--test`` option
    :param dev_data_path: optional value of the ``--dev`` option
    :param log: stream receiving the child's stdout and stderr; defaults to
        ``osutils.DEVNULL``
    :return: the result of ``BilingualCorpus.list`` for the language pair and
        ``output_path``
    """
    stream = osutils.DEVNULL if log is None else log

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']
    args.extend(set(corpus.get_folder() for corpus in corpora))

    # '--dev' is emitted before '--test', as in the option order expected here.
    for flag, path in (('--dev', dev_data_path), ('--test', test_data_path)):
        if path is not None:
            args += [flag, path]

    osutils.shell_exec(mmt_javamain(self._java_main, args), stdout=stream, stderr=stream)

    return BilingualCorpus.list(self._source_lang, self._target_lang, output_path)
def clean(self, corpora, output_path, log=None):
    """Run the Java cleaning main class with a heap sized from physical memory.

    The maximum heap passed to ``mmt_javamain`` is 90% of the physical memory
    reported by ``os.sysconf`` (page size times physical page count),
    expressed in megabytes and truncated to an integer.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``--input`` option
    :param output_path: value of the ``--output`` option
    :param log: stream receiving the child's stdout and stderr; defaults to
        ``shell.DEVNULL``
    :return: the result of ``BilingualCorpus.list(output_path)``
    """
    stream = shell.DEVNULL if log is None else log

    # Physical memory in bytes, e.g. 4015976448.
    page_size = os.sysconf('SC_PAGE_SIZE')
    page_count = os.sysconf('SC_PHYS_PAGES')
    total_bytes = page_size * page_count
    # Bytes -> megabytes (float division).
    total_mb = total_bytes / (1024. ** 2)
    heap_limit_mb = int(total_mb * 90 / 100)

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']
    args.extend(set(corpus.get_folder() for corpus in corpora))

    command = mmt_javamain(self._java_mainclass, args=args, max_heap_mb=heap_limit_mb)
    shell.execute(command, stdout=stream, stderr=stream)

    return BilingualCorpus.list(output_path)
def __get_command(self, lang, print_tags, print_placeholders, original_spacing):
    """Build the command for the Java main class from the given options.

    :param lang: value of the ``--lang`` option
    :param print_tags: when false, ``--no-tags`` is added
    :param print_placeholders: when true, ``--print-placeholders`` is added
    :param original_spacing: when true, ``--original-spacing`` is added
    :return: the result of ``mmt_javamain`` for ``self._java_mainclass``
    """
    # (enabled, flag) pairs, listed in the order the flags are emitted.
    switches = (
        (original_spacing, '--original-spacing'),
        (not print_tags, '--no-tags'),
        (print_placeholders, '--print-placeholders'),
    )

    args = ['--lang', lang]
    args.extend(flag for enabled, flag in switches if enabled)

    return mmt_javamain(self._java_mainclass, args)
def clean(self, source, target, input_paths, output_path):
    """Run the Java cleaning main class on ``input_paths``.

    :param source: value of the ``-s`` option
    :param target: value of the ``-t`` option
    :param input_paths: iterable of paths placed after ``--input``
    :param output_path: value of the ``--output`` option
    :return: the first element of
        ``BilingualCorpus.splitlist(source, target, roots=output_path)``
    """
    args = ['-s', source, '-t', target, '--output', output_path, '--input']
    args.extend(input_paths)

    # All three standard streams of the child are bound to DEVNULL.
    devnull = shell.DEVNULL
    shell.execute(mmt_javamain(self._java_mainclass, args),
                  stdin=devnull, stdout=devnull, stderr=devnull)

    split = BilingualCorpus.splitlist(source, target, roots=output_path)
    return split[0]
def process_file(self, source, dest, lang):
    """Pipe the file ``source`` through the Java main class into ``dest``.

    The command is built from ``self._lang`` and the instance's tag and
    placeholder settings. Its stdin is the file at ``source`` and its stdout
    is the file returned by ``dest.get_file(lang)``, opened for writing.

    :param source: path of the input file
    :param dest: object exposing ``get_file(lang)``
    :param lang: language used only to select the output file
    """
    args = ['--lang', self._lang]
    if not self._print_tags:
        args += ['--no-tags']
    if self._print_placeholders:
        args += ['--print-placeholders']

    command = mmt_javamain(self._java_mainclass, args=args)

    with open(source) as reader, open(dest.get_file(lang), 'w') as writer:
        shell.execute(command, stdin=reader, stdout=writer)
def clean(self, corpora, output_path):
    """Run the Java cleaning main class over the folders of ``corpora``.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``--input`` option
    :param output_path: value of the ``--output`` option
    :return: the result of ``BilingualCorpus.list(output_path)``
    """
    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']
    args.extend(set(corpus.get_folder() for corpus in corpora))

    # All three standard streams of the child are bound to DEVNULL.
    devnull = shell.DEVNULL
    shell.execute(mmt_javamain(self._java_mainclass, args),
                  stdin=devnull, stdout=devnull, stderr=devnull)

    return BilingualCorpus.list(output_path)
def clean(self, corpora, output_path, log=None):
    """Run the Java cleaning main class over the folders of ``corpora``.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``--input`` option
    :param output_path: value of the ``--output`` option
    :param log: stream receiving the child's stdout and stderr; defaults to
        ``shell.DEVNULL``
    :return: the result of ``BilingualCorpus.list(output_path)``
    """
    stream = shell.DEVNULL if log is None else log

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']
    args.extend(set(corpus.get_folder() for corpus in corpora))

    shell.execute(mmt_javamain(self._java_mainclass, args), stdout=stream, stderr=stream)

    return BilingualCorpus.list(output_path)
def process(self, source, target, input_paths, output_path, data_path=None):
    """Run the Java preprocessing main class on ``input_paths``.

    :param source: value of the ``-s`` option
    :param target: value of the ``-t`` option
    :param input_paths: iterable of paths placed after ``--input``
    :param output_path: value of the ``--output`` option
    :param data_path: optional folder; when given, ``--dev`` and ``--test``
        point at its ``TrainingPreprocessor.DEV_FOLDER_NAME`` and
        ``TrainingPreprocessor.TEST_FOLDER_NAME`` sub-folders
    :return: the result of
        ``BilingualCorpus.splitlist(source, target, roots=output_path)``
    """
    args = ['-s', source, '-t', target, '--output', output_path, '--input']
    args.extend(input_paths)

    if data_path is not None:
        args += ['--dev', os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME)]
        args += ['--test', os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)]

    # All three standard streams of the child are bound to DEVNULL.
    devnull = shell.DEVNULL
    shell.execute(mmt_javamain(self._java_mainclass, args),
                  stdin=devnull, stdout=devnull, stderr=devnull)

    return BilingualCorpus.splitlist(source, target, roots=output_path)
def reduce(self, corpora, output_path, word_limit, log=None):
    """Run the Java reduce main class over the folders of ``corpora``.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``--input`` option
    :param output_path: value of the ``--output`` option
    :param word_limit: passed, converted to a string, as the ``--words`` option
    :param log: stream receiving the child's stdout and stderr; defaults to
        ``shell.DEVNULL``
    :return: the result of ``BilingualCorpus.list(output_path)``
    """
    stream = shell.DEVNULL if log is None else log

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--words', str(word_limit),
            '--output', output_path, '--input']
    args.extend(set(corpus.get_folder() for corpus in corpora))

    shell.execute(mmt_javamain(self._reduce_mainclass, args=args), stdout=stream, stderr=stream)

    return BilingualCorpus.list(output_path)
def _start_process(self, api_port, cluster_port, datastream_port, db_port, leader, verbosity):
    """Spawn the ``eu.modernmt.cli.ClusterNodeMain`` Java process.

    Ensures the engine runtime folder exists, builds the command line,
    removes any existing status file and starts the process with stdout and
    stderr bound to ``shell.DEVNULL``.

    Each of ``cluster_port``, ``api_port``, ``datastream_port``, ``db_port``
    and ``verbosity`` is added (as a string) only when it is not ``None``;
    ``leader`` is added as-is when it is not ``None``.

    :return: the ``subprocess.Popen`` object of the started process
    """
    if not os.path.isdir(self.engine.runtime_path):
        fileutils.makedirs(self.engine.runtime_path, exist_ok=True)

    # The logs folder is the folder that contains the log file.
    logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))

    args = ['-e', self.engine.name,
            '--status-file', self._status_file,
            '--logs', logs_folder]

    # Optional settings, listed in the order they are emitted.
    optional = (
        ('--cluster-port', cluster_port),
        ('--api-port', api_port),
        ('--datastream-port', datastream_port),
        ('--db-port', db_port),
        ('-v', verbosity),
    )
    for flag, value in optional:
        if value is not None:
            args += [flag, str(value)]

    if leader is not None:
        args += ['--leader', leader]

    command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args, hserr_path=logs_folder)

    if os.path.isfile(self._status_file):
        os.remove(self._status_file)

    return subprocess.Popen(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL, shell=False)
def create_index(self, corpora, log=None):
    """Rebuild the index folder by running the Java main class on ``corpora``.

    The existing ``self._index`` folder is removed (errors ignored) and
    re-created before the command runs.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``-c`` option
    :param log: stream receiving the child's stdout and stderr; defaults to
        ``shell.DEVNULL``
    """
    stream = shell.DEVNULL if log is None else log

    folders = set(corpus.get_folder() for corpus in corpora)

    # Start from an empty index folder.
    shutil.rmtree(self._index, ignore_errors=True)
    fileutils.makedirs(self._index, exist_ok=True)

    args = ['-s', self._source_lang, '-t', self._target_lang, '-i', self._index, '-c']
    args.extend(folders)

    shell.execute(mmt_javamain(self._java_mainclass, args), stdout=stream, stderr=stream)
def process(self, corpora, output_path, data_path=None):
    """Run the Java preprocessing main class over the folders of ``corpora``.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``--input`` option
    :param output_path: value of the ``--output`` option
    :param data_path: optional folder; when given, ``--dev`` and ``--test``
        point at its ``TrainingPreprocessor.DEV_FOLDER_NAME`` and
        ``TrainingPreprocessor.TEST_FOLDER_NAME`` sub-folders
    :return: the result of ``BilingualCorpus.splitlist`` for the language
        pair with ``roots=output_path``
    """
    args = [
        '-s', self._source_lang,
        '-t', self._target_lang,
        '-v', self._vocabulary_path,
        '--output', output_path,
        '--input',
    ]
    args.extend(set(corpus.get_folder() for corpus in corpora))

    if data_path is not None:
        args += ['--dev', os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME)]
        args += ['--test', os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)]

    # All three standard streams of the child are bound to DEVNULL.
    devnull = shell.DEVNULL
    shell.execute(mmt_javamain(self._java_mainclass, args),
                  stdin=devnull, stdout=devnull, stderr=devnull)

    return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
def process_file(self, input_path, output_path, lang):
    """Pipe ``input_path`` through the Java main class into ``output_path``.

    ``lang`` selects the direction: when it equals the source language the
    command gets ``-s source -t target``; when it equals the target language
    the two are swapped.

    :param input_path: file used as the command's stdin
    :param output_path: file (opened for writing) used as the command's stdout
    :param lang: must equal ``self._source_lang`` or ``self._target_lang``
    :raises ValueError: if ``lang`` matches neither language
    """
    if lang == self._source_lang:
        from_lang, to_lang = self._source_lang, self._target_lang
    elif lang == self._target_lang:
        from_lang, to_lang = self._target_lang, self._source_lang
    else:
        raise ValueError('Unsupported language "%s"' % lang)

    args = ['-s', from_lang, '-t', to_lang]
    if not self._print_tags:
        args += ['--no-tags']
    if self._print_placeholders:
        args += ['--print-placeholders']

    command = mmt_javamain(self._java_main, args=args)

    with open(input_path) as reader, open(output_path, 'w') as writer:
        osutils.shell_exec(command, stdin=reader, stdout=writer)
def generate(self, bilingual_corpora, monolingual_corpora, output, log=None):
    """Run the Java domain-mapping tool and symlink the corpora into ``output``.

    Each line of the command's standard output is parsed as a tab-separated
    ``domain<TAB>name`` pair. Every bilingual corpus is symlinked into
    ``output`` under the domain found for ``corpus.name``; every monolingual
    corpus is symlinked into ``output`` unchanged.

    :param bilingual_corpora: corpora exposing ``get_folder()``, ``name`` and
        ``symlink()``
    :param monolingual_corpora: corpora exposing ``symlink()``
    :param output: destination passed to ``symlink``
    :param log: stream receiving the child's stderr; defaults to
        ``shell.DEVNULL``
    :return: tuple ``(bilingual, monolingual)`` of the symlinked corpora lists
    :raises ValueError: if an output line does not split into exactly two
        tab-separated fields
    :raises KeyError: if a bilingual corpus name is missing from the output
    """
    stream = shell.DEVNULL if log is None else log

    fileutils.makedirs(self._model, exist_ok=True)

    args = ['--db', os.path.join(self._model, 'domains.db'),
            '-s', self._source_lang, '-t', self._target_lang, '-c']
    args.extend(set(corpus.get_folder() for corpus in bilingual_corpora))

    stdout, _ = shell.execute(cli.mmt_javamain(self._java_mainclass, args), stderr=stream)

    # Map corpus name -> domain from the "domain<TAB>name" output lines.
    name_to_domain = {}
    for line in stdout.splitlines():
        domain, name = line.rstrip('\n').split('\t', 2)
        name_to_domain[name] = domain

    linked_bilingual = [corpus.symlink(output, name=name_to_domain[corpus.name])
                        for corpus in bilingual_corpora]
    linked_monolingual = [corpus.symlink(output) for corpus in monolingual_corpora]

    return linked_bilingual, linked_monolingual
def _start_process(self):
    """Spawn the ``eu.modernmt.cli.ClusterNodeMain`` Java process.

    Ensures the engine runtime folder exists, resolves the node log file,
    builds the command line from the instance configuration (cluster ports,
    status file and, when set, REST API port, verbosity and sibling member),
    removes any existing status file and starts the process.

    The child's stderr is written to the log file and its stdout is discarded.

    :return: the ``subprocess.Popen`` object of the started process
    """
    runtime_path = self.engine.get_runtime_path()
    if not os.path.isdir(runtime_path):
        fileutils.makedirs(runtime_path, exist_ok=True)

    self._log_file = self.engine.get_logfile(ClusterNode.__LOG_FILENAME, ensure=True)

    args = [
        '-e', self.engine.name,
        '-p', str(self._cluster_ports[0]), str(self._cluster_ports[1]),
        '--status-file', self._status_file,
    ]

    if self._start_rest_server:
        args += ['-a', str(self._api_port)]
    if self._verbosity is not None:
        args += ['-v', str(self._verbosity)]
    if self._sibling is not None:
        args += ['--member', str(self._sibling)]

    # hserr_path is the folder that contains the log file.
    logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))
    command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args, hserr_path=logs_folder)

    # 'w' replaces the invalid 'wa' mode (rejected by Python 3, and treated as
    # plain 'w' by glibc on Python 2). The null device is opened for writing
    # so the child's stdout is a writable descriptor. The child inherits its
    # own copies of both descriptors, so the parent's handles can be closed
    # as soon as Popen returns.
    with open(self._log_file, 'w') as log, open(os.devnull, 'w') as devnull:
        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        return subprocess.Popen(command, stdout=devnull, stderr=log, shell=False)
def clean(self, corpora, output_path, log=None):
    """Run the Java cleaning main class with an extended maximum heap.

    The maximum heap passed to ``mmt_javamain`` is 90% of the value returned
    by ``osutils.mem_size()``, truncated to an integer.

    :param corpora: corpora exposing ``get_folder()``; their distinct folders
        follow the ``--input`` option
    :param output_path: value of the ``--output`` option
    :param log: stream receiving the child's stdout and stderr; defaults to
        ``osutils.DEVNULL``
    :return: the result of ``BilingualCorpus.list`` for the language pair and
        ``output_path``
    """
    stream = osutils.DEVNULL if log is None else log

    args = ['-s', self._source_lang, '-t', self._target_lang,
            '--output', output_path, '--input']
    args.extend(set(corpus.get_folder() for corpus in corpora))

    heap_limit_mb = int(osutils.mem_size() * 90 / 100)
    command = mmt_javamain(self._java_main, args=args, max_heap_mb=heap_limit_mb)

    osutils.shell_exec(command, stdout=stream, stderr=stream)

    return BilingualCorpus.list(self._source_lang, self._target_lang, output_path)
def create_index(self, corpora, lang, log_file=None):
    """Run the Java indexing main class over the folders of ``corpora``.

    The ``self._index`` folder is created if missing, then the command is run
    with ``-l lang``, ``-i self._index`` and the distinct corpus folders
    after ``-c``.

    :param corpora: corpora exposing ``get_folder()``
    :param lang: value of the ``-l`` option
    :param log_file: optional path; when given, the command's stdout and
        stderr are written to this file, otherwise they go to
        ``shell.DEVNULL``
    """
    source_paths = {corpus.get_folder() for corpus in corpora}

    fileutils.makedirs(self._index, exist_ok=True)

    args = ['-l', lang, '-i', self._index, '-c']
    args.extend(source_paths)

    command = mmt_javamain(self._java_mainclass, args)

    # Open the log before entering the try block: if open() fails there is
    # nothing to clean up, and shell.DEVNULL (which this method does not own)
    # is never closed by mistake.
    log_stream = open(log_file, 'w') if log_file is not None else None
    try:
        stream = log_stream if log_stream is not None else shell.DEVNULL
        shell.execute(command, stdout=stream, stderr=stream)
    finally:
        if log_stream is not None:
            log_stream.close()
def _start_process(self):
    """Spawn the ``eu.modernmt.cli.ClusterNodeMain`` Java process.

    Ensures the engine runtime folder exists, builds the command line from
    the instance configuration (cluster ports, datastream port, status file,
    logs folder and, when set, REST API port, verbosity and sibling member),
    removes any existing status file and starts the process with stdout and
    stderr bound to ``shell.DEVNULL``.

    :return: the ``subprocess.Popen`` object of the started process
    """
    if not os.path.isdir(self.engine.runtime_path):
        fileutils.makedirs(self.engine.runtime_path, exist_ok=True)

    # The logs folder is the folder that contains the log file.
    logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))

    first_port, second_port = self._cluster_ports[0], self._cluster_ports[1]
    args = [
        '-e', self.engine.name,
        '-p', str(first_port), str(second_port),
        '--datastream-port', str(self._datastream_port),
        '--status-file', self._status_file,
        '--logs', logs_folder,
    ]

    if self._start_rest_server:
        args += ['-a', str(self._api_port)]
    if self._verbosity is not None:
        args += ['-v', str(self._verbosity)]
    if self._sibling is not None:
        args += ['--member', str(self._sibling)]

    command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args, hserr_path=logs_folder)

    if os.path.isfile(self._status_file):
        os.remove(self._status_file)

    return subprocess.Popen(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL, shell=False)
def start(self, api_port=None, cluster_port=None, datastream_port=None, db_port=None, leader=None,
          verbosity=None, remote_debug=False, log_file=None):
    """Start the ``eu.modernmt.cli.ClusterNodeMain`` Java process for this node.

    Each of ``cluster_port``, ``api_port``, ``datastream_port``, ``db_port``
    and ``verbosity`` is added to the command line (as a string) only when it
    is not ``None``; ``leader`` is added as-is when it is not ``None``.

    The maximum heap is a quarter of the physical memory reported by
    ``os.sysconf``, clamped to the range 1024..16384 MB and rounded down to a
    multiple of 1024 MB.

    :param remote_debug: forwarded to ``mmt_javamain``
    :param log_file: when given, replaces ``self._log_file``
    :raises Exception: if the parent class ``_start`` call returns a false
        value
    """
    if log_file is not None:
        self._log_file = log_file

    if not os.path.isdir(self.engine.runtime_path):
        os.makedirs(self.engine.runtime_path)

    args = ['-e', self.engine.name,
            '--status-file', self._status_file,
            '--log-file', self._log_file]

    # Optional settings, listed in the order they are emitted.
    optional = (
        ('--cluster-port', cluster_port),
        ('--api-port', api_port),
        ('--datastream-port', datastream_port),
        ('--db-port', db_port),
        ('-v', verbosity),
    )
    for flag, value in optional:
        if value is not None:
            args += [flag, str(value)]

    if leader is not None:
        args += ['--leader', leader]

    # Physical memory in bytes (e.g. 4015976448), then converted to megabytes.
    total_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
    total_mb = total_bytes / (1024. ** 2)

    # A quarter of the memory, capped at 16 GB and never below 1 GB ...
    heap_mb = min(total_mb / 4, 16 * 1024)
    heap_mb = max(heap_mb, 1024)
    # ... rounded down to a whole number of gigabytes.
    heap_mb = int(heap_mb / 1024) * 1024

    # The logs folder is the folder that contains the log file.
    logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))

    command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args,
                           logs_path=logs_folder, remote_debug=remote_debug,
                           max_heap_mb=heap_mb, server=True)

    if os.path.isfile(self._status_file):
        os.remove(self._status_file)

    started = super(ClusterNode, self)._start(command)
    if not started:
        raise Exception('failed to start node, check log file for more details: %s' % self._log_file)