def start(self):
    """Start the embedded ZooKeeper and Kafka daemon processes.

    Both daemons share a single log file obtained from the engine. On any
    failure the partially-started daemons are killed, the log handle is
    closed, and the exception is re-raised.

    Raises IllegalStateException if the process is already running or if
    either daemon fails to start.
    """
    if self.is_running():
        raise IllegalStateException('process is already running')

    self._log_file = self._engine.get_logfile('embedded-kafka', ensure=True)

    zpid, kpid = 0, 0
    log = open(self._log_file, 'w')
    try:
        # ZooKeeper must be up before Kafka can start.
        zpid = self._start_zookeeper(log)
        if zpid is None:
            raise IllegalStateException(
                'failed to start zookeeper, check log file for more details: ' + self._log_file)

        kpid = self._start_kafka(log)
        if kpid is None:
            raise IllegalStateException(
                'failed to start kafka, check log file for more details: ' + self._log_file)

        self._set_pids(kpid, zpid)
    except:
        # Cleanup on any failure, then re-raise. The original 'success' flag
        # guarding this branch was dead code: it could only become True as
        # the very last statement of the try block, so it was always False
        # whenever an exception reached this handler.
        # NOTE(review): daemon.kill is assumed to tolerate pid values of
        # 0/None for daemons that never started -- confirm its implementation.
        daemon.kill(kpid)
        daemon.kill(zpid)
        log.close()
        raise
def start(self):
    """Start the cluster node process (and its embedded Kafka, if any).

    Polls for up to ~5 seconds waiting for the node to report itself as
    running; on timeout the embedded Kafka (if started) is stopped again.

    Raises IllegalStateException if the process is already running or if
    the node fails to come up within the timeout.
    """
    if self.is_running():
        raise IllegalStateException('process is already running')

    # Embedded Kafka (datastream) must be available before the node starts.
    if self._kafka:
        self._kafka.start()

    process = self._start_process()

    pid = process.pid
    if pid > 0:
        self._set_pid(pid)

    # Poll once per second, up to 5 times, for the node to become responsive.
    success = False
    for _ in range(5):
        success = self.is_running()
        if success:
            break
        time.sleep(1)

    if not success:
        if self._kafka:
            self._kafka.stop()
        # Consistency fix: sibling start()/stop() methods raise
        # IllegalStateException here, not a bare Exception. Backward
        # compatible for callers catching Exception.
        # NOTE(review): self._log_file is never assigned in this method --
        # assumed to be set elsewhere before start() is called; confirm.
        raise IllegalStateException(
            'failed to start node, check log file for more details: ' + self._log_file)
def stop(self):
    """Stop the embedded Kafka and ZooKeeper daemons.

    Raises IllegalStateException if the process is not running.
    """
    kafka_pid, zookeeper_pid = self._get_pids()

    if not self.is_running():
        raise IllegalStateException('process is not running')

    # Give Kafka up to 5 seconds to shut down, then take down ZooKeeper.
    daemon.kill(kafka_pid, 5)
    daemon.kill(zookeeper_pid)
def start(self):
    """Start the embedded ZooKeeper and Kafka daemons for the datastream.

    Verifies the configured port is free, wipes and recreates the runtime
    directory, picks a free TCP port for ZooKeeper, then launches both
    daemons. On any failure the partially-started daemons are killed, the
    log handle is closed, and the exception is re-raised.

    Raises IllegalStateException if already running, if the port is taken,
    or if either daemon fails to start.
    """
    if self.is_running():
        raise IllegalStateException(
            'Cannot start Kafka process. Kafka process is already running')

    if not netutils.is_free(self.port):
        raise IllegalStateException(
            'port %d is already in use, please specify another port with --datastream-port' % self.port)

    self._log_file = self._engine.get_logfile('embedded-kafka', ensure=True)

    # Start from a clean runtime directory every time.
    shutil.rmtree(self._runtime, ignore_errors=True)
    fileutils.makedirs(self._runtime, exist_ok=True)

    zpid, kpid = 0, 0
    log = open(self._log_file, 'w')
    try:
        # ZooKeeper gets an ephemeral free port; Kafka connects to it.
        zookeeper_port = netutils.get_free_tcp_port()

        zpid = self._start_zookeeper(log, zookeeper_port)
        if zpid is None:
            raise IllegalStateException(
                'failed to start zookeeper, check log file for more details: ' + self._log_file)

        kpid = self._start_kafka(log, zookeeper_port)
        if kpid is None:
            raise IllegalStateException(
                'failed to start kafka, check log file for more details: ' + self._log_file)

        self._set_pids(kpid, zpid)
    except:
        # Cleanup on any failure, then re-raise. The original 'success' flag
        # guarding this branch was dead code: it could only become True as
        # the very last statement of the try block, so it was always False
        # whenever an exception reached this handler.
        # NOTE(review): daemon.kill is assumed to tolerate pid values of
        # 0/None for daemons that never started -- confirm its implementation.
        daemon.kill(kpid)
        daemon.kill(zpid)
        log.close()
        raise
def stop(self):
    """Stop the node process and, when present, its embedded Kafka.

    Raises IllegalStateException if the process is not running.
    """
    node_pid = self._get_pid()

    if not self.is_running():
        raise IllegalStateException('process is not running')

    # SIGTERM the node first, waiting up to the class-level timeout.
    daemon.kill(node_pid, ClusterNode.__SIGTERM_TIMEOUT)

    if self._kafka:
        self._kafka.stop()
def _on_fields_injected(self, injector):
    """Wire up all engine components once dependency injection completes.

    Resolves languages (from explicit fields or the stored config),
    instantiates analyzer/cleaner/phrase-table/aligner/LM/preprocessor,
    registers Moses features, and persists the languages back into the
    config. Order matters: features are added to Moses after all models
    are built.

    Raises IllegalStateException when neither the fields nor the config
    provide both source and target languages.
    """
    # Fall back to the persisted config for languages not set explicitly.
    if self.target_lang is None or self.source_lang is None:
        config = self.config
        if config is not None:
            self.target_lang = config.get(self.injector_section, 'target_lang')
            self.source_lang = config.get(self.injector_section, 'source_lang')

    if self.target_lang is None or self.source_lang is None:
        raise IllegalStateException('Engine target language or source language must be specified')

    # Default to the first available LM / aligner implementation.
    if self._lm_type is None:
        self._lm_type = LanguageModel.available_types[0]
    if self._aligner_type is None:
        self._aligner_type = WordAligner.available_types[0]

    self.analyzer = injector.inject(ContextAnalyzer(self._context_index))
    self.cleaner = TMCleaner(self.source_lang, self.target_lang)
    self.pt = injector.inject(SuffixArraysPhraseTable(self._pt_model, (self.source_lang, self.target_lang)))
    self.aligner = injector.inject(
        WordAligner.instantiate(self._aligner_type, self._aligner_model, self.source_lang, self.target_lang)
    )
    self.lm = injector.inject(LanguageModel.instantiate(self._lm_type, self._lm_model))
    self.training_preprocessor = injector.inject(
        TrainingPreprocessor(self.source_lang, self.target_lang, self._vocabulary_model)
    )
    self.db = _DomainMapBuilder(self._db_path, self.source_lang, self.target_lang)

    self.moses = injector.inject(Moses(self._moses_ini_file))
    # Standard Moses features plus the project-specific phrase table and LM.
    self.moses.add_feature(MosesFeature('UnknownWordPenalty'))
    self.moses.add_feature(MosesFeature('WordPenalty'))
    self.moses.add_feature(MosesFeature('Distortion'))
    self.moses.add_feature(MosesFeature('PhrasePenalty'))
    self.moses.add_feature(self.pt, 'Sapt')
    # Lexical reordering is intentionally disabled here:
    # self.moses.add_feature(LexicalReordering(), 'DM0')
    self.moses.add_feature(self.lm, 'InterpolatedLM')

    # Pre-tuned default feature weights; keys match the Moses feature names
    # registered above (with Moses' numeric suffixes for the builtin ones).
    self._optimal_weights = {
        'InterpolatedLM': [0.24759],
        'Sapt': [0.118797, 0.172922, 0.0134384, 0.0143003],
        'Distortion0': [0.197845],
        'WordPenalty0': [-0.217267],
        'PhrasePenalty0': [0.0178411],
    }

    # Persist the resolved languages when a fresh config is created.
    if self._config is None:
        self._config = injector.to_config()
        self._config.set(self.injector_section, 'source_lang', self.source_lang)
        self._config.set(self.injector_section, 'target_lang', self.target_lang)
def start(self):
    """Start the embedded Cassandra daemon.

    Verifies the configured port is free, wipes and recreates the runtime
    directory, then launches the daemon. On failure the partially-started
    daemon is killed, the log handle is closed, and the exception is
    re-raised.

    Raises IllegalStateException if already running, if the port is taken,
    or if Cassandra fails to start.
    """
    if self.is_running():
        raise IllegalStateException(
            'Cannot start Cassandra process. Cassandra process is already running'
        )

    if not netutils.is_free(self.port):
        raise IllegalStateException(
            'port %d is already in use, please specify another port with --db-port' % self.port)

    self._log_file = self._engine.get_logfile('embedded-cassandra', ensure=True)

    # Start from a clean runtime directory every time.
    shutil.rmtree(self._runtime, ignore_errors=True)
    fileutils.makedirs(self._runtime, exist_ok=True)

    cpid = 0
    log = open(self._log_file, 'w')
    try:
        cpid = self._start_cassandra(log)
        if cpid is None:
            raise IllegalStateException(
                'failed to start Cassandra, check log file for more details: ' + self._log_file)

        self._set_pid(cpid)
    except:
        # Cleanup on any failure, then re-raise. The original 'success' flag
        # guarding this branch was dead code: it could only become True as
        # the very last statement of the try block, so it was always False
        # whenever an exception reached this handler.
        # NOTE(review): daemon.kill is assumed to tolerate pid 0 for a
        # daemon that never started -- confirm its implementation.
        daemon.kill(cpid)
        log.close()
        raise
def execute(self, line): if len(line) == 0: return try: translation = self._translate(line) if self._print_nbest is not None: for nbest in translation['nbest']: self._nbest_out.write((u' ||| '.join(self._encode_nbest(nbest))).encode('utf-8')) self._nbest_out.write('\n') print '>>', self._encode_translation(translation) except requests.exceptions.ConnectionError: raise IllegalStateException('connection problem: MMT server not running, start it with "./mmt start"') except requests.exceptions.HTTPError as e: raise Exception('HTTP ERROR: ' + e.message)
def tune(self, corpora=None, debug=False, context_enabled=True):
    """Tune the running MMT engine's feature weights with MERT.

    Pipeline: tokenize the reference corpora, merge source/target sides
    into single corpus files, run the Moses MERT script against the live
    decoder, then parse the optimized weights out of the produced
    moses.ini and push them to the server via the API.

    :param corpora: bilingual dev corpora; defaults to the engine's DEV folder
    :param debug: when True, keep the tuning temp directory for inspection
    :param context_enabled: when False, pass --skip-context-analysis to the decoder
    :raises IllegalArgumentException: when the corpora list is empty
    :raises IllegalStateException: when the MMT server is not running
    """
    if corpora is None:
        corpora = BilingualCorpus.list(
            os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

    if len(corpora) == 0:
        raise IllegalArgumentException('empty corpora')

    if not self.is_running():
        raise IllegalStateException(
            'No MMT Server running, start the engine first')

    tokenizer = Tokenizer()

    target_lang = self.engine.target_lang
    source_lang = self.engine.source_lang

    # Monolingual views of each corpus: source side and reference side.
    source_corpora = [
        BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
        for corpus in corpora
    ]
    reference_corpora = [
        BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
        for corpus in corpora
    ]

    cmdlogger = _tuning_logger(4)
    cmdlogger.start(self, corpora)

    working_dir = self.engine.get_tempdir('tuning')
    mert_wd = os.path.join(working_dir, 'mert')

    try:
        # Tokenization
        tokenized_output = os.path.join(working_dir, 'reference_corpora')
        fileutils.makedirs(tokenized_output, exist_ok=True)

        with cmdlogger.step('Corpora tokenization') as _:
            reference_corpora = tokenizer.process_corpora(
                reference_corpora, tokenized_output)

        # Create merged corpus
        with cmdlogger.step('Merging corpus') as _:
            # source
            source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

            # NOTE(review): this writes corpus.get_file(...) -- which looks
            # like a file *path* -- plus a newline into the merged source
            # file, while the target side below merges actual file contents
            # via fileutils.merge. Either get_file returns line content here
            # or this is a bug; confirm against BilingualCorpus.get_file.
            with open(source_merged_corpus, 'wb') as out:
                for corpus in source_corpora:
                    out.write(corpus.get_file(source_lang) + '\n')

            # target
            target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
            fileutils.merge([
                corpus.get_file(target_lang) for corpus in reference_corpora
            ], target_merged_corpus)

        # Run MERT algorithm
        with cmdlogger.step('Tuning') as _:
            # Start MERT
            decoder_flags = ['--port', str(self.api.port)]
            if not context_enabled:
                decoder_flags.append('--skip-context-analysis')
                decoder_flags.append('1')

            fileutils.makedirs(mert_wd, exist_ok=True)

            # The runtime moses.ini only needs to exist for the script;
            # NamedTemporaryFile removes it automatically afterwards.
            with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                command = [
                    self._mert_script, source_merged_corpus, target_merged_corpus,
                    self._mert_i_script, runtime_moses_ini.name,
                    '--threads', str(multiprocessing.cpu_count()),
                    '--mertdir', cli.BIN_DIR,
                    '--mertargs', '\'--binary --sctype BLEU\'',
                    '--working-dir', mert_wd,
                    '--nbest', '100',
                    '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                    '--nonorm', '--closest', '--no-filter-phrase-table'
                ]

                with open(self.engine.get_logfile('mert'), 'wb') as log:
                    shell.execute(' '.join(command), stdout=log, stderr=log)

        # Read optimized configuration
        with cmdlogger.step('Applying changes') as _:
            bleu_score = 0
            weights = {}
            found_weights = False

            # Parse the MERT-produced moses.ini: BLEU from the '# BLEU'
            # comment, then every line after '[weight]' as 'Name= w1 w2 ...'.
            with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                for line in moses_ini:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    elif found_weights:
                        tokens = line.split()
                        weights[tokens[0].rstrip('=')] = [
                            float(val) for val in tokens[1:]
                        ]
                    elif line.startswith('# BLEU'):
                        bleu_score = float(line.split()[2])
                    elif line == '[weight]':
                        found_weights = True

            _ = self.api.update_features(weights)

        cmdlogger.completed(bleu_score)
    finally:
        if not debug:
            self.engine.clear_tempdir()