Example #1
0
    def _set_pid(self, pid):
        """Store ``pid`` in the file at ``self._pidfile``.

        The folder containing the pid file is created first when it does not
        exist yet; an existing pid file is overwritten.
        """
        pidfile_folder = os.path.abspath(os.path.join(self._pidfile, os.pardir))
        folder_missing = not os.path.isdir(pidfile_folder)
        if folder_missing:
            fileutils.makedirs(pidfile_folder, exist_ok=True)

        with open(self._pidfile, 'w') as stream:
            stream.write(str(pid))
Example #2
0
    def _start_kafka(self, log, port=9092, zookeeper_port=2181):
        """Write a Kafka properties file, launch the broker and wait for its startup line.

        :param log: open file used as the broker's stdout/stderr; it is re-read by
            name to look for the startup message
        :param port: port written to the ``listeners`` property
        :param zookeeper_port: port written to the ``zookeeper.connect`` property
        :return: the broker pid, or None (after killing the broker) when the startup
            message did not appear within four one-second polls
        """
        data_dir = os.path.abspath(os.path.join(self._data, 'kdata'))
        if not os.path.isdir(data_dir):
            fileutils.makedirs(data_dir, exist_ok=True)

        properties_path = os.path.join(self._data, 'kafka.properties')
        properties = [
            'broker.id=0\n',
            'listeners=PLAINTEXT://0.0.0.0:{port}\n'.format(port=port),
            'log.dirs={data}\n'.format(data=data_dir),
            'num.partitions=1\n',
            'log.retention.hours=8760000\n',
            'zookeeper.connect=localhost:{port}\n'.format(port=zookeeper_port),
        ]
        with open(properties_path, 'w') as out:
            for entry in properties:
                out.write(entry)

        pid = subprocess.Popen([self._kafka_bin, properties_path],
                               stdout=log, stderr=log, shell=False).pid

        started_marker = 'INFO [Kafka Server 0], started (kafka.server.KafkaServer)'
        attempts_left = 4
        while attempts_left > 0:
            # Re-scan the whole log on every attempt
            with open(log.name, 'r') as log_reader:
                if any(started_marker in line for line in log_reader):
                    return pid

            time.sleep(1)
            attempts_left -= 1

        daemon.kill(pid)
        return None
Example #3
0
    def _start_process(self):
        """Launch the ``eu.modernmt.cli.ClusterNodeMain`` Java process.

        Ensures the engine runtime folder exists, obtains the node log file from
        the engine, builds the command line from the node settings, removes a
        stale status file and starts the process with stdout discarded and
        stderr redirected to the log file.

        :return: the ``subprocess.Popen`` object of the started process
        """
        runtime_path = self.engine.get_runtime_path()
        if not os.path.isdir(runtime_path):
            fileutils.makedirs(runtime_path, exist_ok=True)
        self._log_file = self.engine.get_logfile(ClusterNode.__LOG_FILENAME, ensure=True)

        args = ['-e', self.engine.name, '-p', str(self._cluster_ports[0]), str(self._cluster_ports[1]),
                '--status-file', self._status_file]

        if self._start_rest_server:
            args += ['-a', str(self._api_port)]

        if self._verbosity is not None:
            args += ['-v', str(self._verbosity)]

        if self._sibling is not None:
            args += ['--member', str(self._sibling)]

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args,
                               hserr_path=os.path.abspath(os.path.join(self._log_file, os.pardir)))

        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        # 'wa' is not a valid open() mode (ValueError on Python 3); plain 'w' is used
        # instead. The null device must be opened for writing to serve as stdout.
        # The child inherits its own copies of both descriptors, so the parent's
        # handles are closed as soon as the process has been spawned.
        with open(self._log_file, 'w') as log, open(os.devnull, 'w') as devnull:
            return subprocess.Popen(command, stdout=devnull, stderr=log, shell=False)
Example #4
0
    def generate(self, bilingual_corpora, monolingual_corpora, output, log_file=None):
        """Run the Java domain-map tool and build the training folder from its output.

        The tool's stdout is read as tab-separated ``domain<TAB>name`` lines and
        turned into a name -> domain mapping, which is handed to
        ``self._make_training_folder`` together with the corpora and ``output``.

        :param log_file: optional path; when given, the tool's stderr is written there
        :return: whatever ``self._make_training_folder`` returns
        """
        fileutils.makedirs(self._model, exist_ok=True)

        args = ['--db', os.path.join(self._model, 'domains.db'), '-l', self._source_lang, '-c']
        # One entry per distinct corpus folder
        args.extend(set([corpus.get_folder() for corpus in bilingual_corpora]))

        command = cli.mmt_javamain(self._java_mainclass, args)

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'w')

            stdout, _ = shell.execute(command, stderr=log)

            rows = [line.rstrip('\n').split('\t', 2) for line in stdout.splitlines()]
            domains = {}
            for domain, name in rows:
                domains[name] = domain

            return self._make_training_folder(bilingual_corpora, monolingual_corpora, domains, output)
        finally:
            if log_file is not None:
                log.close()
Example #5
0
    def _start_zookeeper(self, log, port=2181):
        """Write a ZooKeeper properties file, launch the server and wait until it answers.

        Readiness is probed by sending ``ruok`` to 127.0.0.1:``port`` and
        expecting ``imok`` back, up to four times with a one-second pause
        after each failed probe.

        :param log: open file used as the server's stdout/stderr
        :param port: client port written to the configuration and probed
        :return: the server pid, or None (after killing the server) when no
            ``imok`` reply was received
        """
        zdata = os.path.abspath(os.path.join(self._data, 'zdata'))
        if not os.path.isdir(zdata):
            fileutils.makedirs(zdata, exist_ok=True)

        config = os.path.join(self._data, 'zookeeper.properties')
        with open(config, 'w') as cout:
            cout.write('dataDir={data}\n'.format(data=zdata))
            cout.write('clientPort={port}\n'.format(port=port))
            cout.write('maxClientCnxns=0\n')

        command = [self._zookeeper_bin, config]
        zookeeper = subprocess.Popen(command,
                                     stdout=log,
                                     stderr=log,
                                     shell=False).pid

        for _ in range(1, 5):
            try:
                msg = fileutils.netcat('127.0.0.1', port, 'ruok', timeout=2)
            except Exception:
                # A failed probe just means "not ready yet". Only Exception is
                # caught so KeyboardInterrupt/SystemExit still propagate.
                msg = None

            if 'imok' == msg:
                return zookeeper

            time.sleep(1)

        daemon.kill(zookeeper)
        return None
Example #6
0
    def _start_zookeeper(self, log, port=2181):
        """Configure and launch ZooKeeper, then poll it until it reports healthy.

        The server is probed with the ``ruok`` command on 127.0.0.1:``port``;
        an ``imok`` reply means it is ready. At most four probes are made,
        sleeping one second after each unsuccessful one.

        :param log: open file that receives the server's stdout and stderr
        :param port: client port written to the properties file and probed
        :return: the server pid on success; None after killing the server
            when it never replied ``imok``
        """
        zdata = os.path.abspath(os.path.join(self._data, 'zdata'))
        if not os.path.isdir(zdata):
            fileutils.makedirs(zdata, exist_ok=True)

        config = os.path.join(self._data, 'zookeeper.properties')
        with open(config, 'w') as cout:
            cout.write('dataDir={data}\n'.format(data=zdata))
            cout.write('clientPort={port}\n'.format(port=port))
            cout.write('maxClientCnxns=0\n')

        command = [self._zookeeper_bin, config]
        zookeeper = subprocess.Popen(command, stdout=log, stderr=log, shell=False).pid

        for _ in range(1, 5):
            try:
                msg = fileutils.netcat('127.0.0.1', port, 'ruok', timeout=2)
            except Exception:
                # Treat any probe error as "not ready". Catching Exception rather
                # than everything lets Ctrl-C and SystemExit interrupt the wait.
                msg = None

            if 'imok' == msg:
                return zookeeper

            time.sleep(1)

        daemon.kill(zookeeper)
        return None
Example #7
0
    def train(self, corpora, aligner, working_dir='.', log=None):
        """Symlink the corpora into ``working_dir``, align them and run the build binary.

        :param corpora: corpora providing a file for the source and target language
        :param aligner: object whose ``align`` method is called on the linked corpora
        :param working_dir: folder receiving the symlinks (created if missing)
        :param log: stream for the subprocess output; ``shell.DEVNULL`` when None
        :raises Exception: if the model folder already exists and is not empty
        """
        log = shell.DEVNULL if log is None else log

        if os.path.isdir(self._model):
            if os.listdir(self._model):
                raise Exception('Model already exists at ' + self._model)
        else:
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        # Prepare training folder
        lang_pair = (self._source_lang, self._target_lang)
        linked_corpora = []
        for corpus in corpora:
            linked = BilingualCorpus.make_parallel(corpus.name, working_dir, lang_pair)

            # Resolve both original files first, then link source before target
            originals = [corpus.get_file(lang) for lang in lang_pair]
            for lang, original in zip(lang_pair, originals):
                os.symlink(original, linked.get_file(lang))

            linked_corpora.append(linked)

        # Align corpora
        aligner.align(linked_corpora, working_dir, log=log)

        # Build models
        build_command = [self._build_bin, '--input', working_dir, '--model', self._model,
                         '-s', self._source_lang, '-t', self._target_lang]
        shell.execute(build_command, stdout=log, stderr=log)
Example #8
0
    def train(self, corpora, aligner, working_dir='.', log=None):
        """Rebuild the model from scratch out of the given corpora.

        The model folder is wiped and recreated, the corpora are symlinked
        into ``<working_dir>/corpora``, aligned, the aligner model is exported
        to ``<working_dir>/model.tlex`` and the build binary is executed.

        :param corpora: corpora providing a file for the source and target language
        :param aligner: object exposing ``align`` and ``export``
        :param working_dir: base folder for the intermediate files
        :param log: stream for the subprocess output; ``shell.DEVNULL`` when None
        """
        log = shell.DEVNULL if log is None else log

        # Always start from an empty model folder
        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        corpora_dir = os.path.join(working_dir, 'corpora')
        lexicon_path = os.path.join(working_dir, 'model.tlex')

        if not os.path.isdir(corpora_dir):
            fileutils.makedirs(corpora_dir, exist_ok=True)

        # Prepare training folder
        lang_pair = (self._source_lang, self._target_lang)
        linked_corpora = []
        for corpus in corpora:
            linked = BilingualCorpus.make_parallel(corpus.name, corpora_dir, lang_pair)

            # Resolve both original files first, then link source before target
            originals = [corpus.get_file(lang) for lang in lang_pair]
            for lang, original in zip(lang_pair, originals):
                os.symlink(original, linked.get_file(lang))

            linked_corpora.append(linked)

        # Align corpora
        aligner.align(linked_corpora, corpora_dir, log=log)
        aligner.export(lexicon_path)

        # Build models
        build_command = [self._build_bin, '--lex', lexicon_path, '--input', corpora_dir, '--model', self._model,
                         '-s', self._source_lang, '-t', self._target_lang, '-v', self._vb.model]
        shell.execute(build_command, stdout=log, stderr=log)
Example #9
0
    def build(self, corpora, working_dir='.', log=None):
        """Merge all corpora into one parallel corpus and run the build binary on it.

        The model folder is removed and recreated before building.

        :param corpora: corpora providing a file for the source and target language
        :param working_dir: folder receiving the merged corpus (created if missing)
        :param log: stream for the subprocess output; ``shell.DEVNULL`` when None
        """
        log = shell.DEVNULL if log is None else log

        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        merged_corpus = BilingualCorpus.make_parallel(
            'merge', working_dir, (self._source_lang, self._target_lang))

        # Concatenate the per-corpus files, source language first
        for lang in (self._source_lang, self._target_lang):
            parts = [corpus.get_file(lang) for corpus in corpora]
            fileutils.merge(parts, merged_corpus.get_file(lang))

        command = [self._build_bin]
        command += ['-s', merged_corpus.get_file(self._source_lang)]
        command += ['-t', merged_corpus.get_file(self._target_lang)]
        command += ['-m', self._model, '-I', '4']
        shell.execute(command, stdout=log, stderr=log)
Example #10
0
    def train(self, corpora, aligner, working_dir='.', log_file=None):
        """Link and align every corpus in ``working_dir``, then run the build binary.

        :param corpora: corpora providing a file for the source and target language
        :param aligner: object whose ``align`` method is called once per corpus with
            the output path ``<working_dir>/<corpus name>.align``
        :param working_dir: folder receiving symlinks and alignments (created if missing)
        :param log_file: optional path; when given, build output is appended to it
        :raises Exception: if the model folder already exists and is not empty
        """
        if os.path.isdir(self._model):
            if os.listdir(self._model):
                raise Exception('Model already exists at ' + self._model)
        else:
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Prepare training folder
            lang_pair = (self._source_lang, self._target_lang)
            for corpus in corpora:
                linked = BilingualCorpus.make_parallel(corpus.name, working_dir, lang_pair)

                # Resolve both original files first, then link source before target
                originals = [corpus.get_file(lang) for lang in lang_pair]
                for lang, original in zip(lang_pair, originals):
                    os.symlink(original, linked.get_file(lang))

                alignment_path = os.path.join(working_dir, corpus.name + '.align')
                aligner.align(corpus, alignment_path)

            # Build models
            build_command = [self._build_bin, '--input', working_dir, '--model', self._model,
                             '-s', self._source_lang, '-t', self._target_lang]
            shell.execute(build_command, stdout=log, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Example #11
0
    def build(self, corpora, working_dir='.', log_file=None):
        """Merge the corpora into a single parallel corpus and train the model on it.

        :param corpora: corpora providing a file for the source and target language
        :param working_dir: folder receiving the merged corpus (created if missing)
        :param log_file: optional path; when given, the build's stderr is appended to it
        """
        for folder in (working_dir, self._model):
            if not os.path.isdir(folder):
                fileutils.makedirs(folder, exist_ok=True)

        merged_corpus = BilingualCorpus.make_parallel(
            'merge', working_dir, (self._source_lang, self._target_lang))

        # Concatenate the per-corpus files, source language first
        for lang in (self._source_lang, self._target_lang):
            parts = [corpus.get_file(lang) for corpus in corpora]
            fileutils.merge(parts, merged_corpus.get_file(lang))

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Train model
            command = [self._build_bin]
            command += ['-s', merged_corpus.get_file(self._source_lang)]
            command += ['-t', merged_corpus.get_file(self._target_lang)]
            command += ['-m', self._model, '-I', '4']
            shell.execute(command, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Example #12
0
    def _start_cassandra(self, log):
        """Launch Cassandra in the foreground and wait until it accepts CQL clients.

        A runtime copy of the configuration is produced through
        ``self._yaml_transform`` and passed to the Cassandra binary. The log is
        then re-read once per second, for at most 99 polls, looking for the
        'Starting listening for CQL clients' line.

        :param log: open file used as Cassandra's stdout/stderr; it is re-read
            by name to detect the startup message
        :return: the Cassandra pid, or None when the startup message never
            appeared (the process is terminated in that case)
        """
        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        # create a runtime version of the configuration file
        config = os.path.join(self._runtime, 'cassandra.yaml')

        self._yaml_transform(config)

        # NOTE(review): this dict replaces the whole child environment (no PATH,
        # JAVA_HOME, ...). Kept as-is; confirm this is intended.
        env = {"CASSANDRA_JMX_PORT": str(netutils.get_free_tcp_port())}

        # launch cassandra -d _runtime
        command = [
            self._cassandra_bin, '-R', '-Dcassandra.config=file:///' + config,
            "-f"
        ]

        process = subprocess.Popen(command,
                                   stdout=log,
                                   stderr=log,
                                   shell=False,
                                   env=env)

        for _ in range(1, 100):
            with open(log.name, 'r') as rlog:
                for line in rlog:
                    if 'Starting listening for CQL clients' in line:
                        return process.pid

            time.sleep(1)

        # Startup timed out: stop the process instead of leaving it orphaned,
        # since the caller only receives None and could not kill it itself.
        try:
            process.terminate()
        except OSError:
            # The process is already gone; nothing left to stop.
            pass
        return None
Example #13
0
    def build(self, corpora, working_dir='.', log_file=None):
        """Concatenate all corpora into one parallel corpus and run the build binary.

        :param corpora: corpora providing a file for the source and target language
        :param working_dir: folder receiving the merged corpus (created if missing)
        :param log_file: optional path; when given, the build's stderr is appended to it
        """
        for folder in (working_dir, self._model):
            if not os.path.isdir(folder):
                fileutils.makedirs(folder, exist_ok=True)

        merged_corpus = BilingualCorpus.make_parallel('merge', working_dir, (self._source_lang, self._target_lang))

        # Source language is merged before the target language
        for lang in (self._source_lang, self._target_lang):
            parts = [corpus.get_file(lang) for corpus in corpora]
            fileutils.merge(parts, merged_corpus.get_file(lang))

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Train model
            command = [self._build_bin]
            command += ['-s', merged_corpus.get_file(self._source_lang)]
            command += ['-t', merged_corpus.get_file(self._target_lang)]
            command += ['-m', self._model, '-I', '4']
            shell.execute(command, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Example #14
0
    def _set_pid(self, pid):
        """Write ``pid`` as text into ``self._pidfile``, replacing previous content.

        The parent folder of the pid file is created when it is missing.
        """
        containing_dir = os.path.abspath(os.path.join(self._pidfile, os.pardir))
        dir_exists = os.path.isdir(containing_dir)
        if not dir_exists:
            fileutils.makedirs(containing_dir, exist_ok=True)

        with open(self._pidfile, 'w') as handle:
            handle.write(str(pid))
Example #15
0
    def generate(self, bilingual_corpora, monolingual_corpora, output, log_file=None):
        """Compute the domain map with the Java tool and create the training folder.

        Each stdout line of the tool is split on tabs into ``domain`` and
        ``name``; the resulting name -> domain mapping is passed to
        ``self._make_training_folder`` along with the corpora and ``output``.

        :param log_file: optional path; when given, the tool's stderr is written there
        :return: the result of ``self._make_training_folder``
        """
        fileutils.makedirs(self._model, exist_ok=True)

        args = ['--db', os.path.join(self._model, 'domains.db'), '-l', self._source_lang, '-c']
        # Each distinct corpus folder is passed once
        args.extend(set([corpus.get_folder() for corpus in bilingual_corpora]))

        command = cli.mmt_javamain(self._java_mainclass, args)

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'w')

            stdout, _ = shell.execute(command, stderr=log)

            rows = [line.rstrip('\n').split('\t', 2) for line in stdout.splitlines()]
            domains = {}
            for domain, name in rows:
                domains[name] = domain

            return self._make_training_folder(bilingual_corpora, monolingual_corpora, domains, output)
        finally:
            if log_file is not None:
                log.close()
Example #16
0
 def _get_tempdir(self, name, delete_if_exists=False):
     """Return the folder ``name`` under ``self._temp_dir``, creating it if needed.

     When ``delete_if_exists`` is true any existing folder is removed first
     (removal errors are ignored).
     """
     tempdir = os.path.join(self._temp_dir, name)
     if delete_if_exists:
         shutil.rmtree(tempdir, ignore_errors=True)
     if os.path.isdir(tempdir):
         return tempdir
     fileutils.makedirs(tempdir, exist_ok=True)
     return tempdir
Example #17
0
    def _start_kafka(self, log, port=9092, zookeeper_port=2181):
        """Generate the Kafka configuration, start the broker and wait for it to come up.

        :param log: open file used as the broker's stdout/stderr; it is re-read by
            name to detect the startup message
        :param port: port written to the ``listeners`` property
        :param zookeeper_port: port written to the ``zookeeper.connect`` property
        :return: the broker pid, or None (after killing the broker) when the startup
            message was not found within four one-second polls
        """
        data_dir = os.path.abspath(os.path.join(self._data, 'kdata'))
        if not os.path.isdir(data_dir):
            fileutils.makedirs(data_dir, exist_ok=True)

        properties_path = os.path.join(self._data, 'kafka.properties')
        properties = [
            'broker.id=0\n',
            'listeners=PLAINTEXT://0.0.0.0:{port}\n'.format(port=port),
            'log.dirs={data}\n'.format(data=data_dir),
            'num.partitions=1\n',
            'log.retention.hours=8760000\n',
            'zookeeper.connect=localhost:{port}\n'.format(port=zookeeper_port),
        ]
        with open(properties_path, 'w') as out:
            for entry in properties:
                out.write(entry)

        pid = subprocess.Popen([self._kafka_bin, properties_path], stdout=log, stderr=log, shell=False).pid

        started_marker = 'INFO [Kafka Server 0], started (kafka.server.KafkaServer)'
        attempts_left = 4
        while attempts_left > 0:
            # The complete log is scanned again on each attempt
            with open(log.name, 'r') as log_reader:
                if any(started_marker in line for line in log_reader):
                    return pid

            time.sleep(1)
            attempts_left -= 1

        daemon.kill(pid)
        return None
Example #18
0
    def __process_file(self, source, dest, lang, print_tags=True, print_placeholders=False, original_spacing=False):
        """Pipe the file ``source`` through the processing command into ``dest``.

        The command is obtained from ``self.__get_command``; its stdin is the
        source file, its stdout the ``lang`` file of ``dest`` and its stderr is
        discarded. The destination folder is created when missing.
        """
        command = self.__get_command(lang, print_tags, print_placeholders, original_spacing)

        dest_folder = dest.get_folder()
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        with open(source) as reader, open(dest.get_file(lang), 'w') as writer:
            shell.execute(command, stdin=reader, stdout=writer, stderr=shell.DEVNULL)
Example #19
0
    def train(self, corpora, lang, working_dir='.', log=None):
        """Train the static and the adaptive language model for ``lang``.

        The model folder is wiped and recreated. The static model is trained on
        the concatenation of all corpora; the adaptive model is trained on
        symlinks to the corpora that have more than one language.

        :param corpora: corpora providing a file for ``lang``
        :param lang: language whose files are used for training
        :param working_dir: folder for intermediate files (created if missing)
        :param log: stream for the subprocess output; ``shell.DEVNULL`` when None
        """
        log = shell.DEVNULL if log is None else log

        bicorpora = [corpus for corpus in corpora if len(corpus.langs) > 1]

        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        # Train static LM
        static_lm_model = os.path.join(self._model, 'background.slm')
        static_lm_wdir = os.path.join(working_dir, 'slm.temp')

        fileutils.makedirs(static_lm_wdir, exist_ok=True)

        merged_corpus = os.path.join(working_dir, 'merged_corpus')
        lang_files = [corpus.get_file(lang) for corpus in corpora]
        fileutils.merge(lang_files, merged_corpus)

        slm_command = [self._create_slm_bin, '--discount-fallback',
                       '-o', str(self._order),
                       '-a', str(self._compression),
                       '-q', str(self._quantization),
                       '--type', 'trie',
                       '--model', static_lm_model,
                       '-T', static_lm_wdir]
        if self._order > 2 and self._prune:
            slm_command.extend(['--prune', '0', '1', '2'])

        # The merged corpus is fed to the trainer on its standard input
        with open(merged_corpus) as merged_stream:
            shell.execute(slm_command, stdin=merged_stream, stdout=log, stderr=log)

        # Create AdaptiveLM training folder
        alm_train_folder = os.path.join(working_dir, 'alm_train')
        fileutils.makedirs(alm_train_folder, exist_ok=True)

        for corpus in bicorpora:
            link_path = os.path.join(alm_train_folder, corpus.name + '.' + lang)
            os.symlink(corpus.get_file(lang), link_path)

        # Train adaptive LM
        adaptive_lm_model = os.path.join(self._model, 'foreground.alm')
        fileutils.makedirs(adaptive_lm_model, exist_ok=True)

        alm_command = [self._create_alm_bin, '-m', adaptive_lm_model,
                       '-i', alm_train_folder, '-b', '50000000']
        shell.execute(alm_command, stdout=log, stderr=log)
Example #20
0
File: lm.py Project: kmlx/MMT
    def train(self, corpora, lang, working_dir='.', log_file=None):
        """Check that no model file exists yet and prepare the folders for training.

        Creates the parent folder of ``self._model`` and ``working_dir`` when
        they are missing.

        :raises Exception: if ``self._model`` already exists as a file
        """
        if os.path.isfile(self._model):
            raise Exception('Model already exists at ' + self._model)

        model_folder = os.path.abspath(os.path.join(self._model, os.pardir))
        for folder in (model_folder, working_dir):
            if not os.path.isdir(folder):
                fileutils.makedirs(folder, exist_ok=True)
Example #21
0
    def train(self, corpora, lang, working_dir='.', log_file=None):
        """Refuse to overwrite an existing model file and ensure the needed folders exist.

        The folder containing ``self._model`` and ``working_dir`` are created
        when they do not exist.

        :raises Exception: if ``self._model`` already exists as a file
        """
        if os.path.isfile(self._model):
            raise Exception('Model already exists at ' + self._model)

        model_folder = os.path.abspath(os.path.join(self._model, os.pardir))
        for folder in (model_folder, working_dir):
            if not os.path.isdir(folder):
                fileutils.makedirs(folder, exist_ok=True)
Example #22
0
    def _clean_file(self, source, dest_folder, langs):
        """Run the Perl cleaner script on one corpus, writing into ``dest_folder``.

        :param source: corpus exposing ``get_folder()`` and ``name``
        :param dest_folder: output folder (created if missing)
        :param langs: sequence whose first two items are passed to the script
        """
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        input_prefix = os.path.join(source.get_folder(), source.name)
        output_prefix = os.path.join(dest_folder, source.name)

        command = ['perl', self._cleaner_script, '-ratio', str(self._ratio)]
        command += [input_prefix, langs[0], langs[1], output_prefix]
        command += [str(self._min), str(self._max)]
        # The script's output is discarded on both streams
        shell.execute(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL)
Example #23
0
    def get_logfile(self, name, ensure=True):
        """Return the path ``<logs folder>/<name>.log``.

        When ``ensure`` is true the logs folder is created if missing and an
        existing log file with that name is deleted.
        """
        logs_dir = self._logs_path
        if ensure:
            if not os.path.isdir(logs_dir):
                fileutils.makedirs(logs_dir, exist_ok=True)

        logfile = os.path.join(logs_dir, name + '.log')

        stale = ensure and os.path.isfile(logfile)
        if stale:
            os.remove(logfile)

        return logfile
Example #24
0
    def _clean_file(self, source, dest_folder, langs):
        """Invoke the Perl cleaner script for a single corpus.

        :param source: corpus exposing ``get_folder()`` and ``name``
        :param dest_folder: output folder (created if missing)
        :param langs: sequence whose first two items are passed to the script
        """
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        input_prefix = os.path.join(source.get_folder(), source.name)
        output_prefix = os.path.join(dest_folder, source.name)

        command = ['perl', self._cleaner_script, '-ratio', str(self._ratio)]
        command += [input_prefix, langs[0], langs[1], output_prefix]
        command += [str(self._min), str(self._max)]
        # Neither stdout nor stderr of the script is kept
        shell.execute(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL)
Example #25
0
    def get_logfile(self, name, ensure=True, append=False):
        """Return the path ``<logs folder>/<name>.log``.

        When ``ensure`` is true the logs folder is created if missing and,
        unless ``append`` is true, an existing log file with that name is
        deleted.
        """
        logs_dir = self.logs_path
        if ensure:
            if not os.path.isdir(logs_dir):
                fileutils.makedirs(logs_dir, exist_ok=True)

        logfile = os.path.join(logs_dir, name + '.log')

        start_fresh = ensure and not append
        if start_fresh and os.path.isfile(logfile):
            os.remove(logfile)

        return logfile
Example #26
0
    def get_tempdir(self, name, ensure=True):
        """Return the path of the temporary folder ``name``.

        When ``ensure`` is true the base temp folder is created if missing and
        the requested folder is deleted (errors ignored) and recreated empty.
        """
        base = self._temp_path
        if ensure:
            if not os.path.isdir(base):
                fileutils.makedirs(base, exist_ok=True)

        folder = os.path.join(base, name)
        if not ensure:
            return folder

        # Start from a fresh, empty folder
        shutil.rmtree(folder, ignore_errors=True)
        os.makedirs(folder)
        return folder
Example #27
0
    def get_tempdir(self, name, ensure=True):
        """Return the path of the temporary folder ``name``.

        When ``ensure`` is true the base temp folder is created if missing and
        the requested folder is deleted (errors ignored) and recreated empty.
        """
        base = self.temp_path
        if ensure:
            if not os.path.isdir(base):
                fileutils.makedirs(base, exist_ok=True)

        folder = os.path.join(base, name)
        if not ensure:
            return folder

        # Start from a fresh, empty folder
        shutil.rmtree(folder, ignore_errors=True)
        os.makedirs(folder)
        return folder
Example #28
0
    def process_corpora(self, corpora, dest_folder):
        """Process every language file of every corpus into ``dest_folder``.

        :return: the result of ``BilingualCorpus.list(dest_folder)``
        """
        fileutils.makedirs(dest_folder, exist_ok=True)

        # Lazily enumerate (corpus, language) pairs in corpus order
        jobs = ((corpus, lang) for corpus in corpora for lang in corpus.langs)
        for corpus, lang in jobs:
            source_file = corpus.get_file(lang)
            target = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])
            self.process_file(source_file, target, lang)

        return BilingualCorpus.list(dest_folder)
Example #29
0
    def process_corpora(self, corpora, dest_folder):
        """Run ``process_file`` on each language file of each corpus.

        Output corpora are created in ``dest_folder``, which is created first.

        :return: the result of ``BilingualCorpus.list(dest_folder)``
        """
        fileutils.makedirs(dest_folder, exist_ok=True)

        # Lazily enumerate (corpus, language) pairs in corpus order
        jobs = ((corpus, lang) for corpus in corpora for lang in corpus.langs)
        for corpus, lang in jobs:
            source_file = corpus.get_file(lang)
            target = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])
            self.process_file(source_file, target, lang)

        return BilingualCorpus.list(dest_folder)
Example #30
0
    def encode(self, corpora, dest_folder):
        """Encode every language file of every corpus into ``dest_folder``.

        Each file is passed to ``encode_file`` with ``delete_nl=True``.

        :return: the result of ``BilingualCorpus.list(dest_folder)``
        """
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        # Lazily enumerate (corpus, language) pairs in corpus order
        jobs = ((corpus, lang) for corpus in corpora for lang in corpus.langs)
        for corpus, lang in jobs:
            source_file = corpus.get_file(lang)
            target = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])
            self.encode_file(source_file, target.get_file(lang), delete_nl=True)

        return BilingualCorpus.list(dest_folder)
Example #31
0
    def encode(self, corpora, dest_folder):
        """Run ``encode_file`` (with ``delete_nl=True``) on all corpus files.

        The encoded files are placed in ``dest_folder``, created when missing.

        :return: the result of ``BilingualCorpus.list(dest_folder)``
        """
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        # Lazily enumerate (corpus, language) pairs in corpus order
        jobs = ((corpus, lang) for corpus in corpora for lang in corpus.langs)
        for corpus, lang in jobs:
            source_file = corpus.get_file(lang)
            target = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])
            self.encode_file(source_file, target.get_file(lang), delete_nl=True)

        return BilingualCorpus.list(dest_folder)
Example #32
0
    def process_corpora(self, corpora, output_folder):
        """Process each corpus, language by language, into ``output_folder``.

        One output corpus covering all of the corpus' languages is created per
        input corpus.

        :return: the result of ``BilingualCorpus.list(output_folder)``
        """
        fileutils.makedirs(output_folder, exist_ok=True)

        for corpus in corpora:
            processed = BilingualCorpus.make_parallel(corpus.name, output_folder, corpus.langs)

            for lang in corpus.langs:
                source_file = corpus.get_file(lang)
                self.process_file(source_file, processed.get_file(lang), lang)

        return BilingualCorpus.list(output_folder)
Example #33
0
    def train(self, corpora, lang, working_dir='.', log_file=None):
        """Train the static and the adaptive language model for ``lang``.

        Delegates first to ``LanguageModel.train``, then trains the static
        model on the concatenation of all corpora and the adaptive model on
        symlinks to the corpora that have more than one language.

        :param corpora: corpora providing a file for ``lang``
        :param lang: language whose files are used for training
        :param working_dir: folder for intermediate files
        :param log_file: optional path; when given, subprocess output is written there
        """
        LanguageModel.train(self, corpora, lang, working_dir, log_file)

        bicorpora = [corpus for corpus in corpora if len(corpus.langs) > 1]

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'w')

            fileutils.makedirs(self._model, exist_ok=True)

            # Train static LM
            static_lm_model = os.path.join(self._model, 'background.slm')
            static_lm_wdir = os.path.join(working_dir, 'slm.temp')

            fileutils.makedirs(static_lm_wdir, exist_ok=True)

            merged_corpus = os.path.join(working_dir, 'merged_corpus')
            lang_files = [corpus.get_file(lang) for corpus in corpora]
            fileutils.merge(lang_files, merged_corpus)

            slm_command = [self._create_slm_bin, '--discount_fallback',
                           '-o', str(self._order),
                           '--model', static_lm_model,
                           '-S', str(KenLM.get_mem_percent()) + '%',
                           '-T', static_lm_wdir]
            if self._order > 2 and self.prune:
                slm_command.extend(['--prune', '0', '0', '1'])

            # The merged corpus is fed to the trainer on its standard input
            with open(merged_corpus) as merged_stream:
                shell.execute(slm_command, stdin=merged_stream, stdout=log, stderr=log)

            # Create AdaptiveLM training folder
            alm_train_folder = os.path.join(working_dir, 'alm_train')
            fileutils.makedirs(alm_train_folder, exist_ok=True)

            for corpus in bicorpora:
                link_path = os.path.join(alm_train_folder, corpus.name + '.' + lang)
                os.symlink(corpus.get_file(lang), link_path)

            # Train adaptive LM
            adaptive_lm_model = os.path.join(self._model, 'foreground.alm')
            fileutils.makedirs(adaptive_lm_model, exist_ok=True)

            alm_command = [self._create_alm_bin, '-m', adaptive_lm_model,
                           '-i', alm_train_folder, '-b', '100000000']
            shell.execute(alm_command, stdout=log, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Example #34
0
    def _start_process(self, api_port, cluster_port, datastream_port, db_port,
                       leader, verbosity):
        """Launch the ``eu.modernmt.cli.ClusterNodeMain`` Java process.

        Every argument that is not None is forwarded as the matching
        command-line option. The folder of ``self._log_file`` is passed both as
        ``--logs`` and as ``hserr_path``. A stale status file is removed before
        the process is started with stdout and stderr discarded.

        :return: the ``subprocess.Popen`` object of the started process
        """
        runtime_path = self.engine.runtime_path
        if not os.path.isdir(runtime_path):
            fileutils.makedirs(runtime_path, exist_ok=True)
        logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))

        args = ['-e', self.engine.name, '--status-file', self._status_file, '--logs', logs_folder]

        # Optional settings whose values are converted to strings
        optional_settings = (
            ('--cluster-port', cluster_port),
            ('--api-port', api_port),
            ('--datastream-port', datastream_port),
            ('--db-port', db_port),
            ('-v', verbosity),
        )
        for flag, value in optional_settings:
            if value is not None:
                args.extend([flag, str(value)])

        # The leader is forwarded unchanged (no string conversion)
        if leader is not None:
            args.extend(['--leader', leader])

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args, hserr_path=logs_folder)

        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        return subprocess.Popen(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL, shell=False)
Example #35
0
    def start(self):
        """Start the embedded ZooKeeper and Kafka processes.

        The runtime folder is wiped and recreated, ZooKeeper is started on a
        free TCP port, then Kafka is started and both pids are recorded via
        ``self._set_pids``. If anything fails, every process that was actually
        started is killed and the error is re-raised.

        :raises IllegalStateException: if Kafka is already running, if
            ``self.port`` is in use, or if ZooKeeper or Kafka fail to start
        """
        if self.is_running():
            raise IllegalStateException(
                'Cannot start Kafka process. Kafka process is already running')

        if not netutils.is_free(self.port):
            raise IllegalStateException(
                'port %d is already in use, please specify another port with --datastream-port'
                % self.port)

        self._log_file = self._engine.get_logfile('embedded-kafka',
                                                  ensure=True)

        shutil.rmtree(self._runtime, ignore_errors=True)
        fileutils.makedirs(self._runtime, exist_ok=True)

        # None means "not started". The previous placeholder 0 was passed to
        # daemon.kill() on failure, and pid 0 must never be signalled.
        zpid, kpid = None, None

        # The children inherit their own copies of the log descriptor, so the
        # parent's handle can be closed on success as well as on failure.
        with open(self._log_file, 'w') as log:
            try:
                zookeeper_port = netutils.get_free_tcp_port()

                zpid = self._start_zookeeper(log, zookeeper_port)
                if zpid is None:
                    raise IllegalStateException(
                        'failed to start zookeeper, check log file for more details: '
                        + self._log_file)

                kpid = self._start_kafka(log, zookeeper_port)
                if kpid is None:
                    raise IllegalStateException(
                        'failed to start kafka, check log file for more details: '
                        + self._log_file)

                self._set_pids(kpid, zpid)
            except:
                # Bare except is deliberate: cleanup must also run on
                # KeyboardInterrupt, and the error is always re-raised.
                if kpid:
                    daemon.kill(kpid)
                if zpid:
                    daemon.kill(zpid)
                raise
Example #36
0
    def create_index(self, corpora, log=None):
        """Recreate the index folder and run the Java indexer on the corpora folders.

        :param corpora: corpora exposing ``get_folder()``; each distinct folder
            is passed once to the indexer
        :param log: stream for the subprocess output; ``shell.DEVNULL`` when None
        """
        log = shell.DEVNULL if log is None else log

        corpus_folders = set()
        for corpus in corpora:
            corpus_folders.add(corpus.get_folder())

        # Start from an empty index folder
        shutil.rmtree(self._index, ignore_errors=True)
        fileutils.makedirs(self._index, exist_ok=True)

        args = ['-s', self._source_lang, '-t', self._target_lang, '-i', self._index, '-c']
        args.extend(corpus_folders)

        shell.execute(mmt_javamain(self._java_mainclass, args), stdout=log, stderr=log)
Example #37
0
    def generate(self,
                 bilingual_corpora,
                 monolingual_corpora,
                 output,
                 log=None):
        """Register the bilingual corpora as domains and symlink all corpora.

        Runs ``self._java_mainclass`` on the folders of the bilingual corpora,
        parses its stdout as tab-separated ``<domain>\\t<name>`` lines, then
        symlinks every corpus into ``output``. Bilingual corpora are renamed
        to the domain associated with their name.

        Args:
            bilingual_corpora: corpora exposing ``get_folder()``, ``name`` and
                ``symlink()``.
            monolingual_corpora: corpora exposing ``symlink()``.
            output: destination folder for the symlinks.
            log: stream for the command's stderr; defaults to
                ``shell.DEVNULL`` when ``None``.

        Returns:
            A ``(bilingual_corpora, monolingual_corpora)`` tuple of the
            values returned by ``symlink()``.

        Raises:
            KeyError: if a bilingual corpus name is missing from the command
                output.
        """
        if log is None:
            log = shell.DEVNULL

        fileutils.makedirs(self._model, exist_ok=True)

        args = [
            '--db',
            os.path.join(self._model, 'domains.db'), '-s', self._source_lang,
            '-t', self._target_lang, '-c'
        ]

        source_paths = set(
            corpus.get_folder() for corpus in bilingual_corpora)
        args.extend(source_paths)

        command = cli.mmt_javamain(self._java_mainclass, args)
        stdout, _ = shell.execute(command, stderr=log)

        domains = {}

        for line in stdout.splitlines():
            # Tolerate blank lines in the tool output.
            if not line.strip():
                continue
            # Split on the first tab only: maxsplit=2 could yield three
            # fields and break the two-name unpacking when the corpus name
            # itself contains a tab.
            domain, name = line.split('\t', 1)
            domains[name] = domain

        bilingual_corpora = [
            corpus.symlink(output, name=domains[corpus.name])
            for corpus in bilingual_corpora
        ]
        monolingual_corpora = [
            corpus.symlink(output) for corpus in monolingual_corpora
        ]

        return bilingual_corpora, monolingual_corpora
Example #38
0
    def train(self, corpora, aligner, working_dir='.', log_file=None):
        """Build the model in ``self._model`` from the given corpora.

        Each corpus is symlinked into ``working_dir`` as a parallel corpus
        and aligned with ``aligner`` (output: ``<name>.align`` in
        ``working_dir``); then ``self._build_bin`` is run on the folder.

        Args:
            corpora: corpora exposing ``name`` and ``get_file(lang)``.
            aligner: object exposing ``align(corpus, output_file)``.
            working_dir: folder for the intermediate files; created if
                missing.
            log_file: optional path; the build command's output is appended
                to it. When ``None`` the output goes to ``shell.DEVNULL``.

        Raises:
            Exception: if ``self._model`` already exists and is not empty.
        """
        if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
            raise Exception('Model already exists at ' + self._model)

        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        # Open the log before entering the try block: if open() fails, the
        # cleanup below must not run, otherwise it would call close() on the
        # shared shell.DEVNULL object instead of on a file we own.
        log = shell.DEVNULL if log_file is None else open(log_file, 'a')

        try:
            # Prepare training folder
            for corpus in corpora:
                dest_corpus = BilingualCorpus.make_parallel(
                    corpus.name, working_dir,
                    (self._source_lang, self._target_lang))
                source_file = corpus.get_file(self._source_lang)
                target_file = corpus.get_file(self._target_lang)

                os.symlink(source_file,
                           dest_corpus.get_file(self._source_lang))
                os.symlink(target_file,
                           dest_corpus.get_file(self._target_lang))

                aligner.align(
                    corpus, os.path.join(working_dir, corpus.name + '.align'))

            # Build models
            command = [
                self._build_bin, '--input', working_dir, '--model',
                self._model, '-s', self._source_lang, '-t', self._target_lang
            ]
            shell.execute(command, stdout=log, stderr=log)
        finally:
            # Only close a file opened here, never the shared DEVNULL.
            if log_file is not None:
                log.close()
Example #39
0
    def _start_process(self):
        """Launch the cluster node JVM and return its process handle.

        Ensures the engine runtime folder exists, resolves the node log file,
        builds the command line from the configured ports and options,
        removes any stale status file and spawns the process. The process'
        stderr goes to the node log file and its stdout is discarded.

        Returns:
            subprocess.Popen: handle of the spawned process.
        """
        if not os.path.isdir(self.engine.get_runtime_path()):
            fileutils.makedirs(self.engine.get_runtime_path(), exist_ok=True)
        self._log_file = self.engine.get_logfile(ClusterNode.__LOG_FILENAME,
                                                 ensure=True)

        args = [
            '-e', self.engine.name, '-p',
            str(self._cluster_ports[0]),
            str(self._cluster_ports[1]), '--status-file', self._status_file
        ]

        if self._start_rest_server:
            args.append('-a')
            args.append(str(self._api_port))

        if self._verbosity is not None:
            args.append('-v')
            args.append(str(self._verbosity))

        if self._sibling is not None:
            args.append('--member')
            args.append(str(self._sibling))

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain',
                               args,
                               hserr_path=os.path.abspath(
                                   os.path.join(self._log_file, os.pardir)))

        # 'wa' is not a valid open() mode (Python 3 rejects it with
        # ValueError). Plain 'w' keeps truncate-on-start semantics.
        # NOTE(review): switch to 'a' if the log is meant to be preserved
        # across restarts.
        with open(self._log_file, 'w') as log:
            if os.path.isfile(self._status_file):
                os.remove(self._status_file)

            # The child's stdout must be writable: open(os.devnull) alone
            # yields a read-only handle. The child inherits its own copies
            # of both descriptors, so the parent handles are closed as soon
            # as Popen returns instead of being leaked.
            with open(os.devnull, 'w') as devnull:
                return subprocess.Popen(command,
                                        stdout=devnull,
                                        stderr=log,
                                        shell=False)
Example #40
0
    def create_index(self, corpora, lang, log_file=None):
        """Build the index at ``self._index`` for one language.

        Runs ``self._java_mainclass`` with the language, the index path and
        the distinct folders of the given corpora.

        Args:
            corpora: iterable of objects exposing ``get_folder()``.
            lang: language code passed to the indexer via ``-l``.
            log_file: optional path; it is truncated and receives the
                command's stdout and stderr. When ``None`` the output goes
                to ``shell.DEVNULL``.
        """
        source_paths = set(corpus.get_folder() for corpus in corpora)

        fileutils.makedirs(self._index, exist_ok=True)

        args = ['-l', lang, '-i', self._index, '-c']
        args.extend(source_paths)

        command = mmt_javamain(self._java_mainclass, args)

        # Open the log before the try block: if open() fails, the cleanup
        # must not run, otherwise it would close the shared shell.DEVNULL.
        log = shell.DEVNULL if log_file is None else open(log_file, 'w')

        try:
            shell.execute(command, stdout=log, stderr=log)
        finally:
            # Only close a file opened here, never the shared DEVNULL.
            if log_file is not None:
                log.close()
Example #41
0
    def start(self):
        """Start the embedded Cassandra process.

        The runtime folder is removed and recreated, Cassandra is launched
        with its output going to the 'embedded-cassandra' log file, and its
        pid is stored via ``_set_pid``. If a step fails, the process (when
        one was started) is killed and the error is re-raised.

        Raises:
            IllegalStateException: if Cassandra is already running, if
                ``self.port`` is not free, or if Cassandra fails to start.
        """
        if self.is_running():
            raise IllegalStateException(
                'Cannot start Cassandra process. Cassandra process is already running'
            )

        if not netutils.is_free(self.port):
            raise IllegalStateException(
                'port %d is already in use, please specify another port with --db-port'
                % self.port)

        self._log_file = self._engine.get_logfile('embedded-cassandra',
                                                  ensure=True)

        shutil.rmtree(self._runtime, ignore_errors=True)
        fileutils.makedirs(self._runtime, exist_ok=True)

        # 0 means "not started yet"; _start_cassandra returns None on failure.
        cpid = 0

        # The handle is only needed while the child is being spawned, so it
        # is released on every path (previously it leaked on success).
        with open(self._log_file, 'w') as log:
            try:
                cpid = self._start_cassandra(log)

                if cpid is None:
                    raise IllegalStateException(
                        'failed to start Cassandra, check log file for more details: '
                        + self._log_file)
                self._set_pid(cpid)
            except BaseException:
                # Roll back only a process that was actually started.
                # Placeholder values (0 / None) are skipped so that no
                # signal is sent to a pid that was never ours.
                # NOTE(review): assumes daemon.kill() treats its argument as
                # an OS pid -- confirm.
                if cpid:
                    daemon.kill(cpid)
                raise
Example #42
0
    def _start_process(self):
        """Spawn the cluster node JVM and return its ``subprocess.Popen``.

        Makes sure the engine runtime folder exists, assembles the command
        line (cluster ports, datastream port, status file, logs folder and
        the optional REST/verbosity/member flags), deletes a stale status
        file and launches the process with stdout and stderr sent to
        ``shell.DEVNULL``.
        """
        if not os.path.isdir(self.engine.runtime_path):
            fileutils.makedirs(self.engine.runtime_path, exist_ok=True)

        # Folder that contains the node log file.
        logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))

        first_port, second_port = (str(self._cluster_ports[0]),
                                   str(self._cluster_ports[1]))
        args = ['-e', self.engine.name,
                '-p', first_port, second_port,
                '--datastream-port', str(self._datastream_port),
                '--status-file', self._status_file,
                '--logs', logs_folder]

        # Optional flags, in the same order as always.
        if self._start_rest_server:
            args.extend(['-a', str(self._api_port)])
        if self._verbosity is not None:
            args.extend(['-v', str(self._verbosity)])
        if self._sibling is not None:
            args.extend(['--member', str(self._sibling)])

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args,
                               hserr_path=logs_folder)

        # Drop the status file left behind by a previous run, if any.
        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        return subprocess.Popen(command, stdout=shell.DEVNULL,
                                stderr=shell.DEVNULL, shell=False)
Example #43
0
 def align(self, corpus, langs, model_dir, working_dir='.', log_file=None):
     """Make sure ``working_dir`` exists, creating it when it is missing.

     The other parameters are not used by the code visible here.
     """
     working_dir_missing = not os.path.isdir(working_dir)
     if working_dir_missing:
         fileutils.makedirs(working_dir, exist_ok=True)
Example #44
0
 def _get_tempdir(self, name):
     """Return the path of sub-folder ``name`` under ``self._temp_dir``.

     The folder is created first when it does not exist yet.
     """
     folder = os.path.join(self._temp_dir, name)
     already_there = os.path.isdir(folder)
     if not already_there:
         fileutils.makedirs(folder, exist_ok=True)
     return folder
Example #45
0
    def train(self, corpora, aligner, working_dir='.', log_file=None):
        """Train the model in ``self._model`` from the given corpora.

        Steps: clean the corpora, merge them into a single parallel corpus
        and write the domains list ('.dmp'), align and symmetrize the merged
        corpus, run mtt-build for both languages, then build the '.mam' and
        '.lex' files.

        Args:
            corpora: corpora exposing ``name``, ``get_file(lang)`` and
                ``count_lines()``.
            aligner: object exposing
                ``align(corpus, langs, model_dir, working_dir, log_file)``
                and returning the path of the alignment file.
            working_dir: folder for the intermediate files; created if
                missing.
            log_file: optional path; tool output is appended to it. When
                ``None`` the output goes to ``shell.DEVNULL``.

        Raises:
            Exception: if ``self._model`` already exists and is not empty.
        """
        if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
            raise Exception('Model already exists at ' + self._model)

        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        l1 = self._source_lang
        l2 = self._target_lang
        langs = (l1, l2)
        langs_suffix = l1 + '-' + l2

        mct_base = self._get_model_basename()
        dmp_file = mct_base + '.dmp'
        mam_file = mct_base + '.' + langs_suffix + '.mam'
        lex_file = mct_base + '.' + langs_suffix + '.lex'

        # Open the log before the try block: if open() fails, the cleanup
        # must not run, otherwise it would call close() on the shared
        # shell.DEVNULL object instead of on a file we own.
        log = shell.DEVNULL if log_file is None else open(log_file, 'a')

        try:
            # Clean corpus for training
            clean_output = os.path.join(working_dir, 'clean_corpora')
            fileutils.makedirs(clean_output, exist_ok=True)
            corpora = self._cleaner.clean(corpora, clean_output, (self._source_lang, self._target_lang))

            # Create merged corpus and domains list file (dmp)
            merged_corpus = BilingualCorpus.make_parallel(os.path.basename(mct_base), working_dir, langs)

            fileutils.merge([corpus.get_file(l1) for corpus in corpora], merged_corpus.get_file(l1))
            fileutils.merge([corpus.get_file(l2) for corpus in corpora], merged_corpus.get_file(l2))
            with open(dmp_file, 'w') as dmp:
                # One "<corpus name> <line count>" entry per corpus.
                for corpus in corpora:
                    dmp.write(str(corpus.name) + ' ' + str(corpus.count_lines()) + '\n')

            # Create alignments in 'bal' file and symmetrize
            bal_file = aligner.align(merged_corpus, langs, self._model, working_dir, log_file)

            symal_file = os.path.join(working_dir, 'alignments.' + langs_suffix + '.symal')
            symal_command = [self._symal_bin, '-a=g', '-d=yes', '-f=yes', '-b=yes']
            with open(bal_file) as stdin:
                with open(symal_file, 'w') as stdout:
                    shell.execute(symal_command, stdin=stdin, stdout=stdout, stderr=log)

            # Execute mtt-build
            mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l1)
            with open(merged_corpus.get_file(l1)) as stdin:
                shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

            mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l2)
            with open(merged_corpus.get_file(l2)) as stdin:
                shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

            # Create 'mam' file
            mam_command = [self._symal2mam_bin, mam_file]
            with open(symal_file) as stdin:
                shell.execute(mam_command, stdin=stdin, stdout=log, stderr=log)

            # Create 'lex' file
            lex_command = [self._mmlexbuild_bin, mct_base + '.', l1, l2, '-o', lex_file]
            shell.execute(lex_command, stdout=log, stderr=log)
        finally:
            # Only close a file opened here, never the shared DEVNULL.
            if log_file is not None:
                log.close()
Example #46
0
    def tune(self,
             corpora=None,
             debug=False,
             context_enabled=True,
             random_seeds=False,
             max_iterations=25,
             early_stopping_value=None):
        """Tune the engine feature weights with MERT on the given corpora.

        Args:
            corpora: corpora to tune on; when ``None`` they are listed from
                the engine's dev folder. Only corpora containing both engine
                languages are kept.
            debug: when true, the 'tuning' temp folder is kept on exit.
            context_enabled: when false, the decoder is asked to skip
                context analysis.
            random_seeds: when false, '--predictable-seeds' is passed.
            max_iterations: passed as '--maximum-iterations' when > 0.
            early_stopping_value: when not ``None``, enables the BLEU scorer
                with this early-stopping value.

        Raises:
            IllegalArgumentException: if no corpus contains both languages.
        """
        if corpora is None:
            dev_folder = os.path.join(self.engine.data_path,
                                      TrainingPreprocessor.DEV_FOLDER_NAME)
            corpora = BilingualCorpus.list(dev_folder)

        tgt = self.engine.target_lang
        src = self.engine.source_lang

        # Keep only the corpora that carry both sides of the language pair.
        usable = []
        for candidate in corpora:
            if src in candidate.langs and tgt in candidate.langs:
                usable.append(candidate)
        corpora = usable

        if not corpora:
            raise IllegalArgumentException(
                'No %s > %s corpora found into specified path' % (src, tgt))

        source_corpora = [
            BilingualCorpus.make_parallel(item.name, item.get_folder(), [src])
            for item in corpora
        ]
        reference_corpora = [
            BilingualCorpus.make_parallel(item.name, item.get_folder(), [tgt])
            for item in corpora
        ]

        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Step 1: tokenize the references
            tokenizer = Tokenizer(tgt)
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization'):
                reference_corpora = tokenizer.process_corpora(
                    reference_corpora, tokenized_output)

            # Step 2: build the merged source/target inputs
            with cmdlogger.step('Merging corpus'):
                # The source side is a list of file paths, one per line.
                source_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + src)
                with open(source_merged_corpus, 'wb') as listing:
                    for item in source_corpora:
                        listing.write(item.get_file(src) + '\n')

                # The target side is the concatenation of the references.
                target_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + tgt)
                reference_files = [
                    item.get_file(tgt) for item in reference_corpora
                ]
                fileutils.merge(reference_files, target_merged_corpus)

            # Step 3: run MERT
            with cmdlogger.step('Tuning'):
                decoder_flags = ['--port', str(self.api.port)]
                if self.api.root is not None:
                    decoder_flags.extend(['--root', self.api.root])
                if not context_enabled:
                    decoder_flags.extend(['--skip-context-analysis', '1'])

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [
                        self._mert_script,
                        source_merged_corpus,
                        target_merged_corpus,
                        self._mert_i_script,
                        runtime_moses_ini.name,
                        '--threads', str(multiprocessing.cpu_count()),
                        '--mertdir', cli.BIN_DIR,
                        '--mertargs', '\'--binary --sctype BLEU\'',
                        '--working-dir', mert_wd,
                        '--nbest', '100',
                        '--decoder-flags',
                        '"' + ' '.join(decoder_flags) + '"',
                        '--nonorm',
                        '--closest',
                        '--no-filter-phrase-table',
                    ]

                    if early_stopping_value is not None:
                        command.extend([
                            '--bleuscorer', self._scorer_script,
                            '--bleuscorer-flags "-nt" --early-stopping-value %d'
                            % early_stopping_value
                        ])

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(
                            num=max_iterations))

                    # The command is executed as a single shell string.
                    command_line = ' '.join(command)
                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(command_line, stdout=log, stderr=log)

            # Step 4: read the optimized configuration and apply it
            with cmdlogger.step('Applying changes'):
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for raw in moses_ini:
                        entry = raw.strip()
                        if not entry:
                            continue

                        if found_weights:
                            # "<feature>= v1 v2 ..." lines after [weight]
                            fields = entry.split()
                            feature = fields[0].rstrip('=')
                            weights[feature] = [
                                float(value) for value in fields[1:]
                            ]
                        elif entry.startswith('# BLEU'):
                            bleu_score = float(entry.split()[2])
                        elif entry == '[weight]':
                            found_weights = True

                self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir("tuning")
Example #47
0
    def evaluate(self, corpora, heval_output=None, debug=False):
        """Translate ``corpora`` with every translator and score the results.

        Args:
            corpora: non-empty list of corpora to evaluate on.
            heval_output: optional folder; when given, the source, the
                reference and each translation are also written there
                through ``self._heval_outputter``.
            debug: when true, the 'evaluation' temp folder is kept on exit.

        Returns:
            A list with one ``_EvaluationResult`` per translator. Results
            whose translation failed carry the failure in ``error`` and are
            not scored.

        Raises:
            IllegalArgumentException: if ``corpora`` is empty.
            TranslateError: if a translation has a different number of lines
                than the reference.
        """
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')
        if heval_output is not None:
            fileutils.makedirs(heval_output, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        logger = _evaluate_logger()
        logger.start(corpora)

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            results = []

            # Process references
            with logger.step('Preparing corpora') as _:
                corpora_path = os.path.join(working_dir, 'corpora')
                corpora = self._xmlencoder.encode(corpora, corpora_path)

                reference = os.path.join(working_dir, 'reference.' + target_lang)
                source = os.path.join(working_dir, 'source.' + source_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in corpora], reference)
                fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

                if heval_output is not None:
                    self._heval_outputter.write(lang=target_lang, input_file=reference,
                                                output_file=os.path.join(heval_output, 'reference.' + target_lang))
                    self._heval_outputter.write(lang=source_lang, input_file=source,
                                                output_file=os.path.join(heval_output, 'source.' + source_lang))

            # Translate
            for translator in self._translators:
                name = translator.name()

                with logger.step('Translating with %s' % name) as _:
                    result = _EvaluationResult(translator)
                    results.append(result)

                    translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
                    xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
                    fileutils.makedirs(translations_path, exist_ok=True)

                    try:
                        translated, mtt, parallelism = translator.translate(corpora, translations_path)
                        filename = result.id + '.' + target_lang

                        result.mtt = mtt
                        result.parallelism = parallelism
                        result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                        result.merge = os.path.join(working_dir, filename)

                        fileutils.merge([corpus.get_file(target_lang)
                                         for corpus in result.translated_corpora], result.merge)

                        if heval_output is not None:
                            self._heval_outputter.write(lang=target_lang, input_file=result.merge,
                                                        output_file=os.path.join(heval_output, filename))
                    except TranslateError as e:
                        result.error = e
                    except Exception as e:
                        # A failing translator must not abort the whole
                        # evaluation: record the error on its result.
                        # '.message' is empty for multi-argument exceptions
                        # and missing on Python 3, so fall back to str(e)
                        # to keep the actual error detail.
                        detail = getattr(e, 'message', None) or e
                        result.error = TranslateError('Unexpected ERROR: ' + str(detail))

            # Check corpora length
            reference_lines = fileutils.linecount(reference)
            for result in results:
                if result.error is not None:
                    continue

                lines = fileutils.linecount(result.merge)

                if lines != reference_lines:
                    raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                         % (result.translator.name(), reference_lines, lines))

            # Scoring: each scorer stores its value on the named result field
            scorers = [(MatecatScore(), 'pes'), (BLEUScore(), 'bleu')]

            for scorer, field in scorers:
                with logger.step('Calculating %s' % scorer.name()) as _:
                    for result in results:
                        if result.error is not None:
                            continue
                        setattr(result, field, scorer.calculate(result.merge, reference))

            logger.completed(results, scorers)

            return results
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')
Example #48
0
 def get_tempfile(self, name, ensure=True):
     """Return the path of ``name`` inside ``self.temp_path``.

     When ``ensure`` is true, ``self.temp_path`` is created first if it is
     not an existing directory. The file itself is never created here.
     """
     if ensure:
         if not os.path.isdir(self.temp_path):
             fileutils.makedirs(self.temp_path, exist_ok=True)
     target = os.path.join(self.temp_path, name)
     return target
Example #49
0
    def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False, max_iterations=25):
        """Tune the engine feature weights with MERT on the given corpora.

        Args:
            corpora: corpora to tune on; when ``None`` they are listed from
                the engine's dev folder.
            debug: when true, the engine temp folder is kept on exit.
            context_enabled: when false, the decoder is asked to skip
                context analysis.
            random_seeds: when false, '--predictable-seeds' is passed.
            max_iterations: passed as '--maximum-iterations' when > 0.

        Raises:
            IllegalArgumentException: if ``corpora`` is empty.
            IllegalStateException: if the server is not running.
        """
        if corpora is None:
            dev_folder = os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME)
            corpora = BilingualCorpus.list(dev_folder)

        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if not self.is_running():
            raise IllegalStateException('No MMT Server running, start the engine first')

        tokenizer = Tokenizer()

        tgt = self.engine.target_lang
        src = self.engine.source_lang

        source_corpora = [BilingualCorpus.make_parallel(item.name, item.get_folder(), [src])
                          for item in corpora]
        reference_corpora = [BilingualCorpus.make_parallel(item.name, item.get_folder(), [tgt])
                             for item in corpora]

        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Step 1: tokenize the references
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization'):
                reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

            # Step 2: build the merged source/target inputs
            with cmdlogger.step('Merging corpus'):
                # The source side is a list of file paths, one per line.
                source_merged_corpus = os.path.join(working_dir, 'corpus.' + src)
                with open(source_merged_corpus, 'wb') as listing:
                    for item in source_corpora:
                        listing.write(item.get_file(src) + '\n')

                # The target side is the concatenation of the references.
                target_merged_corpus = os.path.join(working_dir, 'corpus.' + tgt)
                reference_files = [item.get_file(tgt) for item in reference_corpora]
                fileutils.merge(reference_files, target_merged_corpus)

            # Step 3: run MERT
            with cmdlogger.step('Tuning'):
                decoder_flags = ['--port', str(self.api.port)]
                if not context_enabled:
                    decoder_flags.extend(['--skip-context-analysis', '1'])

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [
                        self._mert_script,
                        source_merged_corpus,
                        target_merged_corpus,
                        self._mert_i_script,
                        runtime_moses_ini.name,
                        '--threads', str(multiprocessing.cpu_count()),
                        '--mertdir', cli.BIN_DIR,
                        '--mertargs', '\'--binary --sctype BLEU\'',
                        '--working-dir', mert_wd,
                        '--nbest', '100',
                        '--decoder-flags', '"' + ' '.join(decoder_flags) + '"',
                        '--nonorm',
                        '--closest',
                        '--no-filter-phrase-table',
                    ]

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(num=max_iterations))

                    # The command is executed as a single shell string.
                    command_line = ' '.join(command)
                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(command_line, stdout=log, stderr=log)

            # Step 4: read the optimized configuration and apply it
            with cmdlogger.step('Applying changes'):
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for raw in moses_ini:
                        entry = raw.strip()
                        if not entry:
                            continue

                        if found_weights:
                            # "<feature>= v1 v2 ..." lines after [weight]
                            fields = entry.split()
                            feature = fields[0].rstrip('=')
                            weights[feature] = [float(value) for value in fields[1:]]
                        elif entry.startswith('# BLEU'):
                            bleu_score = float(entry.split()[2])
                        elif entry == '[weight]':
                            found_weights = True

                self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir()
Example #50
0
    def translate(self, corpora, dest_path=None, debug=False):
        """Translate ``corpora`` and, when references exist, score the result.

        Args:
            corpora: non-empty list of corpora to translate.
            dest_path: optional folder; when given, sources and references
                are copied there with '.src' / '.ref' suffixes and the
                translations with '.hyp'.
            debug: when true, the 'evaluation' temp folder is kept on exit.

        Returns:
            The BLEU score when at least one corpus has a target-language
            file, ``True`` when the translation succeeded without any
            reference, or ``None`` when the translation failed (the error is
            printed).

        Raises:
            IllegalArgumentException: if ``corpora`` is empty.
        """
        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if dest_path:
            fileutils.makedirs(dest_path, exist_ok=True)

        target_lang = self._engine.target_lang
        source_lang = self._engine.source_lang

        working_dir = self._engine.get_tempdir('evaluation')
        have_references = False

        try:
            # Process references
            corpora_path = os.path.join(working_dir, 'corpora')
            corpora = self._xmlencoder.encode(corpora, corpora_path)

            reference = os.path.join(working_dir, 'reference.' + target_lang)
            source = os.path.join(working_dir, 'source.' + source_lang)
            refs = [corpus.get_file(target_lang) for corpus in corpora if corpus.get_file(target_lang)]
            have_references = len(refs) > 0
            fileutils.merge(refs, reference)  # tolerates missing reference
            fileutils.merge([corpus.get_file(source_lang) for corpus in corpora], source)

            if dest_path:
                for corpus in corpora:
                    corpus.copy(dest_path, suffixes={source_lang: '.src', target_lang: '.ref', 'tmx': '.src'})

            # Translate
            translator = self._translator
            result = _EvaluationResult(translator)

            translations_path = os.path.join(working_dir, 'translations', result.id + '.raw')
            xmltranslations_path = os.path.join(working_dir, 'translations', result.id)
            fileutils.makedirs(translations_path, exist_ok=True)

            try:
                translated, mtt, parallelism = translator.translate(corpora, translations_path)
                filename = result.id + '.' + target_lang

                result.mtt = mtt
                result.parallelism = parallelism
                result.translated_corpora = self._xmlencoder.encode(translated, xmltranslations_path)
                result.merge = os.path.join(working_dir, filename)

                fileutils.merge([corpus.get_file(target_lang)
                                 for corpus in result.translated_corpora], result.merge)

                if dest_path:
                    for corpus in result.translated_corpora:
                        corpus.copy(dest_path, suffixes={target_lang: '.hyp', 'tmx': '.hyp'})

            except TranslateError as e:
                result.error = e
            except Exception as e:
                # '.message' is empty for multi-argument exceptions and
                # missing on Python 3, so fall back to str(e) to keep the
                # actual error detail.
                detail = getattr(e, 'message', None) or e
                result.error = TranslateError('Unexpected ERROR: ' + str(detail))

            if result.error is not None:
                print(result.error)
                return None

            if not have_references:
                return True

            # bleu in range [0;1)
            return BLEUScore().calculate(result.merge, reference)
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')