Example #1
0
    def get_mem_percent():
        """:returns percentage of MemTotal (hardware memory) to use in `lmplz`."""
        # Simple heuristic: use 80% of *available* memory (instead of MemTotal as is the lmplz default) - avoids
        # crashing on machines with other jobs running.
        # This may evict some disk caches (is not too nice to other programs using mmapped
        # files unless you lower the 80%).

        mi = fileutils.meminfo()
        total = float(mi['MemTotal'])
        available = fileutils.free()
        return int(80.0 * available / total)
Example #2
0
    def _check_constraints(self):
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in self._roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self.__GB * corpus_size_on_disk / (
            350 * self.__MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                      (recommended_mem / self.__GB, free_memory / self.__GB)
            if free_space_on_disk < recommended_disk:
                print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                      (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
            print
Example #3
0
    def _check_constraints(self):
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in self._roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self._GB * corpus_size_on_disk / (
            350 * self._MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                raise EngineBuilder.HWConstraintViolated(
                    'more than %.fG of RAM recommended, only %.fG available' %
                    (recommended_mem / self._GB, free_memory / self._GB))
            if free_space_on_disk < recommended_disk:
                raise EngineBuilder.HWConstraintViolated(
                    'more than %.fG of storage recommended, only %.fG available'
                    % (recommended_disk / self._GB,
                       free_space_on_disk / self._GB))
Example #4
0
    def build(self, roots, debug=False, steps=None, split_trainingset=True):
        self._temp_dir = self._engine.get_tempdir('training', ensure=True)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'you project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

        if steps is None:
            steps = self._engine.training_steps
        else:
            unknown_steps = [step for step in steps if step not in self._engine.training_steps]
            if len(unknown_steps) > 0:
                raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

        cmdlogger = _builder_logger(len(steps) + 1)
        cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora)

        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

        # Check disk space constraints
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                      (recommended_mem / self.__GB, free_memory / self.__GB)
            if free_space_on_disk < recommended_disk:
                print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                      (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
            print

        try:
            unprocessed_bicorpora = bilingual_corpora
            unprocessed_monocorpora = monolingual_corpora

            # TM draft-translations cleanup
            if 'tm_cleanup' in steps:
                with cmdlogger.step('TMs clean-up') as _:
                    unprocessed_bicorpora = self._engine.cleaner.clean(
                        unprocessed_bicorpora, self._get_tempdir('clean_tms')
                    )

            cleaned_bicorpora = unprocessed_bicorpora
            processed_bicorpora = unprocessed_bicorpora
            processed_monocorpora = unprocessed_monocorpora

            # Preprocessing
            if 'preprocess' in steps:
                with cmdlogger.step('Corpora preprocessing') as _:
                    unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                        unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora')
                    )

                    processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                        unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                        (self._engine.data_path if split_trainingset else None)
                    )

                    cleaned_bicorpora = self._engine.training_preprocessor.clean(
                        processed_bicorpora, self._get_tempdir('clean_corpora')
                    )

            # Training Context Analyzer
            if 'context_analyzer' in steps:
                with cmdlogger.step('Context Analyzer training') as _:
                    log_file = self._engine.get_logfile('training.context')
                    self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file)

            # Aligner
            if 'aligner' in steps:
                with cmdlogger.step('Aligner training') as _:
                    log_file = self._engine.get_logfile('training.aligner')
                    working_dir = self._get_tempdir('aligner')

                    self._engine.aligner.build(cleaned_bicorpora, working_dir, log_file)

            # Training Translation Model
            if 'tm' in steps:
                with cmdlogger.step('Translation Model training') as _:
                    working_dir = self._get_tempdir('tm')
                    log_file = self._engine.get_logfile('training.tm')
                    self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log_file)

            # Training Adaptive Language Model
            if 'lm' in steps:
                with cmdlogger.step('Language Model training') as _:
                    working_dir = self._get_tempdir('lm')
                    log_file = self._engine.get_logfile('training.lm')
                    self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang,
                                          working_dir, log_file)

            # Writing config file
            with cmdlogger.step('Writing config files') as _:
                self._engine.write_configs()

            cmdlogger.completed()
        finally:
            if not debug:
                self._engine.clear_tempdir('training')
Example #5
0
    def build(self, roots, debug=False, steps=None, split_trainingset=True):
        self._temp_dir = self._engine.get_tempdir('training', ensure=True)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            source_lang, target_lang, roots=roots)

        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'you project does not include %s-%s data.' %
                (source_lang.upper(), target_lang.upper()))

        if steps is None:
            steps = self._engine.training_steps
        else:
            unknown_steps = [
                step for step in steps
                if step not in self._engine.training_steps
            ]
            if len(unknown_steps) > 0:
                raise IllegalArgumentException('Unknown training steps: ' +
                                               str(unknown_steps))

        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

        # Check disk space constraints
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self.__GB * corpus_size_on_disk / (
            350 * self.__MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                      (recommended_mem / self.__GB, free_memory / self.__GB)
            if free_space_on_disk < recommended_disk:
                print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                      (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
            print

        logger = _builder_logger(
            len(steps) + 1, self._engine.get_logfile('training'))

        try:
            logger.start(self._engine, bilingual_corpora, monolingual_corpora)

            unprocessed_bicorpora = bilingual_corpora
            unprocessed_monocorpora = monolingual_corpora

            # TM draft-translations cleanup
            if 'tm_cleanup' in steps:
                with logger.step('TMs clean-up') as _:
                    unprocessed_bicorpora = self._engine.cleaner.clean(
                        unprocessed_bicorpora,
                        self._get_tempdir('clean_tms'),
                        log=logger.stream)

            cleaned_bicorpora = unprocessed_bicorpora
            processed_bicorpora = unprocessed_bicorpora
            processed_monocorpora = unprocessed_monocorpora

            # Preprocessing
            if 'preprocess' in steps:
                with logger.step('Corpora preprocessing') as _:
                    unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                        unprocessed_bicorpora,
                        unprocessed_monocorpora,
                        self._get_tempdir('training_corpora'),
                        log=logger.stream)

                    processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                        unprocessed_bicorpora + unprocessed_monocorpora,
                        self._get_tempdir('preprocessed'),
                        (self._engine.data_path
                         if split_trainingset else None),
                        log=logger.stream)

                    cleaned_bicorpora = self._engine.training_preprocessor.clean(
                        processed_bicorpora,
                        self._get_tempdir('clean_corpora'))

            # Training Context Analyzer
            if 'context_analyzer' in steps:
                with logger.step('Context Analyzer training') as _:
                    self._engine.analyzer.create_index(unprocessed_bicorpora,
                                                       log=logger.stream)

            # Aligner
            if 'aligner' in steps:
                with logger.step('Aligner training') as _:
                    working_dir = self._get_tempdir('aligner')
                    self._engine.aligner.build(cleaned_bicorpora,
                                               working_dir,
                                               log=logger.stream)

            # Training Translation Model
            if 'tm' in steps:
                with logger.step('Translation Model training') as _:
                    working_dir = self._get_tempdir('tm')
                    self._engine.pt.train(cleaned_bicorpora,
                                          self._engine.aligner,
                                          working_dir,
                                          log=logger.stream)

            # Training Adaptive Language Model
            if 'lm' in steps:
                with logger.step('Language Model training') as _:
                    working_dir = self._get_tempdir('lm')
                    self._engine.lm.train(processed_bicorpora +
                                          processed_monocorpora,
                                          target_lang,
                                          working_dir,
                                          log=logger.stream)

            # Writing config file
            with logger.step('Writing config files') as _:
                self._engine.write_configs()

            logger.completed()
        except:
            logger.error()
            raise
        finally:
            logger.close()
            if not debug:
                self._engine.clear_tempdir('training')