Ejemplo n.º 1
0
    def _build_vocab(self) -> FastText:
        """Return a FastText model with its vocabulary state restored or built.

        The vocabulary-related attributes are cached as a pickle at
        ``self._bare_model_path``. When that file does not exist and
        ``self.use_vocab_from`` is set, the file is first copied from
        ``self.use_vocab_from._bare_model_path``. When the cache cannot be
        read and ``self.use_vocab_from`` is not set, the vocabulary is built
        from ``self.corpus`` and the attributes listed in
        ``FASTTEXT_PARAMETERS['build_vocab_keys']`` are pickled to the cache.

        Returns:
            A new ``FastText`` instance created from
            ``self.fasttext_parameters`` with the saved attributes applied,
            ``wv.norms`` reset to ``None`` and ``prepare_weights()`` called.

        Raises:
            ValueError: If copying or reading the cache fails with an I/O
                error while ``self.use_vocab_from`` is set.
            KeyError: If a key from ``build_vocab_keys`` is found neither on
                the freshly built model nor on its keyed vectors.
        """
        cache_path = self._bare_model_path
        try:
            if not cache_path.exists() and self.use_vocab_from is not None:
                donor_path = self.use_vocab_from._bare_model_path
                LOGGER.debug('Copying vocab for {} from {} to {}'.format(
                    self, donor_path, cache_path))
                copy(donor_path, cache_path)
            with cache_path.open('rb') as rf:
                LOGGER.info('Loading vocab for {} from {}'.format(
                    self, cache_path))
                # NOTE(security): unpickling can execute arbitrary code, so
                # the cache file must come from a trusted location.
                saved_values = pickle.load(rf)
        except IOError as err:
            if self.use_vocab_from is not None:
                # A borrowed vocab was requested; do not silently rebuild.
                # Chain explicitly so the I/O failure is shown as the cause.
                raise ValueError('Failed to use vocab from {}'.format(
                    self.use_vocab_from)) from err

            bare_model = FastText(**self.fasttext_parameters)

            # Start from the build_vocab() defaults and override only the
            # keys that build_vocab() is known to accept.
            defaults = FASTTEXT_PARAMETERS['build_vocab']
            overrides = {
                key: value
                for key, value in self.fasttext_parameters.items()
                if key in defaults
            }
            build_vocab_parameters = {**defaults, **overrides}

            LOGGER.info('Building vocab for {}'.format(self))
            LOGGER.debug(
                'build_vocab() parameters: {}'.format(build_vocab_parameters))
            bare_model.build_vocab(corpus_iterable=self.corpus,
                                   **build_vocab_parameters)

            # Each key is taken from the model itself when present there,
            # otherwise from its keyed vectors.
            model_attrs = vars(bare_model)
            wv_attrs = vars(bare_model.wv)
            saved_values = {'model_values': {}, 'wv_values': {}}
            for key in FASTTEXT_PARAMETERS['build_vocab_keys']:
                if key in model_attrs:
                    saved_values['model_values'][key] = model_attrs[key]
                elif key in wv_attrs:
                    saved_values['wv_values'][key] = wv_attrs[key]
                else:
                    message = 'Key {} not found in FastText model or its keyed vectors'.format(
                        key)
                    raise KeyError(message)
            # Drop the references to the throwaway model before writing.
            del model_attrs, wv_attrs, bare_model

            with cache_path.open('wb') as wf:
                LOGGER.debug('Saving vocab for {} to {}'.format(
                    self, cache_path))
                pickle.dump(saved_values, wf, protocol=PICKLE_PROTOCOL)

        LOGGER.debug('FastText() parameters: {}'.format(
            self.fasttext_parameters))
        model = FastText(**self.fasttext_parameters)
        # Transplant the saved vocabulary state onto the fresh model.
        vars(model).update(saved_values['model_values'])
        vars(model.wv).update(saved_values['wv_values'])
        model.wv.norms = None
        model.prepare_weights()

        return model