Ejemplo n.º 1
0
class FastTextEmbedding(WordEmbedding):
    """Word embedding backed by a gensim ``FastText`` model.

    The model is either trained at construction time (from in-memory
    sentences or from a corpus file) or loaded from a saved Facebook
    FastText model file.
    """

    # Number of training epochs used when a model is trained here.
    _TRAIN_EPOCHS = 10

    def __init__(self,
                 sentences=None,
                 corpus_file_path: str = None,
                 dim: int = 100,
                 saved_model_path: str = None,
                 save_path=None) -> None:
        """Train or load the underlying FastText model.

        Exactly one source is used, checked in this order:
        ``sentences``, then ``corpus_file_path``, then ``saved_model_path``.

        Args:
            sentences: Training data handed to ``FastText.build_vocab`` and
                ``FastText.train`` as ``sentences=``.
            corpus_file_path: Path handed to ``FastText.build_vocab`` and
                ``FastText.train`` as ``corpus_file=``.
            dim: Vector size of a newly trained model. Ignored when the
                model is loaded from ``saved_model_path``.
            saved_model_path: Path given to ``load_facebook_model``.
            save_path: Not read by this constructor.

        Raises:
            AssertionError: If ``sentences``, ``corpus_file_path`` and
                ``saved_model_path`` are all ``None``.
        """
        super().__init__()

        if sentences is not None:
            self._impl = self._train_model(dim, sentences=sentences)
        elif corpus_file_path is not None:
            self._impl = self._train_model(dim, corpus_file=corpus_file_path)
        elif saved_model_path is not None:
            # Load from a saved FastText embedding file.
            self._impl = load_facebook_model(saved_model_path)
        else:
            raise AssertionError(
                'sentences, corpus_file_path or saved_model_path '
                'should be given as not None')

        keyed_vectors = self._impl.wv
        self.vocab = keyed_vectors.index2word
        self.vocab_size = len(self.vocab)
        # One vector per vocabulary word, in the same order as ``self.vocab``.
        self.embedding_matrix = [keyed_vectors[word] for word in self.vocab]

    @classmethod
    def _train_model(cls, dim, **corpus):
        """Build the vocabulary and train a new FastText model.

        ``corpus`` is the single keyword argument selecting the data source
        (``sentences=...`` or ``corpus_file=...``); it is forwarded unchanged
        to both ``build_vocab`` and ``train``.
        """
        model = FastText(size=dim)
        model.build_vocab(**corpus)
        # total_words is only known after build_vocab has scanned the corpus.
        model.train(total_words=model.corpus_total_words,
                    epochs=cls._TRAIN_EPOCHS,
                    **corpus)
        return model

    def word2vec(self, words):
        """Return the embedding vector of each word in ``words``, in order."""
        keyed_vectors = self._impl.wv
        return [keyed_vectors.get_vector(w) for w in words]

    def vec2word(self, vectors):
        """Return the nearest vocabulary entry for each vector in ``vectors``.

        Each element is the first result of
        ``similar_by_vector(v, topn=1)``; gensim documents those results as
        ``(word, similarity)`` pairs, so the elements are pairs, not bare
        words.
        NOTE(review): confirm whether callers expect only the word.
        """
        # Query through ``wv`` like the other accessors; the model-level
        # ``similar_by_vector`` is a deprecated forwarder to this method.
        keyed_vectors = self._impl.wv
        return [keyed_vectors.similar_by_vector(v, topn=1)[0]
                for v in vectors]

    def get_embedding_matrix(self) -> np.ndarray:
        """Return the stored vocabulary vectors as a new NumPy array.

        Rows follow the order of ``self.vocab``.
        """
        return np.array(self.embedding_matrix)