def word2vec(self, x, n_grams) -> np.ndarray:
    """
    word2vec embedding

    :param x: list of texts
    :param n_grams: n-grams parameters
    :return: encoded vectors
    :rtype: numpy.ndarray
    """
    if self.word_vectors is None:
        if not os.path.exists(self.model_path):
            raise FileNotFoundError(
                f'Could not find word2vec model at {self.model_path}')
        import gensim
        from distutils.version import LooseVersion
        # gensim >= 1.0.1 moved the word2vec loader to KeyedVectors
        if LooseVersion(gensim.__version__) >= LooseVersion('1.0.1'):
            from gensim.models import KeyedVectors
            self.word_vectors = KeyedVectors.load_word2vec_format(
                self.model_path, binary=True)
        else:
            from gensim.models import Word2Vec
            self.word_vectors = Word2Vec.load_word2vec_format(
                self.model_path, binary=True)

    _x = None
    for item in tqdm(x, desc='Word2Vec Text Encoder'):
        __x = None
        items = TextProcessor.n_gram_split(item, n_grams)
        for token in items:
            try:
                vector = self.word_vectors.get_vector(token)
            except KeyError:
                # fall back to the 'unk' vector for out-of-vocabulary tokens
                vector = self.word_vectors.get_vector('unk')
            vector = np.expand_dims(vector, axis=0)
            __x = vector if __x is None else np.concatenate((__x, vector))
        if __x is None:
            # text produced no tokens: encode as an all-zero sequence
            __x = np.zeros((0, self.word_vectors.vector_size))
        # truncate to max_length so every item stacks to the same shape,
        # then zero-pad shorter sequences (padding width follows the
        # embedding dimension instead of a hardcoded 400)
        __x = __x[:self.max_length]
        if __x.shape[0] < self.max_length:
            adjust_size = self.max_length - __x.shape[0]
            adjust_array = np.zeros((adjust_size, self.word_vectors.vector_size))
            __x = np.concatenate((__x, adjust_array))
        __x = np.expand_dims(__x, axis=0)
        _x = __x if _x is None else np.concatenate((_x, __x))
    return _x
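# A minimal standalone sketch of the padding/stacking done above: each text
# yields a (num_tokens, dim) matrix that is truncated/zero-padded to
# (max_length, dim) and stacked into one batch. Random vectors stand in for
# real word2vec lookups, and the 400-dimensional width is an assumption
# based on the hardcoded padding in the original code.
import numpy as np

dim, max_length = 400, 6
rng = np.random.default_rng(0)
sequences = [rng.normal(size=(3, dim)), rng.normal(size=(5, dim))]

batch = np.stack([
    np.concatenate((seq[:max_length],
                    np.zeros((max(max_length - seq.shape[0], 0), dim))))
    for seq in sequences
])
print(batch.shape)  # (2, 6, 400)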
def bow(self, x, vocab, n_grams) -> np.ndarray:
    """
    Bag-of-Words encoder

    :param x: list of texts
    :param vocab: corpus vocabulary
    :param n_grams: n-grams parameters
    :return: Bag-of-Words vectors
    :rtype: numpy.ndarray
    """
    # precompute token -> column index to avoid an O(|vocab|) list.index()
    # lookup for every token
    token_to_index = {token: j for j, token in enumerate(vocab)}
    _x = np.zeros((len(x), len(vocab)))
    for i, item in enumerate(tqdm(x, desc='Bag Of Words Text Encoder')):
        items = TextProcessor.n_gram_split(item, n_grams)
        for token in items:
            if token in token_to_index:
                _x[i, token_to_index[token]] += 1
    return _x
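# A minimal standalone sketch of the Bag-of-Words counting above, using a
# plain whitespace split in place of TextProcessor.n_gram_split (assumption:
# the real encoder tokenizes into n-grams; this only illustrates the count
# matrix shape and values).
import numpy as np

corpus = ['the cat sat', 'the cat and the dog']
vocab = sorted({token for text in corpus for token in text.split()})
token_to_index = {token: j for j, token in enumerate(vocab)}

counts = np.zeros((len(corpus), len(vocab)))
for i, text in enumerate(corpus):
    for token in text.split():
        counts[i, token_to_index[token]] += 1

# vocab: ['and', 'cat', 'dog', 'sat', 'the']
# counts[1] -> [1., 1., 1., 0., 2.]  ('the' appears twice in the second text)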
def one_hot(self, x, vocab, n_grams) -> np.ndarray:
    """
    Convert corpus into batch of one-hot vectors.

    :param x: list of texts
    :param vocab: corpus vocabulary
    :param n_grams: n-grams parameters
    :return: one-hot vectors
    :rtype: numpy.ndarray
    """
    token_to_index = {token: j for j, token in enumerate(vocab)}
    _x = np.zeros((len(x), self.max_length, len(vocab)))
    for i, item in enumerate(tqdm(x, desc='One Hot Text Encoder')):
        items = TextProcessor.n_gram_split(item, n_grams)
        for j, token in enumerate(items):
            if j >= self.max_length:
                # sequences longer than max_length are truncated
                break
            if token in token_to_index:
                _x[i, j, token_to_index[token]] = 1
    return _x
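# A minimal standalone sketch of the one-hot sequence encoding above: each
# text becomes a (max_length, |vocab|) matrix with at most a single 1 per
# time step. Whitespace tokenization again stands in for n_gram_split
# (assumption, for illustration only).
import numpy as np

corpus = ['cat sat', 'dog sat mat']
vocab = ['cat', 'dog', 'mat', 'sat']
token_to_index = {token: j for j, token in enumerate(vocab)}
max_length = 4

one_hot = np.zeros((len(corpus), max_length, len(vocab)))
for i, text in enumerate(corpus):
    for j, token in enumerate(text.split()[:max_length]):
        if token in token_to_index:
            one_hot[i, j, token_to_index[token]] = 1

print(one_hot.shape)  # (2, 4, 4); unused trailing steps stay all-zero padding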
def tf_idf(self, x, vocab, n_grams) -> np.ndarray:
    """
    Simple TF-IDF vectors

    :param x: list of texts
    :param vocab: corpus vocabulary
    :param n_grams: n-grams parameters
    :return: encoded vectors
    :rtype: numpy.ndarray
    """
    import math

    items = [TextProcessor.n_gram_split(item, n_grams) for item in x]
    token_to_index = {token: j for j, token in enumerate(vocab)}

    # document frequency: number of documents each vocabulary token appears in
    # (guard against out-of-vocabulary tokens, which previously raised KeyError)
    appearances_in_doc = {token: 0 for token in vocab}
    for tokens in items:
        for token in set(tokens):
            if token in appearances_in_doc:
                appearances_in_doc[token] += 1

    _x = np.zeros((len(x), len(vocab)))
    for i, tokens in enumerate(tqdm(items, desc='TF-IDF Text Encoder')):
        # term frequency within the current document
        appearances_in_here = dict()
        for token in tokens:
            appearances_in_here[token] = appearances_in_here.get(token, 0) + 1
        for token, count in appearances_in_here.items():
            if token in token_to_index:
                # log-scaled term frequency times inverse document frequency
                _x[i, token_to_index[token]] = math.log(1 + count) * math.log(
                    len(x) / appearances_in_doc[token])
    return _x
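# A worked check of the TF-IDF weighting used above:
# weight = log(1 + tf) * log(N / df), where tf is the in-document count,
# N the corpus size, and df the number of documents containing the token.
# The numbers below are invented purely for illustration.
import math

N = 4   # corpus size
tf = 3  # token occurs 3 times in this document
df = 2  # token occurs in 2 of the 4 documents

weight = math.log(1 + tf) * math.log(N / df)
print(round(weight, 4))  # 0.9609 -- log(4) * log(2)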
from sentivi.text_processor import TextProcessor

if __name__ == '__main__':
    text_processor = TextProcessor(
        methods=['remove_punctuation', 'word_segmentation'])
    # Vietnamese sample ('Ton Duc Thang University, Ho Chi Minh') with
    # deliberate punctuation to exercise remove_punctuation
    print(text_processor('Trường đại học, Tôn Đức Thắng, Hồ; Chí Minh.'))
    # 'bài tập phân tích cảm xúc' ~ 'sentiment analysis exercise'
    print(TextProcessor.n_gram_split('bài tập phân tích cảm xúc', 3))