def shorttext_to_embedvec(self, shorttext):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors
    according to the given word-embedding model, sums them up, and normalizes
    the resulting vector. It returns the resulting vector that represents this
    short sentence. Tokens absent from the word-embedding model are skipped;
    if no token is found in the model, the zero vector is returned unnormalized.

    :param shorttext: a short sentence
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :rtype: numpy.ndarray
    """
    vec = np.zeros(self.vecsize)
    for token in spacy_tokenize(shorttext):
        # only in-vocabulary tokens contribute to the sum
        if token in self.wvmodel:
            vec += self.wvmodel[token]
    norm = np.linalg.norm(vec)
    if norm != 0:
        # reuse the norm computed above instead of recomputing it
        vec /= norm
    return vec
def convert_traindata_embedvecs(self, classdict):
    """ Convert the training text data into embedded matrix.

    Convert the training text data into embedded matrix, where each short
    sentence is a normalized sum of the embedded vectors of all its words.
    Sentences whose summed embedding has zero norm (e.g. no in-vocabulary
    tokens) are skipped. The category output for each kept sentence is a
    one-hot row vector over the class labels.

    :param classdict: training data
    :return: tuples, consisting of class labels, matrix of embedded vectors, and corresponding outputs
    :type classdict: dict
    :rtype: (list, numpy.ndarray, list)
    """
    # materialize the labels: dict.keys() is a live view on Python 3,
    # and the documented contract returns a list
    classlabels = list(classdict.keys())
    lblidx_dict = {classlabel: idx for idx, classlabel in enumerate(classlabels)}

    indices = []
    embedvecs = []
    for classlabel in classlabels:
        for shorttext in classdict[classlabel]:
            token_vecs = [self.word_to_embedvec(token) for token in spacy_tokenize(shorttext)]
            embedvec = np.sum(np.array(token_vecs), axis=0)
            norm = np.linalg.norm(embedvec)
            if norm == 0:
                # no usable embedding for this sentence; skip it
                continue
            embedvec /= norm
            embedvecs.append(embedvec)
            # one-hot encode the class label for this sentence
            category_bucket = [0] * len(classlabels)
            category_bucket[lblidx_dict[classlabel]] = 1
            indices.append(category_bucket)

    indices = np.array(indices)
    embedvecs = np.array(embedvecs)
    return classlabels, embedvecs, indices