def convert_traindata_embedvecs(self, classdict):
    """ Convert the training text data into embedded matrix.

    Convert the training text data into embedded matrix, where each short
    sentence is represented by the normalized sum of the embedded vectors
    of all its tokens. Sentences whose summed vector has zero norm are left
    un-normalized instead of being divided by zero.

    :param classdict: training data, mapping each class label to an iterable of short texts
    :return: tuple consisting of the class labels, the matrix of embedded vectors (one row per short text), and the corresponding one-hot outputs
    :type classdict: dict
    :rtype: (list, numpy.ndarray, numpy.ndarray)
    """
    # Materialize the labels: on Python 3 ``dict.keys()`` is a view, which is
    # neither a list (as documented) nor indexable by callers.
    classlabels = list(classdict.keys())
    lblidx_dict = {label: idx for idx, label in enumerate(classlabels)}
    nb_labels = len(classlabels)

    indices = []
    embedvecs = []
    for classlabel in classlabels:
        for shorttext in classdict[classlabel]:
            token_vecs = [self.word_to_embedvec(token)
                          for token in spacy_tokenize(shorttext)]
            embedvec = np.sum(np.array(token_vecs), axis=0)

            # Guard against a zero norm, which would otherwise turn the
            # whole row into NaN.
            norm = np.linalg.norm(embedvec)
            if norm != 0:
                embedvec = embedvec / norm
            # NOTE(review): a text that yields no tokens still sums to a
            # scalar here rather than a vector -- confirm whether callers
            # can pass such texts.
            embedvecs.append(embedvec)

            # One-hot encode the class label of this short text.
            category_bucket = [0] * nb_labels
            category_bucket[lblidx_dict[classlabel]] = 1
            indices.append(category_bucket)

    indices = np.array(indices)
    embedvecs = np.array(embedvecs)
    return classlabels, embedvecs, indices
def convert_trainingdata_matrix(self, classdict):
    """ Convert the training data into format put into the neural networks.

    Convert the training data into format put into the neural networks.
    Each short text is tokenized and its first ``self.maxlen`` tokens are
    converted to embedded vectors; unused rows stay zero. Entries that are
    not strings are treated as empty text.

    This is called by :func:`~train`.

    :param classdict: training data, mapping each class label to an iterable of short texts
    :return: a tuple of three, containing a list of class labels, the array of embedded word vectors of shape (number of texts, maxlen, vecsize), and the corresponding one-hot outputs
    :type classdict: dict
    :rtype: (list, numpy.ndarray, numpy.ndarray)
    """
    # Materialize the labels: on Python 3 ``dict.keys()`` is a view, which is
    # neither a list (as documented) nor indexable by callers.
    classlabels = list(classdict.keys())
    lblidx_dict = {label: idx for idx, label in enumerate(classlabels)}
    nb_labels = len(classlabels)

    # tokenize the texts and build the one-hot outputs
    phrases = []
    indices = []
    for label in classlabels:
        for shorttext in classdict[label]:
            # non-string entries are replaced by an empty text
            if not isinstance(shorttext, str):
                shorttext = ''
            category_bucket = [0] * nb_labels
            category_bucket[lblidx_dict[label]] = 1
            indices.append(category_bucket)
            phrases.append(spacy_tokenize(shorttext))

    # store embedded vectors; tokens beyond maxlen are dropped
    train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize))
    for i, phrase in enumerate(phrases):
        for j, token in enumerate(phrase[:self.maxlen]):
            train_embedvec[i, j] = self.word_to_embedvec(token)

    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    indices = np.array(indices, dtype=int)
    return classlabels, train_embedvec, indices
def retrieve_bow(self, shorttext):
    """ Calculate the gensim bag-of-words representation of the given short text.

    The text is passed through ``self.preprocessor``, tokenized, and the
    tokens are handed to ``self.dictionary.doc2bow``.

    :param shorttext: text to be represented
    :return: corpus representation of the text
    :type shorttext: str
    :rtype: list
    """
    to_bow = self.dictionary.doc2bow
    tokens = spacy_tokenize(self.preprocessor(shorttext))
    return to_bow(tokens)
def generate_corpus(self, classdict):
    """ Calculate the gensim dictionary and corpus, and extract the class labels
    from the training data. Called by :func:`~train`.

    The results are stored in ``self.dictionary``, ``self.corpus`` and
    ``self.classlabels``.

    :param classdict: training data
    :return: None
    :type classdict: dict
    """
    def preprocess_and_tokenize(sent):
        # preprocess first, then split into tokens
        return spacy_tokenize(self.preprocessor(sent))

    corpora = gc.generate_gensim_corpora(
        classdict,
        preprocess_and_tokenize=preprocess_and_tokenize,
    )
    self.dictionary, self.corpus, self.classlabels = corpora
def shorttext_to_matrix(self, shorttext):
    """ Convert the short text into a matrix with word-embedding representation.

    The sentence is tokenized and each of its first ``self.maxlen`` tokens
    is converted by ``self.word_to_embedvec`` into one row of the matrix.
    Rows beyond the number of tokens remain zero.

    :param shorttext: a short sentence
    :return: a matrix of shape (maxlen, vecsize) holding the embedded vectors of the tokens in the sentence
    :type shorttext: str
    :rtype: numpy.ndarray
    """
    embedded = np.zeros((self.maxlen, self.vecsize))
    words = spacy_tokenize(shorttext)
    # zip stops at whichever is shorter: the row budget or the token list
    for row, word in zip(range(self.maxlen), words):
        embedded[row] = self.word_to_embedvec(word)
    return embedded
def shorttext_to_embedvec(self, shorttext):
    """ Convert the short text into a normalized summed embedded vector.

    Given a short sentence, it looks up every token that is present in
    ``self.wvmodel``, sums the corresponding vectors, and scales the sum to
    unit norm. Tokens missing from the model are skipped. If the sum has
    zero norm (for example, no token was found), the zero vector is
    returned unchanged.

    :param shorttext: a short sentence
    :return: an embedded vector of length vecsize that represents the short sentence
    :type shorttext: str
    :rtype: numpy.ndarray
    """
    vec = np.zeros(self.vecsize)
    for token in spacy_tokenize(shorttext):
        if token in self.wvmodel:
            vec += self.wvmodel[token]

    # Reuse the norm already computed for the zero check rather than
    # computing it a second time.
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm
    return vec