Example #1
0
    def convert_traindata_embedvecs(self, classdict):
        """ Convert the training text data into embedded matrix.

        COnvert the training text data into embedded matrix, where each short sentence
        is a normalized summed embedded vectors for all words.

        :param classdict: training data
        :return: tuples, consisting of class labels, matrix of embedded vectors, and corresponding outputs
        :type classdict: dict
        :rtype: (list, numpy.ndarray, list)
        """
        classlabels = classdict.keys()
        lblidx_dict = dict(zip(classlabels, range(len(classlabels))))

        indices = []
        embedvecs = []
        for classlabel in classlabels:
            for shorttext in classdict[classlabel]:
                embedvec = np.sum(np.array([
                    self.word_to_embedvec(token)
                    for token in spacy_tokenize(shorttext)
                ]),
                                  axis=0)
                # embedvec = np.reshape(embedvec, embedvec.shape+(1,))
                embedvec /= np.linalg.norm(embedvec)
                embedvecs.append(embedvec)
                category_bucket = [0] * len(classlabels)
                category_bucket[lblidx_dict[classlabel]] = 1
                indices.append(category_bucket)

        indices = np.array(indices)
        embedvecs = np.array(embedvecs)
        return classlabels, embedvecs, indices
    def convert_trainingdata_matrix(self, classdict):
        """ Convert the training data into format put into the neural networks.

        Convert the training data into format put into the neural networks.
        This is called by :func:`~train`.

        :param classdict: training data
        :return: a tuple of three, containing a list of class labels, matrix of embedded word vectors, and corresponding outputs
        :type classdict: dict
        :rtype: (list, numpy.ndarray, list)
        """
        classlabels = classdict.keys()
        lblidx_dict = dict(zip(classlabels, range(len(classlabels))))

        # tokenize the words, and determine the word length
        phrases = []
        indices = []
        for label in classlabels:
            for shorttext in classdict[label]:
                shorttext = shorttext if type(shorttext)==str else ''
                category_bucket = [0]*len(classlabels)
                category_bucket[lblidx_dict[label]] = 1
                indices.append(category_bucket)
                phrases.append(spacy_tokenize(shorttext))

        # store embedded vectors
        train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize))
        for i in range(len(phrases)):
            for j in range(min(self.maxlen, len(phrases[i]))):
                train_embedvec[i, j] = self.word_to_embedvec(phrases[i][j])
        indices = np.array(indices, dtype=np.int)

        return classlabels, train_embedvec, indices
Example #3
0
    def retrieve_bow(self, shorttext):
        """ Calculate the gensim bag-of-words representation of the given short text.

        :param shorttext: text to be represented
        :return: corpus representation of the text
        :type shorttext: str
        :rtype: list
        """
        return self.dictionary.doc2bow(
            spacy_tokenize(self.preprocessor(shorttext)))
Example #4
0
    def generate_corpus(self, classdict):
        """ Calculate the gensim dictionary and corpus, and extract the class labels
        from the training data. Called by :func:`~train`.

        :param classdict: training data
        :return: None
        :type classdict: dict
        """
        self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(
            classdict,
            preprocess_and_tokenize=lambda sent: spacy_tokenize(
                self.preprocessor(sent)))
    def shorttext_to_matrix(self, shorttext):
        """ Convert the short text into a matrix with word-embedding representation.

        Given a short sentence, it converts all the tokens into embedded vectors according to
        the given word-embedding model, and put them into a matrix. If a word is not in the model,
        that row will be filled with zero.

        :param shorttext: a short sentence
        :return: a matrix of embedded vectors that represent all the tokens in the sentence
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        tokens = spacy_tokenize(shorttext)
        matrix = np.zeros((self.maxlen, self.vecsize))
        for i in range(min(self.maxlen, len(tokens))):
            matrix[i] = self.word_to_embedvec(tokens[i])
        return matrix
    def shorttext_to_embedvec(self, shorttext):
        """ Convert the short text into an averaged embedded vector representation.

        Given a short sentence, it converts all the tokens into embedded vectors according to
        the given word-embedding model, sums
        them up, and normalize the resulting vector. It returns the resulting vector
        that represents this short sentence.

        :param shorttext: a short sentence
        :return: an embedded vector that represents the short sentence
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        vec = np.zeros(self.vecsize)
        for token in spacy_tokenize(shorttext):
            if token in self.wvmodel:
                vec += self.wvmodel[token]
        norm = np.linalg.norm(vec)
        if norm!=0:
            vec /= np.linalg.norm(vec)
        return vec