Example 1
    def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False):
        """
        Transform an iterable of tokens into its vector representation
        (a paragraph vector).

        Experimental. This will return something close to a tf-idf
        weighted average of constituent token vectors by fitting
        rare words (with low word bias values) more closely.
        """

        if self.word_vectors is None:
            raise Exception('Model must be fit to transform paragraphs')

        if self.dictionary is None:
            raise Exception('Dictionary must be provided to '
                            'transform paragraphs')

        cooccurrence = collections.defaultdict(lambda: 0.0)

        for token in paragraph:
            try:
                cooccurrence[self.dictionary[token]] += self.max_count / 10.0
            except KeyError:
                if not ignore_missing:
                    raise

        random_state = check_random_state(self.random_state)

        word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)
        values = np.array(list(cooccurrence.values()), dtype=np.float64)
        shuffle_indices = np.arange(len(word_ids), dtype=np.int32)

        # Initialize the vector to mean of constituent word vectors
        paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0)
        sum_gradients = np.ones_like(paragraph_vector)

        # Shuffle the order of the cooccurrence entries
        random_state.shuffle(shuffle_indices)
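        # This calls the compiled (presumably Cython) transform_paragraph
        # routine, which shadows this method's name at module level.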
        transform_paragraph(self.word_vectors,
                            self.word_biases,
                            paragraph_vector,
                            sum_gradients,
                            word_ids,
                            values,
                            shuffle_indices,
                            self.learning_rate,
                            self.max_count,
                            self.alpha,
                            epochs)

        return paragraph_vector
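For context, this method assumes `collections`, `numpy as np`, and sklearn's `check_random_state` are imported at module level, and that the model has already been fit. A minimal usage sketch, assuming the glove-python style `Corpus`/`Glove` API (the corpus text is made up):

    from glove import Corpus, Glove

    sentences = [['the', 'quick', 'brown', 'fox'],
                 ['jumped', 'over', 'the', 'lazy', 'dog']]

    # Build the token dictionary and cooccurrence matrix.
    corpus = Corpus()
    corpus.fit(sentences, window=10)

    # Fit word vectors and attach the dictionary to the model.
    model = Glove(no_components=100, learning_rate=0.05)
    model.fit(corpus.matrix, epochs=10)
    model.add_dictionary(corpus.dictionary)

    # Infer a vector for an unseen paragraph.
    vector = model.transform_paragraph(['the', 'lazy', 'fox'],
                                       epochs=50, ignore_missing=True)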
Example 2
    def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False):
        """
        Transform an iterable of tokens into its vector representation
        (a paragraph vector).

        Experimental. This will return something close to a tf-idf
        weighted average of constituent token vectors by fitting 
        rare words (with low word bias values) more closely.
        """

        if self.word_vectors is None:
            raise Exception('Model must be fit to transform paragraphs')

        if self.dictionary is None:
            raise Exception('Dictionary must be provided to '
                            'transform paragraphs')

        cooccurrence = collections.defaultdict(lambda: 0.0)
            
        for token in paragraph:
            try:
                cooccurrence[self.dictionary[token]] += self.max_count / 10.0
            except KeyError:
                if not ignore_missing:
                    raise

        word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)
        values = np.array(list(cooccurrence.values()), dtype=np.float64)
        shuffle_indices = np.arange(len(word_ids), dtype=np.int32)

        # Initialize the vector to mean of constituent word vectors
        paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0)

        # Shuffle the order of the cooccurrence entries
        np.random.shuffle(shuffle_indices)
        transform_paragraph(self.word_vectors,
                            self.word_biases,
                            paragraph_vector,
                            word_ids,
                            values,
                            shuffle_indices,
                            self.learning_rate,
                            self.max_count,
                            self.alpha,
                            epochs)

        return paragraph_vector
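Note that this variant differs from Example 1 in two ways: it drops the `sum_gradients` AdaGrad accumulator from the inner call, and it shuffles with the global `np.random` state, so results are not reproducible across calls. A small sketch of the seeded alternative used in Example 1 (the seed value is arbitrary):

    from sklearn.utils import check_random_state

    # Passing an int seed (or a RandomState instance) makes the shuffle,
    # and hence the inferred paragraph vector, deterministic.
    random_state = check_random_state(42)
    random_state.shuffle(shuffle_indices)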
Example 3
    def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False, use_pca=False):
        """
        Transform an iterable of tokens into its vector representation
        (a paragraph vector).

        Experimental. This will return something close to a tf-idf
        weighted average of constituent token vectors by fitting 
        rare words (with low word bias values) more closely. If use_pca is True,
        the token vectors will be transformed using PCA before the weighted
        average is calculated.
        
        PCA support was added following this paper: http://courses.cs.tau.ac.il/~wolf/papers/qagg.pdf
        """

        if self.word_vectors is None:
            raise Exception('Model must be fit to transform paragraphs')

        if self.dictionary is None:
            raise Exception('Dictionary must be provided to '
                            'transform paragraphs')

        cooccurrence = collections.defaultdict(lambda: 0.0)
            
        for token in paragraph:
            try:
                cooccurrence[self.dictionary[token]] += self.max_count / 10.0
            except KeyError:
                if not ignore_missing:
                    raise

        word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)
        values = np.array(list(cooccurrence.values()), dtype=np.float64)
        shuffle_indices = np.arange(len(word_ids), dtype=np.int32)

        # Initialize the vector to the mean of the constituent word vectors;
        # if PCA is requested, project the word vectors through it first

        if use_pca:
            if self.pca is None:                
                self.perform_PCA()
                
            paragraph_vector = np.mean(self.pca.transform(self.word_vectors[word_ids]), axis=0)
        else:
            paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0)

        sum_gradients = np.ones_like(paragraph_vector)

        # Shuffle the order of the cooccurrence entries
        np.random.shuffle(shuffle_indices)
        transform_paragraph(self.word_vectors,
                            self.word_biases,
                            paragraph_vector,
                            sum_gradients,
                            word_ids,
                            values,
                            shuffle_indices,
                            self.learning_rate,
                            self.max_count,
                            self.alpha,
                            epochs)

        return paragraph_vector
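Neither `perform_PCA` nor `self.pca` appears in this listing. A minimal sketch of what such a helper might look like, assuming scikit-learn's `PCA`; the method name mirrors the call above, but the `n_components` default is a guess:

    from sklearn.decomposition import PCA

    def perform_PCA(self, n_components=None):
        # Fit a PCA model on the word-vector matrix so transform_paragraph
        # can project constituent token vectors before averaging them.
        self.pca = PCA(n_components=n_components)
        self.pca.fit(self.word_vectors)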
Example 4
    def transform_paragraph(self,
                            paragraph,
                            epochs=50,
                            ignore_missing=False,
                            use_pca=False):
        """
        Transform an iterable of tokens into its vector representation
        (a paragraph vector).

        Experimental. This will return something close to a tf-idf
        weighted average of constituent token vectors by fitting
        rare words (with low word bias values) more closely. If use_pca is True,
        the token vectors will be transformed using PCA before the weighted
        average is calculated.

        PCA support was added following this paper: http://courses.cs.tau.ac.il/~wolf/papers/qagg.pdf
        """

        if self.word_vectors is None:
            raise Exception('Model must be fit to transform paragraphs')

        if self.dictionary is None:
            raise Exception('Dictionary must be provided to '
                            'transform paragraphs')

        cooccurrence = collections.defaultdict(lambda: 0.0)

        for token in paragraph:
            try:
                cooccurrence[self.dictionary[token]] += self.max_count / 10.0
            except KeyError:
                if not ignore_missing:
                    raise

        word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)
        values = np.array(list(cooccurrence.values()), dtype=np.float64)
        shuffle_indices = np.arange(len(word_ids), dtype=np.int32)

        # Initialize the vector to the mean of the constituent word vectors;
        # if PCA is requested, project the word vectors through it first

        if use_pca:
            if self.pca is None:
                self.perform_PCA()

            paragraph_vector = np.mean(
                self.pca.transform(self.word_vectors[word_ids]), axis=0)
        else:
            paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0)

        sum_gradients = np.ones_like(paragraph_vector)

        # Shuffle the order of the cooccurrence entries
        np.random.shuffle(shuffle_indices)
        transform_paragraph(self.word_vectors, self.word_biases,
                            paragraph_vector, sum_gradients, word_ids, values,
                            shuffle_indices, self.learning_rate,
                            self.max_count, self.alpha, epochs)

        return paragraph_vector
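In all four examples the inner `transform_paragraph` call dispatches to a separate routine (presumably compiled Cython) that shadows the method's name. A rough pure-Python sketch of what that routine plausibly does, based on the GloVe least-squares objective with AdaGrad updates applied to the paragraph vector only (word vectors and biases stay fixed); this is an illustration of the technique, not the library's actual implementation:

    import numpy as np

    def transform_paragraph_py(word_vectors, word_biases, paragraph_vector,
                               sum_gradients, word_ids, values,
                               shuffle_indices, learning_rate,
                               max_count, alpha, epochs):
        for _ in range(epochs):
            for i in shuffle_indices:
                word_id = word_ids[i]
                count = values[i]

                # GloVe weighting: down-weight rare entries, capped at 1.
                weight = min(1.0, (count / max_count) ** alpha)

                # Squared-error loss between the dot-product prediction
                # and the log cooccurrence count.
                prediction = (np.dot(paragraph_vector,
                                     word_vectors[word_id])
                              + word_biases[word_id])
                gradient = (weight * (prediction - np.log(count))
                            * word_vectors[word_id])

                # AdaGrad: per-coordinate learning-rate decay.
                sum_gradients += gradient ** 2
                paragraph_vector -= (learning_rate * gradient
                                     / np.sqrt(sum_gradients))
        return paragraph_vector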