# Module-level dependencies assumed by the methods below. The enclosing
# class body and the Cython `transform_paragraph` import (e.g.
# `from .glove_cython import transform_paragraph`) are omitted in this
# snippet; `check_random_state` is assumed to come from scikit-learn.
import collections

import numpy as np
from sklearn.utils import check_random_state


def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False):
    """
    Transform an iterable of tokens into its vector representation
    (a paragraph vector).

    Experimental. This will return something close to a tf-idf
    weighted average of constituent token vectors by fitting
    rare words (with low word bias values) more closely.
    """

    if self.word_vectors is None:
        raise Exception('Model must be fit to transform paragraphs')

    if self.dictionary is None:
        raise Exception('Dictionary must be provided to '
                        'transform paragraphs')

    cooccurrence = collections.defaultdict(lambda: 0.0)

    for token in paragraph:
        try:
            cooccurrence[self.dictionary[token]] += self.max_count / 10.0
        except KeyError:
            if not ignore_missing:
                raise

    random_state = check_random_state(self.random_state)

    # list(...) is required on Python 3, where dict views are not
    # directly convertible to typed numpy arrays.
    word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)
    values = np.array(list(cooccurrence.values()), dtype=np.float64)
    shuffle_indices = np.arange(len(word_ids), dtype=np.int32)

    # Initialize the paragraph vector to the mean of the
    # constituent word vectors.
    paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0)
    sum_gradients = np.ones_like(paragraph_vector)

    # Shuffle the cooccurrence entries before running SGD.
    random_state.shuffle(shuffle_indices)

    # Delegate to the Cython routine of the same name (the call resolves
    # to the module-level import, so this does not recurse).
    transform_paragraph(self.word_vectors,
                        self.word_biases,
                        paragraph_vector,
                        sum_gradients,
                        word_ids,
                        values,
                        shuffle_indices,
                        self.learning_rate,
                        self.max_count,
                        self.alpha,
                        epochs)

    return paragraph_vector
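# The Cython routine called above is not shown in this snippet. The sketch
# below is an assumption about its internals, inferred only from the
# arguments passed to it: SGD epochs over the (word, count) pairs on the
# GloVe least-squares objective, holding word vectors and biases fixed and
# updating only the paragraph vector with AdaGrad step sizes.
def _transform_paragraph_reference(word_vectors, word_biases,
                                   paragraph_vector, sum_gradients,
                                   word_ids, values, shuffle_indices,
                                   learning_rate, max_count, alpha, epochs):
    for _ in range(epochs):
        for i in shuffle_indices:
            word_id = word_ids[i]
            count = values[i]

            # GloVe weighting function: cap the influence of pairs whose
            # count exceeds max_count.
            weight = min(1.0, (count / max_count) ** alpha)

            prediction = (np.dot(paragraph_vector, word_vectors[word_id])
                          + word_biases[word_id])
            gradient = (weight
                        * (prediction - np.log(count))
                        * word_vectors[word_id])

            # AdaGrad: per-dimension step sizes shrink as squared
            # gradients accumulate.
            paragraph_vector -= (learning_rate * gradient
                                 / np.sqrt(sum_gradients))
            sum_gradients += gradient ** 2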
# Variant without the AdaGrad gradient accumulator or a seeded random
# state; shuffling uses the global numpy RNG.
def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False):
    """
    Transform an iterable of tokens into its vector representation
    (a paragraph vector).

    Experimental. This will return something close to a tf-idf
    weighted average of constituent token vectors by fitting
    rare words (with low word bias values) more closely.
    """

    if self.word_vectors is None:
        raise Exception('Model must be fit to transform paragraphs')

    if self.dictionary is None:
        raise Exception('Dictionary must be provided to '
                        'transform paragraphs')

    cooccurrence = collections.defaultdict(lambda: 0.0)

    for token in paragraph:
        try:
            cooccurrence[self.dictionary[token]] += self.max_count / 10.0
        except KeyError:
            if not ignore_missing:
                raise

    word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)
    values = np.array(list(cooccurrence.values()), dtype=np.float64)
    shuffle_indices = np.arange(len(word_ids), dtype=np.int32)

    # Initialize the paragraph vector to the mean of the
    # constituent word vectors.
    paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0)

    # Shuffle the cooccurrence entries before running SGD.
    np.random.shuffle(shuffle_indices)

    transform_paragraph(self.word_vectors,
                        self.word_biases,
                        paragraph_vector,
                        word_ids,
                        values,
                        shuffle_indices,
                        self.learning_rate,
                        self.max_count,
                        self.alpha,
                        epochs)

    return paragraph_vector
# Variant with optional PCA projection of the token vectors.
def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False,
                        use_pca=False):
    """
    Transform an iterable of tokens into its vector representation
    (a paragraph vector).

    Experimental. This will return something close to a tf-idf
    weighted average of constituent token vectors by fitting
    rare words (with low word bias values) more closely.

    If use_pca is True, the token vectors are transformed with PCA
    before the weighted average is calculated. The PCA step follows
    this paper: http://courses.cs.tau.ac.il/~wolf/papers/qagg.pdf
    """

    if self.word_vectors is None:
        raise Exception('Model must be fit to transform paragraphs')

    if self.dictionary is None:
        raise Exception('Dictionary must be provided to '
                        'transform paragraphs')

    cooccurrence = collections.defaultdict(lambda: 0.0)

    for token in paragraph:
        try:
            cooccurrence[self.dictionary[token]] += self.max_count / 10.0
        except KeyError:
            if not ignore_missing:
                raise

    word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)
    values = np.array(list(cooccurrence.values()), dtype=np.float64)
    shuffle_indices = np.arange(len(word_ids), dtype=np.int32)

    # Initialize the paragraph vector to the mean of the constituent
    # word vectors, projecting them with PCA first if requested.
    if use_pca:
        if self.pca is None:
            self.perform_PCA()
        paragraph_vector = np.mean(
            self.pca.transform(self.word_vectors[word_ids]), axis=0)
    else:
        paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0)

    # Note: if the PCA projection reduces dimensionality, the resulting
    # vector will no longer match the word-vector dimensionality the
    # Cython routine expects.
    sum_gradients = np.ones_like(paragraph_vector)

    # Shuffle the cooccurrence entries before running SGD.
    np.random.shuffle(shuffle_indices)

    transform_paragraph(self.word_vectors,
                        self.word_biases,
                        paragraph_vector,
                        sum_gradients,
                        word_ids,
                        values,
                        shuffle_indices,
                        self.learning_rate,
                        self.max_count,
                        self.alpha,
                        epochs)

    return paragraph_vector
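# `perform_PCA` and `self.pca` are used above but not defined in this
# snippet. A minimal sketch, assuming scikit-learn's PCA; the
# `n_components` parameter is hypothetical:
def perform_PCA(self, n_components=None):
    from sklearn.decomposition import PCA

    # Fit a PCA model on the full word-vector matrix so that
    # transform_paragraph can project token vectors before averaging.
    # n_components=None retains all components, which preserves the
    # dimensionality the Cython routine expects.
    self.pca = PCA(n_components=n_components)
    self.pca.fit(self.word_vectors)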
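# A minimal usage sketch, assuming the glove-python package layout
# (`from glove import Corpus, Glove`) and its fit/add_dictionary API;
# the toy corpus and parameters here are illustrative.
if __name__ == '__main__':
    from glove import Corpus, Glove

    texts = [['hello', 'glove', 'world'], ['glove', 'vectors']]

    # Build the word-word cooccurrence matrix and train a small model.
    corpus = Corpus()
    corpus.fit(texts, window=5)

    model = Glove(no_components=50, learning_rate=0.05)
    model.fit(corpus.matrix, epochs=10, no_threads=1)
    model.add_dictionary(corpus.dictionary)

    # Infer a vector for an unseen paragraph; ignore_missing skips
    # tokens that are not in the dictionary instead of raising.
    vector = model.transform_paragraph(['hello', 'world'],
                                       epochs=50,
                                       ignore_missing=True)
    print(vector.shape)  # (50,)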