def main(): """ prints list of words sorted according to their changes between two points of time """ if len(sys.argv) != 4 and len(sys.argv) != 3: raise Exception( "Provide 2+ arguments:\n\t1,first model\n\t2,second model\n\t3,Optional: number of min occurrences") start = sys.argv[1] end = sys.argv[2] if len(sys.argv) == 4: min_occ = int(sys.argv[3]) else: min_occ = 0 model1 = gensim.models.Word2Vec.load(start) model2 = gensim.models.Word2Vec.load(end) similarity = {} for word in model1.vocab: if model1.vocab[word].count >= min_occ and word in model2.vocab and model2.vocab[word].count >= min_occ: similarity[word] = dot(matutils.unitvec( model1[word]), matutils.unitvec(model2[word])) for w, c in sorted(similarity.items(), key=itemgetter(1)): print(w, c)
def lineReceived(self, line):
    # print("LSA received " + line)
    try:
        sent1, sent2 = line.strip().split("\t")
    except:
        self.sendLine("INPUTERROR: missing tab character?")
        return
    # print("LSA sentence 1: " + sent1)
    # print("LSA sentence 2: " + sent2)
    try:
        vec_bow1 = dictionary.doc2bow(sent1.lower().split())
        vec_bow2 = dictionary.doc2bow(sent2.lower().split())
        vec_lsi1 = lsi[vec_bow1]
        vec_lsi2 = lsi[vec_bow2]
    except KeyError:
        self.sendLine(str(0))
        return
    if not vec_lsi1 or not vec_lsi2:
        self.sendLine(str(0))
        return
    # print "LSA vector1:"
    # print vec_lsi1
    # print "LSA vector2:"
    # print vec_lsi2
    try:
        cossim = numpy.dot(matutils.unitvec(numpy.array([x[1] for x in vec_lsi1])),
                           matutils.unitvec(numpy.array([x[1] for x in vec_lsi2])))
    except:
        print "dot product fails"
        cossim = 0
        raise
    self.sendLine(str(cossim))
def smartirs_normalize(x, norm_scheme, return_norm=False):
    """Normalize a vector using the normalization scheme specified in `norm_scheme`.

    Parameters
    ----------
    x : numpy.ndarray
        Input array
    norm_scheme : {'n', 'c'}
        Normalizing function to use:
        `n`: no normalization
        `c`: unit L2 norm (scale `x` to unit euclidean length)
    return_norm : bool, optional
        Return the length of `x` as well?

    Returns
    -------
    numpy.ndarray
        Normalized array.
    float (only if return_norm is set)
        L2 norm of `x`.

    """
    if norm_scheme == "n":
        if return_norm:
            _, length = matutils.unitvec(x, return_norm=return_norm)
            return x, length
        else:
            return x
    elif norm_scheme == "c":
        return matutils.unitvec(x, return_norm=return_norm)
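# A minimal usage sketch for the normalizer above (input values invented for
# illustration; assumes gensim/numpy are installed and `smartirs_normalize`
# is importable from the surrounding module).
import numpy as np

x = np.array([3.0, 4.0])
print(smartirs_normalize(x, "n"))  # unchanged: [3. 4.]
print(smartirs_normalize(x, "c"))  # unit L2 norm: [0.6 0.8]
vec, length = smartirs_normalize(x, "c", return_norm=True)  # length == 5.0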
def add_documents(self, corpus): """ Extend the index with new documents. Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them (or when a query is issued). """ min_ratio = 1.0 # 0.5 to only reopen shards that are <50% complete if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize: # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard self.reopen_shard() for doc in corpus: if isinstance(doc, numpy.ndarray): doclen = len(doc) elif scipy.sparse.issparse(doc): doclen = doc.nnz else: doclen = len(doc) if doclen < 0.3 * self.num_features: doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm) else: doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm) self.fresh_docs.append(doc) self.fresh_nnz += doclen if len(self.fresh_docs) >= self.shardsize: self.close_shard() if len(self.fresh_docs) % 10000 == 0: logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
def similarity(self, d1, d2):
    """
    Compute cosine similarity between two docvecs in the trained set, specified by int index or
    string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

    """
    return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
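# Cross-check of the dot(unitvec, unitvec) idiom used throughout these snippets
# (the vectors are invented for illustration): the dot product of two
# unit-length vectors is exactly their cosine similarity.
import numpy as np
from gensim import matutils

a, b = np.array([1.0, 2.0, 3.0]), np.array([2.0, 0.0, 1.0])
cos = np.dot(matutils.unitvec(a), matutils.unitvec(b))
assert abs(cos - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))) < 1e-6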
def main():
    # model parameters, taken from Kim et al. & Kulkarni et al.
    ALPHA = 0.01
    NET_SIZE = 200
    if len(sys.argv) < 11:
        raise Exception("""Provide 10+ arguments:\n\t1,path to save models\n\t2,path to corpora
        \t3,number of worker processes\n\t4,number of max. epochs\n\t5, minimum count
        \t6, hierarchic (0/1)\n\t7,neg sampling (0-20)\n\t8,downsampling (0-0.00001)
        \n\t9,max distance for convergence as exponent (e.g., 2 corresponding to 10^-2), use 0 to indicate no limit
        \n\t10+ files to train on (one model per file)""")
    model_path = sys.argv[1]
    corpus_path = sys.argv[2]
    workers = int(sys.argv[3])
    epochs = int(sys.argv[4])
    min_count = int(sys.argv[5])
    hs = int(sys.argv[6])
    negative = int(sys.argv[7])
    sample = float(sys.argv[8])
    if sys.argv[9] == "0":
        max_dist = None
    else:
        max_dist = 1 - 10**(-1 * float(sys.argv[9]))
    files = sys.argv[10:]
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    old_model = None
    for f in files:
        if not os.path.exists(os.path.join(corpus_path, f)):
            logging.info("skipping %s", f)
            continue
        logging.info("processing %s", f)
        model = gensim.models.Word2Vec(
            size=NET_SIZE, window=4, min_count=min_count, workers=workers,
            alpha=ALPHA, sg=1, hs=hs, negative=negative, sample=sample)  # skip-gram on!
        corpus = Corpus(f, corpus_path)
        if old_model:
            update_vocab(corpus, old_model, model)
        else:
            model.build_vocab(corpus)
        epoch = 0
        dist = 0
        while epoch < epochs and (max_dist is None or dist < max_dist):
            epoch += 1
            if epoch > 1:
                old_syn0 = copy(model.syn0)
            model.train(corpus)
            if epoch > 1 and max_dist is not None:
                dist = sum([dot(unitvec(model.syn0[i]), unitvec(
                    old_syn0[i])) for i in range(len(model.vocab))]) / len(model.vocab)
        old_model = model
        fname = os.path.join(model_path, "model" + f)
        fvocab = os.path.join(model_path, "vocab" + f)
        model.save_word2vec_format(fname, fvocab=fvocab, binary=True)
        logging.info("finished after %s epochs", epoch)
def cosine_similarity(word1, word2):
    global word_vector
    if word1.lower() in word_vector.keys() and word2.lower() in word_vector.keys():
        return dot(matutils.unitvec(word_vector[word1.lower()]),
                   matutils.unitvec(word_vector[word2.lower()]))
    else:
        return 0
def calc_w2v_similarity(words, use_ic=False):
    words1 = words[0]
    words2 = words[1]
    vec1 = sentence_vec(words1, use_ic)
    vec2 = sentence_vec(words2, use_ic)
    return [dot(matutils.unitvec(array(vec1)), matutils.unitvec(array(vec2)))]
def n_similarity(self, ds1, ds2):
    """
    Compute cosine similarity between two sets of docvecs from the trained set, specified by int
    index or string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

    """
    v1 = [self[doc] for doc in ds1]
    v2 = [self[doc] for doc in ds2]
    return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
def getUVectors(self, toks):
    ''' Token Unit Vectors '''
    if isinstance(toks, basestring):
        uv = mu.unitvec(self.model[toks])
    else:
        uv = [mu.unitvec(self.model[tok]) for tok in toks]
    return uv
def puebaSimpleCosenos():
    model = Doc2Vec.load('./imdb_dm.d2v')
    source = 'data/trainneg.txt'
    generador = GeneraVectores(model)
    vecs = generador.getVecsFromFile(source)

    print "cosine of first vector, trainneg"
    print dot(matutils.unitvec(vecs[0]), matutils.unitvec(model.docvecs["TRAIN_NEG_0"]))
def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_alpha=0.0001, steps=5):
    """
    Compute cosine similarity between two post-bulk out of training documents.

    Document should be a list of (word) tokens.
    """
    d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps)
    d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps)
    return dot(matutils.unitvec(d1), matutils.unitvec(d2))
def similarity(self, word_a, word_b):
    try:
        a_vector = self.embeddings[word_a]
        b_vector = self.embeddings[word_b]
        diff = dot(matutils.unitvec(a_vector), matutils.unitvec(b_vector))
        return diff
    except KeyError:
        # logger.debug("'%s' or '%s' don't have a word vector" % (word_a.encode("utf-8"),
        #                                                         word_b.encode("utf-8")))
        return 0.0 if word_a != word_b else 1.0
def similarity(self, word_a, word_b):
    try:
        a_vector = self.embeddings[word_a]
        b_vector = self.embeddings[word_b]
        diff = dot(matutils.unitvec(a_vector), matutils.unitvec(b_vector))
        return diff
    except KeyError:
        logger.debug("'%s' or '%s' don't have a word vector" % (word_a, word_b))
        return 0.0
def py_vq2(obs, code_book, check_finite=True):
    """2nd Python version of vq algorithm.

    The algorithm simply computes the euclidean distance between each
    observation and every frame in the code_book.

    Parameters
    ----------
    obs : ndarray
        Expect a rank 2 array. Each row is one observation.
    code_book : ndarray
        Code book to use. Same format as obs. Should have the same number
        of features (e.g. columns) as obs.
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        code[i] gives the label of the ith observation; its code is
        code_book[code[i]].
    min_dist : ndarray
        min_dist[i] gives the distance between the ith observation and its
        corresponding code.

    Notes
    -----
    This could be faster when the number of codebooks is small, but it becomes
    a real memory hog when the codebook is large. It requires N by M by O storage
    where N = number of obs, M = number of features, and O = number of codes.

    """
    obs = _asarray_validated(obs, check_finite=check_finite)
    code_book = _asarray_validated(code_book, check_finite=check_finite)
    d = shape(obs)[1]

    # code books and observations should have same number of features
    if not d == code_book.shape[1]:
        raise ValueError("""
            code book(%d) and obs(%d) should have the same
            number of features (eg columns)""" % (code_book.shape[1], d))

    # diff = obs[newaxis, :, :] - code_book[:, newaxis, :]
    # dist = sqrt(np.sum(diff * diff, -1))
    dist = dot(matutils.unitvec(obs[newaxis, :, :]), matutils.unitvec(code_book[:, newaxis, :]))
    code = argmin(dist, 0)
    min_dist = minimum.reduce(dist, 0)
    # The next line I think is equivalent and should be faster than the one
    # above, but in practice didn't seem to make much difference:
    # min_dist = choose(code, dist)
    return code, min_dist
def main(): """ Training follows procedure described in Kim et al. (2014), cf. https://www.aclweb.org/anthology/W/W14/W14-2517.pdf """ ALPHA = 0.01 NET_SIZE = 200 if len(sys.argv) < 6: raise Exception("""Provide 5+ arguments:\n\t1,path to save models\n\t2,path to corpora \t3,number of worker processes\n\t4,number of max. epochs\n\t5, minimum count \t6, hierarchic (0/1)\n\t7,neg sampling (0-20)\n\t8,downsampling (0-0.00001) 9+ files to train on (one model per file)""") model_path = sys.argv[1] corpus_path = sys.argv[2] workers = int(sys.argv[3]) epochs = int(sys.argv[4]) min_count = int(sys.argv[5]) hs = int(sys.argv[6]) negative = int(sys.argv[7]) sample = float(sys.argv[8]) files = sys.argv[9:] if not os.path.exists(model_path): os.makedirs(model_path) old_model = None for f in files: if not os.path.exists(os.path.join(corpus_path, f)): logging.info("skipping %s", f) continue logging.info("processing %s", f) model = gensim.models.Word2Vec( size=NET_SIZE, window=4, min_count=min_count, workers=workers, alpha=ALPHA, sg=1, hs=hs, negative=negative, sample=sample) corpus = Corpus(f, corpus_path) if old_model: update_vocab(corpus, old_model, model) else: model.build_vocab(corpus) # training to convergence epoch = 0 dist = 0 while epoch < epochs and dist < 0.99: epoch += 1 if epoch > 1: old_syn0 = copy(model.syn0) model.train(corpus) if epoch > 1: dist = sum([dot(unitvec(model.syn0[i]), unitvec( old_syn0[i])) for i in range(len(model.vocab))]) / len(model.vocab) old_model = model model.save(os.path.join(model_path, "model" + f)) logging.info("finished after %s epochs", epoch)
def __iter__(self):
    textual_lines = FileIOManager.read_textual_file()
    visual_file = open(FileIOManager.images_features_path, 'r')
    visual_file.readline()
    number_of_lines = 0
    for textual_line in textual_lines:
        number_of_lines += 1
        if self.limited_length is not None and number_of_lines > self.limited_length:
            break
        corpus_line_dict = dict()
        line_words = textual_line.split()
        textual_img_id = line_words[0]
        number_of_features = int(line_words[1])
        line_words = line_words[2:]
        for j in range(0, number_of_features * 2, 2):
            word = self.dictionary.processWord(line_words[j].decode('utf-8'))
            # Normalize weight
            weight = float(line_words[j + 1]) / 100000
            if word not in self.dictionary.word2id:
                continue
            # Get word id
            word_id = self.dictionary.word2id[word]
            if word_id not in corpus_line_dict:
                corpus_line_dict[word_id] = weight
            else:
                corpus_line_dict[word_id] += weight
        # Create array of tuples (word_id, weight) from dictionary
        corpus_line = []
        for key, value in corpus_line_dict.iteritems():
            corpus_line.append((key, value))
        # Normalize to unit vector
        corpus_line = matutils.unitvec(corpus_line)
        # Search for training images only for corresponding img
        visual_line = visual_file.readline().split()
        image_id = visual_line[0]
        while image_id != textual_img_id:
            visual_line = visual_file.readline().split()
            image_id = visual_line[0]
        # Append visual features
        corpus_line = corpus_line + utils.generate_corpus_for_image(visual_line[1:],
                                                                    self.dictionary.features_names2id)
        # Normalize to unit vector
        corpus_line = matutils.unitvec(corpus_line)
        yield corpus_line
def similarity(self, w1, w2):
    """
    Compute cosine similarity between two words.

    Example::

      >>> trained_model.similarity('woman', 'man')
      0.73723527

      >>> trained_model.similarity('woman', 'woman')
      1.0

    """
    return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
def add_documents(self, corpus): """Extend the index with new documents. Parameters ---------- corpus : iterable of list of (int, number) Corpus in BoW format. Notes ----- Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them (or when a query is issued). Examples -------- .. sourcecode:: pycon >>> from gensim.corpora.textcorpus import TextCorpus >>> from gensim.test.utils import datapath, get_tmpfile >>> from gensim.similarities import Similarity >>> >>> corpus = TextCorpus(datapath('testcorpus.mm')) >>> index_temp = get_tmpfile("index") >>> index = Similarity(index_temp, corpus, num_features=400) # create index >>> >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt')) >>> index.add_documents(one_more_corpus) # add more documents in corpus """ min_ratio = 1.0 # 0.5 to only reopen shards that are <50% complete if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize: # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard self.reopen_shard() for doc in corpus: if isinstance(doc, numpy.ndarray): doclen = len(doc) elif scipy.sparse.issparse(doc): doclen = doc.nnz else: doclen = len(doc) if doclen < 0.3 * self.num_features: doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm) else: doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm) self.fresh_docs.append(doc) self.fresh_nnz += doclen if len(self.fresh_docs) >= self.shardsize: self.close_shard() if len(self.fresh_docs) % 10000 == 0: logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
def __getitem__(self, query): """Get similarities of the given document or corpus against this index. Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally. Notes ----- Passing an entire corpus as `query` can be more efficient than passing its documents one after another, because it will issue queries in batches internally. Parameters ---------- query : {list of (int, number), iterable of list of (int, number)} Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents. Returns ------- {`scipy.sparse.csr.csr_matrix`, list of (int, float)} Similarities given document or corpus and objects corpus, depends on `query`. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if not matutils.ismatrix(query): if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparsity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [matutils.full2sparse_clipped(v, self.num_best) for v in result] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
def __getitem__(self, query): """Get access to similarities of document/corpus `query` to all documents in the corpus. Using :meth:`~gensim.interfaces.SimilarityABC.get_similarities` Notes ----- Passing corpus to `query` (instead of document) can be more efficient, because will processed in batching-way. Parameters ---------- query : {list of (int, int), iterable of list of (int, int)} Document or corpus in BoW format. Returns ------- {`scipy.sparse.csr.csr_matrix`, list of (int, float)} Similarities given document or corpus and objects corpus, depends on `query`. """ is_corpus, query = utils.is_corpus(query) if self.normalize: # self.normalize only works if the input is a plain gensim vector/corpus (as # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). if not matutils.ismatrix(query): if is_corpus: query = [matutils.unitvec(v) for v in query] else: query = matutils.unitvec(query) result = self.get_similarities(query) if self.num_best is None: return result # if maintain_sparsity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) # if the input query was a corpus (=more documents), compute the top-n # most similar for each document in turn if matutils.ismatrix(result): return [matutils.full2sparse_clipped(v, self.num_best) for v in result] else: # otherwise, return top-n of the single input document return matutils.full2sparse_clipped(result, self.num_best)
def reject_words_1(A, B, model=model):
    '''Takes two **LISTS OF WORDS** and returns most_similar results for the words in A,
    while rejecting words with meanings closer to B.
    Seems to work better than just passing in negative words.
    '''
    in_words = A + B
    basic_word = [model[each] for each in A]
    reject_word = [model[each] for each in B]
    basic_mean = matutils.unitvec(array(basic_word).mean(axis=0)).astype(REAL)
    reject_mean = matutils.unitvec(array(reject_word).mean(axis=0)).astype(REAL)
    r = reject(basic_mean, reject_mean)
    dists = np.linalg.linalg.dot(model.syn0norm, r)
    best = matutils.argsort(dists, topn=500, reverse=True)
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best if model.index2word[sim] not in in_words]
    return result
def wordInfluenceOnTopics(model, noOfWords=25):
    with open('../Data/topic_words.txt', 'w') as fout:
        for t in range(model.K):
            fout.write('================ TOPIC: %s ==============\n' % t)
            pq = PriorityQueue()
            for v in range(len(model.vocab)):
                word = model.index2word[v]
                if ('SENT' not in word) and ('TOPIC' not in word):
                    vec_word = model.word_impact[v][t]
                    similarity = dot(matutils.unitvec(vec_word),
                                     matutils.unitvec(model['TOPIC_' + str(t)]))
                    pq.put((similarity, word))
            for i in range(noOfWords):
                # print pq.get()
                fout.write(str(pq.get()))
                fout.write('\n')
def find_instances(self, patterns, instances, child_conn):
    updated_patterns = list()
    candidate_tuples = list()
    while True:
        try:
            t = instances.get_nowait()
            if instances.qsize() % 500 == 0:
                sys.stdout.write(
                    str(multiprocessing.current_process()) +
                    " Instances to process: " + str(instances.qsize()) + '\n')
                sys.stdout.flush()

            # measure similarity towards every extraction pattern
            max_similarity = 0
            pattern_best = None
            for p in patterns:
                good = 0
                bad = 0
                if self.config.alpha == 0 and self.config.gamma == 0:
                    for p_bet_v in list(p.bet_uniques_vectors):
                        if t.bet_vector is not None and p_bet_v is not None:
                            score = dot(
                                matutils.unitvec(t.bet_vector),
                                matutils.unitvec(asarray(p_bet_v))
                            )
                            if score >= self.config.threshold_similarity:
                                good += 1
                            else:
                                bad += 1
                if good > bad:
                    p.update_selectivity(t, self.config)
                    if score > max_similarity:
                        max_similarity = score
                        pattern_best = p

            # if it's above a threshold, associate the pattern with it
            if max_similarity >= self.config.threshold_similarity:
                candidate_tuples.append((t, pattern_best, max_similarity))

        except queue.Empty:
            print(multiprocessing.current_process(), "Queue is Empty")
            for p in patterns:
                updated_patterns.append(p)
            pid = multiprocessing.current_process().pid
            child_conn.send((pid, updated_patterns, candidate_tuples))
            break
def train(self, read_article_ids=None, unread_article_ids=None):
    # Load user feedback if needed
    if read_article_ids is None:
        read_article_ids = (r.article.id
                            for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))

    user_feedback = Article.objects(id__in=read_article_ids)

    # TODO: cluster feedback articles and save more than one profile
    num_loaded_articles = 0
    centroid = numpy.zeros(self.num_features_, dtype=numpy.float32)

    for article in user_feedback:
        try:
            article_features_as_full_vec = self.get_features(article)
        except Exception as inst:
            logger.error("Could not get features for article %s: %s" % (article.id, inst))
            continue

        # do we need this?
        tmp_doc = matutils.unitvec(article_features_as_full_vec)

        # add up tmp_doc
        centroid = numpy.add(centroid, tmp_doc)
        num_loaded_articles += 1

    # average each element
    if num_loaded_articles != 0:
        centroid = centroid / num_loaded_articles

    centroid = matutils.full2sparse(centroid)

    # set user model data
    self.user_model_features = [centroid]
def mean_word_vecs(model, positive=[], negative=[], skip_unknown=False):
    '''
    Compute a combined vector for the given words from a gensim Word2Vec model.
    This code is based on gensim.Word2Vec.most_similar.
    Returns None if none of the words are in the vocabulary.
    '''
    model.init_sims()

    # add weights for each word, if not already present; default to 1.0 for
    # positive and -1.0 for negative words
    positive = [(word, 1.0) for word in positive]
    negative = [(word, -1.0) for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, numpy.ndarray):
            mean.append(weight * word)
        elif word in model.vocab:
            mean.append(weight * model.syn0norm[model.vocab[word].index])
            # all_words.add(model.vocab[word].index)
        elif not skip_unknown:
            words = tools.word_segmenter_ja(word, np=False)
            words = [w for w in words if len(w.strip()) > 0]
            mean_ = mean_word_vecs(model, positive=words, skip_unknown=True)
            if mean_ is not None:
                mean.append(weight * mean_)
            # raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        # raise ValueError("cannot compute similarity with no input")
        return None
    mean = matutils.unitvec(numpy.array(mean).mean(axis=0)).astype(numpy.float32)
    return mean
def most_similar(self, positive=[], negative=[], topn=10):
    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [(word, 1.0) if isinstance(word, string_types + (ndarray,)) else word for word in positive]
    negative = [(word, -1.0) if isinstance(word, string_types + (ndarray,)) else word for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        elif word in self.vocab:
            mean.append(weight * self.syn0norm[self.vocab[word].index])
            all_words.add(self.vocab[word].index)
        else:
            raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.syn0norm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_words)]
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim]), self.syn0[sim]) for sim in best if sim not in all_words]
    return result[:topn]
def __getitem__(self, bow, eps=1e-12):
    """
    Return esa representation of the input vector and/or corpus.

    bow should already be weights, e.g. with TF-IDF
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    # as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # use corpus as interpreter matrix
    # simply multiply feature vector of input with corpus matrix
    # to get the weight of the concept
    vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

    # normalize
    vector = matutils.unitvec(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(concept_id, weight)
              for concept_id, weight in enumerate(vector)
              if abs(weight) > eps]
    return vector
def __getitem__(self, bow): """Get log entropy representation of the input vector and/or corpus. Parameters ---------- bow : list of (int, int) Document in BoW format. Returns ------- list of (int, float) Log-entropy vector for passed `bow`. """ # if the input vector is in fact a corpus, return a transformed corpus is_corpus, bow = utils.is_corpus(bow) if is_corpus: return self._apply(bow) # unknown (new) terms will be given zero weight (NOT infinity/huge) vector = [ (term_id, math.log(tf + 1) * self.entr.get(term_id)) for term_id, tf in bow if term_id in self.entr ] if self.normalize: vector = matutils.unitvec(vector) return vector
def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
             num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    self.maintain_sparsity = maintain_sparsity

    if corpus is not None:
        logger.info("creating sparse index")

        # iterate over input corpus, populating the sparse index matrix
        try:
            # use the more efficient corpus generation version, if the input
            # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
            num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
            logger.debug("using efficient sparse index creation")
        except AttributeError:
            # no MmCorpus, use the slower version (or maybe user supplied the
            # num_* params in constructor)
            pass
        if num_features is not None:
            # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
            num_terms = num_features
        if num_terms is None:
            raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
        corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                  (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                   matutils.unitvec(v)) for v in corpus)
        self.index = matutils.corpus2csc(
            corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
            dtype=dtype, printprogress=10000).T

        # convert to Compressed Sparse Row for efficient row slicing and multiplications
        self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
        logger.info("created %r", self.index)
def word_averaging(self, words):
    vecs = []
    for word in words:
        if isinstance(word, np.ndarray):
            vecs.append(word)
        elif word in self.wv.wv.vocab:
            id = self.wv.wv.vocab[word].index
            vecs.append(self.wv.wv.syn0norm[id])

    if not vecs:
        logging.getLogger(self.__class__.__name__).warning(
            "cannot compute similarity : %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(self.wv.layer1_size, )

    vec = np.array(vecs).mean(axis=0)
    vec = unitvec(vec).astype(np.float32)
    return vec
def combTest(model, w1, w2, pDict):
    rstStandard = model.similarity(w1, w2)  # gold-standard answer

    vec1c = model[w1] if len(w1) < 5 else vecMean(model, [i for i in w1 if i in model])
    vec2c = model[w2] if len(w2) < 5 else vecMean(model, [i for i in w2 if i in model])
    rstc = np.dot(matutils.unitvec(vec1c), matutils.unitvec(vec2c))

    vec1s = model[w1] if len(w1) < 5 else vecMean(model, pDict[w1])
    vec2s = model[w2] if len(w2) < 5 else vecMean(model, pDict[w2])
    rsts = np.dot(matutils.unitvec(vec1s), matutils.unitvec(vec2s))

    # the two K-Means variants follow
    vec1kc = model[w1] if len(w1) < 5 else vecKmean(model, w1, [i for i in w1 if i in model])
    vec2kc = model[w2] if len(w2) < 5 else vecKmean(model, w2, [i for i in w2 if i in model])
    rstkc = np.dot(matutils.unitvec(vec1kc), matutils.unitvec(vec2kc))

    vec1ks = model[w1] if len(w1) < 5 else vecKmean(model, w1, pDict[w1])
    vec2ks = model[w2] if len(w2) < 5 else vecKmean(model, w2, pDict[w2])
    rstks = np.dot(matutils.unitvec(vec1ks), matutils.unitvec(vec2ks))

    return rstStandard, rstc, rsts, rstkc, rstks
def _most_similar(self: WordEmbeddingsKeyedVectors, author, input_word):
    topn = 10
    positive = [input_word]
    self.init_sims()

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in positive
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * self.word_vec(word, use_norm=True))
            index = encode_adj(word, author)
            if index >= 0:
                all_words.add(index)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    limited = self.vectors_norm
    dists = dot(limited, mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(sim, float(dists[sim])) for sim in best if sim not in all_words]
    adj_res = [
        (decode(r[0], author), r[1])
        for r in result
        if encode_adj(decode(r[0], author), author) >= 0
    ]
    return adj_res[:topn]
def word_averaging(wv, words):
    all_words, mean = set(), []
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)
    print "biswa"
    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.layer_size,)

    mean = unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    print mean
    return mean
def _get_jieba_array(self, words):
    words = char_cleaner(words)
    seg_cut = jieba.lcut(words)
    seg_cut = char_list_cheaner(seg_cut)

    w2v_array = list()
    for word in seg_cut:
        try:
            similar_list = self.w2v_model[word]
            w2v_array.append(similar_list)
        except KeyError:
            continue

    if not w2v_array:
        w2v_array = [None] * self.size
    else:
        w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0))

    return w2v_array
def most_similar_paragraph(self, positive=[], negative=[], topn=10):
    """
    Find the top-N most similar paragraphs.
    """
    self.init_sims()

    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each paragraph, if not already present;
    # default to 1.0 for positive and -1.0 for negative paragraphs
    positive = [(paragraph, 1.0) if isinstance(paragraph, string_types + (ndarray,)) else paragraph
                for paragraph in positive]
    negative = [(paragraph, -1.0) if isinstance(paragraph, string_types + (ndarray,)) else paragraph
                for paragraph in negative]

    # compute the weighted average of all words
    all_paragraphs, mean = set(), []
    for paragraph, weight in positive + negative:
        if isinstance(paragraph, ndarray):
            mean.append(weight * paragraph)
        elif paragraph in self.paragraph_vocab:
            mean.append(weight * self.synparagraphnorm[self.paragraph_vocab[paragraph].index])
            all_paragraphs.add(self.paragraph_vocab[paragraph].index)
        else:
            raise KeyError("paragraph '%s' not in vocabulary" % paragraph)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.synparagraphnorm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_paragraphs)]
    # ignore (don't return) words from the input
    result = [(self.index2paragraph[sim], float(dists[sim]), sim)
              for sim in best if sim not in all_paragraphs]
    return result[:topn]
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None,
             chunksize=256, corpus_len=None):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.

    """
    if num_features is None:
        logger.warning(
            "scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    if corpus_len is None:
        corpus_len = len(corpus)

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError(
                "cannot index a corpus with zero features (you must specify either `num_features` "
                "or a non-empty corpus in the constructor)"
            )
        logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
        self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
            # individual documents in fact may be in numpy.scipy.sparse format as well.
            # it's not documented because it's not fully supported throughout.
            # the user better know what he's doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
def get_elmo_vector(sess, texts, batcher, sentence_character_ids, elmo_sentence_input, nrs):
    vectors = []

    # Create batches of data.
    sentence_ids = batcher.batch_sentences(texts)
    print('Sentences in this chunk:', len(texts), file=sys.stderr)

    # Compute ELMo representations.
    elmo_sentence_input_ = sess.run(elmo_sentence_input['weighted_op'],
                                    feed_dict={sentence_character_ids: sentence_ids})
    print('ELMo sentence input shape:', elmo_sentence_input_.shape, file=sys.stderr)

    for sentence, nr in zip(range(len(texts)), nrs):
        # query_word = texts[sentence][nr]
        # print(texts[sentence])
        query_vec = elmo_sentence_input_[sentence, nr, :]
        query_vec = unitvec(query_vec)
        # print('Vector shape:', query_vec.shape)
        vectors.append(query_vec)

    return vectors
def doesnt_match(self, words): """ Which word from the given list doesn't go with the others? Example:: >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split()) 'cereal' """ self.init_sims() words = [word for word in words if word in self.vocab] # filter out OOV words logger.debug("using words %s" % words) if not words: raise ValueError("cannot select a word from an empty list") vectors = vstack(self.syn0norm[self.vocab[word].index] for word in words).astype(REAL) mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) dists = dot(vectors, mean) return sorted(zip(dists, words))[0][1]
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
    Find the most similar sentences: dot product of the query sentence vector with the matrix.
    :param vec_ques: query sentence vector
    :param matrix_org_norm: normalized matrix of standard-question vectors
    :param matrix_org_index: index of the standard questions
    :param top_vec: how many top hits to return
    :return: list of [index, score]
    """
    # Normalize the query vector: scale a vector to unit length. The only exception is
    # the zero vector, which is returned back unchanged.
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)

    # Dot product of the query with every standard question in the question bank.
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)

    # Sort by similarity.
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)

    # Collect the index and score of the most similar standard questions.
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
def generate_corpus_for_image(features, features2id):
    '''
    Create sparse vector (feature_id, weight) from visual features.
    :param features:
    :param features2id:
    :return:
    '''
    image_corpus_line = []
    values = []
    for i in range(0, 4096):
        feature_value = float(features[i])
        if feature_value != 0:
            values.append(feature_value)
            image_corpus_line.append((features2id[i], feature_value))

    # Get only X top significant elements.
    image_corpus_line = [
        x for (y, x) in sorted(zip(values, image_corpus_line), reverse=True)
    ][0:number_of_elements]
    # image_corpus_line = sorted(enumerate(image_corpus_line), key=lambda item: item[1],
    #                            reverse=True)[0:number_of_elements]
    return matutils.unitvec(image_corpus_line)
def updated_normalize(x, n_n):
    """Normalizes the final tf-idf value according to the value of `n_n`.

    Parameters
    ----------
    x : numpy.ndarray
        Input array
    n_n : {'n', 'c'}
        Parameter that decides the normalizing function to be used.

    Returns
    -------
    numpy.ndarray
        Normalized array.

    """
    if n_n == "n":
        return x
    elif n_n == "c":
        return matutils.unitvec(x)
def doesnt_match(self, docs):
    """
    Which doc from the given list doesn't go with the others?

    (TODO: Accept vectors of out-of-training-set docs, as if from inference.)

    """
    self.init_sims()

    docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count]  # filter out unknowns
    logger.debug("using docs %s" % docs)
    if not docs:
        raise ValueError("cannot select a doc from an empty list")
    vectors = vstack(self.doctag_syn0norm[self._int_index(doc)] for doc in docs).astype(REAL)
    mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
    dists = dot(vectors, mean)
    return sorted(zip(dists, docs))[0][1]
def get_w2v_vectors(text, merge_vectors=False):
    '''
    Translates text into vectors using word2vec.

    Args:
        text: str
        merge_vectors: bool, return sentence-by-sentence vectors or their mean

    Returns:
        numpy.ndarray (if merge_vectors) OR
        dict, where key (str) is a sentence, value (numpy.ndarray) is a vector
    '''
    sentences = preprocessing(text, stopwords=stopwords.words('russian'))
    # one vector per preprocessed sentence
    vectors = [get_w2v_vector(sentence) for sentence in sentences]
    if vectors == []:
        return None
    if merge_vectors:
        return matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)
    return {sentence: vector for sentence, vector in zip(text, vectors)}
def calc_norm(self, corpus):
    """Calculate the norm by calling :func:`~gensim.matutils.unitvec` with the norm parameter.

    Parameters
    ----------
    corpus : iterable of iterable of (int, number)
        Input corpus.

    """
    logger.info("Performing %s normalization...", self.norm)
    norms = []
    numnnz = 0
    docno = 0
    for bow in corpus:
        docno += 1
        numnnz += len(bow)
        norms.append(matutils.unitvec(bow, self.norm))
    self.num_docs = docno
    self.num_nnz = numnnz
    self.norms = norms
def raw2ppmi(cooccur, k_shift=1.0):
    # following lines a bit tedious, as we try to avoid making temporary copies of the (large) `cooccur` matrix
    marginal_word = cooccur.sum(axis=1)
    marginal_context = cooccur.sum(axis=0)
    cooccur /= marginal_word[:, None]
    cooccur /= marginal_context
    cooccur *= marginal_word.sum()
    np.log(cooccur, out=cooccur)

    # shifting PMI scores by log(k)
    cooccur -= np.log(k_shift)

    # clipping values to be non-negative
    cooccur.clip(0.0, out=cooccur)

    # normalize PPMI vectors to unit length
    for i, vec in enumerate(cooccur):
        cooccur[i] = matutils.unitvec(vec)

    return cooccur
def _to_csv(df, col, size):
    file_name = '{col}_w2v.csv'.format(col=col)
    file_path = os.path.join(TEMP_DATA_PATH, file_name)
    if os.path.exists(file_path):
        os.remove(file_path)

    columns = ['{}_w2v_{}'.format(col, i) for i in range(size)]
    none_index_set = set()

    with open(file_path, 'a', encoding='utf-8') as f:
        # write columns
        f.write(','.join(columns) + '\n')

        for idx, item in tqdm(df[col].items()):
            if item == 'null':
                item_list = [''] * size
                none_index_set.add(idx)
            elif not item:
                item_list = [''] * size
                none_index_set.add(idx)
            else:
                seg_cut = jieba.lcut(item)
                seg_cut = char_list_cheaner(seg_cut)

                w2v_array = list()
                for word in seg_cut:
                    try:
                        similar_list = w2v_model[word]
                        w2v_array.append(similar_list)
                    except KeyError:
                        pass

                if not w2v_array:
                    item_list = [''] * size
                    none_index_set.add(idx)
                else:
                    item_list = matutils.unitvec(np.array(w2v_array).mean(axis=0))

            f.write(','.join(map(str, item_list)) + '\n')

    return none_index_set
def get_elmo_vector_average(sess, texts, batcher, sentence_character_ids, elmo_sentence_input):
    vectors = []

    # Create batches of data.
    sentence_ids = batcher.batch_sentences(texts)
    print('Sentences in this chunk:', len(texts), file=sys.stderr)

    # Compute ELMo representations.
    elmo_sentence_input_ = sess.run(elmo_sentence_input['weighted_op'],
                                    feed_dict={sentence_character_ids: sentence_ids})
    print('ELMo sentence input shape:', elmo_sentence_input_.shape, file=sys.stderr)

    for sentence in range(len(texts)):
        sent_vec = np.zeros((elmo_sentence_input_.shape[1], elmo_sentence_input_.shape[2]))
        for word_vec in enumerate(elmo_sentence_input_[sentence, :, :]):
            sent_vec[word_vec[0], :] = word_vec[1]
        semantic_fingerprint = np.sum(sent_vec, axis=0)
        semantic_fingerprint = np.divide(semantic_fingerprint, sent_vec.shape[0])
        query_vec = unitvec(semantic_fingerprint)
        vectors.append(query_vec)

    return vectors
def _get_jieba_array(self, words, size=300):
    '''
    Segment the input `words` with jieba, look up each token's word vector,
    and use the mean of those vectors as the vector for `words`.
    '''
    seg_cut = jieba.lcut(words)
    seg_cut = char_list_cheaner(seg_cut)

    w2v_array = list()
    for word in seg_cut:
        try:
            similar_list = self.w2v_model[word]
            w2v_array.append(similar_list)
        except KeyError:
            continue

    if not w2v_array:
        w2v_array = [None] * size
    else:
        w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0))

    return w2v_array
def shift_clip_pmi(pmimtr, k_shift=1.0):
    """
    Turn a PMI matrix into a shifted PPMI matrix: shift all values by -log(k)
    and then set negative values to 0.

    :param pmimtr: The matrix of PMI values.
    :param k_shift: The shift factor.
    :return: A PPMI matrix.
    """
    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
    pmimtr -= np.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(pmimtr):
        pmimtr[i] = matutils.unitvec(vec)

    return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T
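# Tiny self-contained illustration of the shift-and-clip step above
# (PMI values and k invented for the example): SPPMI = max(0, PMI - log(k)).
import numpy as np

pmi = np.array([[1.5, -0.2],
                [0.3, 2.0]])
k_shift = 2.0
sppmi = np.clip(pmi - np.log(k_shift), 0.0, None)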
def mostSimilarSent(self, sent, query, allDoc, topn):
    words2 = query.split()
    try:
        words2.remove(u'\ufeff')
    except ValueError:
        words2 = words2
    v2 = numpy.array([self[word] for word in words2], dtype=object)
    mean = matutils.unitvec(array(v2).mean(axis=0))
    print "starting search dist"
    dists = dot(allDoc[0:None], mean)
    best = matutils.argsort(dists, topn, reverse=True)
    print "done!"
    result = []
    for index in best:
        result.append(sent[index])
    return result
def word_averaging(wv, words): """Calculate average word vectors. Args: wv: The keyed vectors instance to use to get word vectors as :class:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors`. words: The words to transform into vectors as :class:`list` of :class:`str`. Returns: The averaged vector as :class:`list` of :class:`float`. """ all_words, mean = set(), [] for word in words: if isinstance(word, np.ndarray): mean.append(word) elif word in wv.vocab: mean.append(wv.word_vec(word, use_norm=True)) all_words.add(wv.vocab[word].index) mean = unitvec(np.array(mean).mean(axis=0)).astype(np.float32) return mean
def most_similar(self, positive=[], negative=[], topn=10):
    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, ndarray):
            mean.append(weight * word)
        elif word in self.vocab:
            mean.append(weight * self.syn0norm[self.vocab[word].index])
            all_words.add(self.vocab[word].index)
        else:
            raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

    dists = dot(self.syn0norm, mean)
    if not topn:
        return dists
    best = argsort(dists)[::-1][:topn + len(all_words)]
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim]), self.syn0[sim])
              for sim in best if sim not in all_words]
    return result[:topn]
def similarity_3_contexts(self, p, t):
    (bef, bet, aft) = (0, 0, 0)

    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))

    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))

    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))

    return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft
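# Illustration of the weighted three-context score above, with invented context
# vectors and invented weights (alpha=0.2, beta=0.6, gamma=0.2); in the snippet
# the weights come from self.config.
import numpy as np
from gensim import matutils

def cosine(a, b):
    return np.dot(matutils.unitvec(a), matutils.unitvec(b))

bef_t, bef_p = np.array([1.0, 0.0]), np.array([1.0, 1.0])
bet_t, bet_p = np.array([0.0, 1.0]), np.array([0.0, 2.0])
aft_t, aft_p = np.array([1.0, 1.0]), np.array([1.0, 0.0])
score = 0.2 * cosine(bef_t, bef_p) + 0.6 * cosine(bet_t, bet_p) + 0.2 * cosine(aft_t, aft_p)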
def __init__(self, corpus, num_best=None, chunksize=500, dtype=numpy.float32,
             num_terms=None, num_docs=None, num_nnz=None):
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize

    if corpus is not None:
        logger.info("creating sparse index")

        # iterate over input corpus, populating the sparse index matrix
        try:
            # use the more efficient corpus generation version, if the input
            # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
            num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
            logger.debug("using efficient sparse index creation")
        except AttributeError:
            # no MmCorpus, use the slower version (or maybe user supplied the
            # num_* params in constructor)
            pass
        corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                  (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                   matutils.unitvec(v)) for v in corpus)
        self.index = matutils.corpus2csc(corpus, num_terms=num_terms, num_docs=num_docs,
                                         num_nnz=num_nnz, dtype=dtype, printprogress=10000).T

        # convert to Compressed Sparse Row for efficient row slicing and multiplications
        self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
        logger.info("created %r" % self.index)
def get_w2v_vector(sentence):
    '''
    Translates sentence into vector using word2vec.

    Args:
        sentence: list of strings

    Returns:
        numpy.ndarray
    '''
    all_words, mean = set(), []
    for word in sentence:
        if word in w2v_model.wv.vocab:
            mean.append(w2v_model.wv.word_vec(word))
            all_words.add(w2v_model.wv.vocab[word].index)
    if mean == []:
        return np.zeros(w2v_model.layer1_size, )
    mean = matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean
def doesnt_match(self, words): """ Which word from the given list doesn't go with the others? Example:: >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split()) 'cereal' """ self.init_sims() used_words = [word for word in words if word in self] if len(used_words) != len(words): ignored_words = set(words) - set(used_words) logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words) if not used_words: raise ValueError("cannot select a word from an empty list") vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL) mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) dists = dot(vectors, mean) return sorted(zip(dists, used_words))[0][1]
def get_synsets_of_rule_parse(self, dp, use_offset=True, convert=False):
    try:
        context = ' '.join(dp['captions'])
    except TypeError:
        # NaN is in list
        context = ' '.join([item for item in dp['captions'] if isinstance(item, str)])
    tokens = self.tokenize(context)
    vecs = self.get_vecs_for_BOW(context)
    context_vec = normalize_vec(array(vecs).mean(axis=0))
    context_vec = matutils.unitvec(context_vec)
    pos_tagged_context_dict = {
        self.lemmatize(k): v for k, v in self.tagger.tag(tokens)
    }  # <token>: <pos>
    new_atoms = []
    unique_entities = set([x for atom in dp['atoms'] for x in atom])
    entity_id_dict = {}
    for entity in unique_entities:
        try:
            pt_pos = pos_tagged_context_dict[entity]
        except KeyError:
            pt_pos = self.tagger.tag([entity])[0][-1]
        pos = self.pos_map[pt_pos]
        synset = self.link_word_to_wn(entity, context_vec, context_as_vec=True, pos=pos)
        # synset = self.link_word_to_wn(entity, context, pos=pos)
        if synset is None:
            offset = None
        else:
            offset = synset.offset()
        if use_offset:
            entity_id_dict[entity] = offset
        else:
            entity_id_dict[entity] = synset
    for atom in dp['atoms']:
        new_atom = []
        for entity in atom:
            new_atom.append((entity, entity_id_dict[entity]))
        new_atoms.append(new_atom)

    return convert_logical_caption(new_atoms) if convert else new_atoms
def saveSentenceVect(self, sent, loc):
    # be patient! this operation should take a long time
    allDoc = []
    print "starting calculate vectors"
    for phrase, title in sent:
        s = phrase
        words1 = s.split()
        try:
            words1.remove(u'\ufeff')
        except ValueError:
            words1 = words1
        v1 = numpy.array([self[word] for word in words1], dtype=object)
        allDoc.append(matutils.unitvec(array(v1).mean(axis=0)))
    print "done!"
    # numpy.savez('obj/vect.npz', *allDoc)
    print "save to a file..."
    outfile = open(loc + ".pkl", "w")
    numpy.save(loc, allDoc)
    print "done!"
def classify(self, instance):
    """Classify a text instance

    Returns:
        distribution: dict {class: probability}
    """
    distribution = {}
    words = instance.text.split()
    test_vec = self.model.infer_vector(words, steps=self.infer_num_passes)
    test_vec = unitvec(test_vec)
    for class_value, training_instances in self.training_data.items():
        best_score = 0
        for training_instance in training_instances:
            score = np.dot(test_vec, training_instance)
            if score > best_score:
                best_score = score
        distribution[class_value] = max(0, best_score)
    return self._normalize_distribution(distribution)