def _from_kvs_to_wvs(self, kvs: "KeyedVectors") -> "WordVectors":
    """Convert a gensim ``KeyedVectors`` instance into a ``WordVectors`` object.

    Copies every vector out of *kvs* (``np.array(...)`` makes an owned copy
    rather than a view into gensim's internal matrix).

    :param kvs: loaded gensim KeyedVectors (pre-4.0 API: exposes ``.vocab``).
        NOTE(review): also assumes a ``.name`` attribute — presumably set by
        this project, not stock gensim; verify against the loader.
    :return: WordVectors with the same name, dimensionality and vectors.
    """
    # Iterate vocab keys directly: the original looped over .items() but
    # never used the values.
    vectors = {word: np.array(kvs.get_vector(word)) for word in kvs.vocab}
    return WordVectors(kvs.name, kvs.vector_size, vectors)
def texts_to_vectors(self, wordvectors: "KeyedVectors", descriptor_name: str, use_idf=False):
    """
    Map each document's words to vectors from *wordvectors* and average them
    into one vector per document.

    :param wordvectors: KeyedVectors-like object exposing ``get_vector(word)``
        (raising KeyError for unknown words) and ``vector_size``.
    :param descriptor_name: Name of the descriptor (informational only).
    :param use_idf: When True, weight each word vector by its IDF score,
        lazily building ``self.dct`` / ``self.tfidf`` from ``self.x``.
    :return: list with one numpy vector per document in ``self.x``.
    """
    if use_idf:
        # Lazily build the dictionary / tf-idf model the first time they
        # are needed; both are derived from the full corpus self.x.
        if self.dct is None:
            self.dct = Dictionary([x.split(" ") for x in self.x])
        if self.tfidf is None:
            self.tfidf = TfidfModel(
                [self.dct.doc2bow(document.split(" ")) for document in self.x])
    new_x = []
    vectorized_counter = 0
    not_vectorized_counter = 0
    for document in self.x:
        document_vector_accum = None
        weight_accum = 0
        for word in document.split(" "):
            # Keep the try body minimal: only the lookup can raise here.
            try:
                vector = wordvectors.get_vector(word)
            except KeyError:
                # print("warning: word: \"{}\" not found in {} vectors".format(word, descriptor_name))
                not_vectorized_counter += 1
                continue
            if use_idf:
                try:
                    idf = self.tfidf.idfs[self.dct.token2id[word]]
                except KeyError:
                    print("warning: idf not found for {}".format(word))
                    continue
            else:
                idf = 1  # simple mean
            if document_vector_accum is None:
                document_vector_accum = vector * idf
            else:
                document_vector_accum += vector * idf
            weight_accum += idf
            vectorized_counter += 1
        if document_vector_accum is None or weight_accum == 0:
            # No word of this document could be vectorized (or all idf
            # weights were zero): emit a zero vector instead of crashing
            # on None / weight_accum or dividing by zero.
            new_x.append(np.zeros(wordvectors.vector_size))
        else:
            new_x.append(document_vector_accum / weight_accum)
    print("info: done converting. vectorized {}; skipped {}".format(vectorized_counter, not_vectorized_counter))
    return new_x
def test_load_multi(self):
    """Merging several models by horizontal stacking yields a KeyedVectors
    whose dimensionality is the sum of the source dimensionalities; tokens
    missing from a source are padded with zeros."""
    w2v_path = TEST_DIR / 'doc2vec_w2v.txt'
    doc_vectors = KeyedVectors.load_word2vec_format(w2v_path)
    glove_vectors = KeyedVectors.load_word2vec_format(
        TEST_DIR / 'glove.6B.200d.w2vformat.1k.txt')
    sources = [doc_vectors, doc_vectors, doc_vectors, glove_vectors]

    combined_size = np.sum([src.vector_size for src in sources])
    self.assertEqual(sources[0].vector_size * 3 + 200, combined_size)

    # Build new keyed vector model
    merged = KeyedVectors(vector_size=combined_size)
    # Iterate over all words (in first model)
    for token in sources[0].index2word:
        # Stack vectors from all models, zero-padding where a source
        # does not contain the token.
        parts = []
        for src in sources:
            if token in src.index2word:
                parts.append(src.get_vector(token))
            else:
                print(f'WARNING: {token} does not exist in {src}')
                parts.append(np.zeros((src.vector_size)))
        merged.add(token, np.hstack(parts))

    self.assertEqual(300 + 200, merged.get_vector('0').shape[0])
def check_embedding_coverage(vocabulary: Dict[str, int], keyed_vectors: "KeyedVectors"):
    """See what words from the vocabulary are not represented in the word vectors.

    Output information about the OOV (out of vocabulary) terms.

    :param vocabulary: Dictionary with words as keys and frequencies of the
        words in the corpus as values. A plain iterable of words is also
        accepted; each word then counts with weight 1.
    :param keyed_vectors: gensim.model.KeyedVectors instance containing the
        word vectors.
    :return: OOV entries as a list of (word, weight) pairs sorted by
        descending weight.
    """
    # Hoist the type check out of the loop; use the builtin `dict` rather
    # than the typing alias for isinstance.
    has_frequencies = isinstance(vocabulary, dict)
    cov = {}  # Covered words dictionary
    oov = {}  # Out of vocabulary dictionary
    covered_words = 0
    oov_words = 0
    for word in tqdm(vocabulary, desc="Words checked"):
        # Each word weighs its corpus frequency when available, else 1.
        # (The original added the word *string* itself to the counters in
        # the non-dict case, which raised TypeError.)
        weight = vocabulary[word] if has_frequencies else 1
        try:
            cov[word] = keyed_vectors.get_vector(word)
            covered_words += weight
        except KeyError:
            oov[word] = weight
            oov_words += weight
    # Guard the ratios against an empty vocabulary.
    found_vocab_vectors = len(cov) / len(vocabulary) if vocabulary else 0.0
    total_weight = covered_words + oov_words
    found_vocab_all_text = covered_words / total_weight if total_weight else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(found_vocab_vectors))
    print('Found embeddings for {:.2%} of all text'.format(
        found_vocab_all_text))
    # Most frequent missing words first.
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_oov