def load_annoy(annoypath, model):
    '''
    :param annoypath: path of the Annoy index file
    :type annoypath: str
    :param model:
    :type model: Word2Vec
    :return: the loaded (or newly built) index
    :rtype: AnnoyIndexer
    '''
    if not os.path.exists(annoypath):
        print("Start building Annoy index, current time: " + time.asctime(time.localtime(time.time())))
        starttime12 = time.time()
        aindex = AnnoyIndexer(model, 200)
        print("Finished building index in %.2f secs" % (time.time() - starttime12))
        # save the Annoy index
        print("Start saving Annoy index")
        starttime13 = time.time()
        aindex.save(annoypath)
        print("Finished saving index in %.2f secs" % (time.time() - starttime13))
    else:
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        aindex.model = model  # a loaded AnnoyIndexer does not keep its model; re-attach it
    return aindex
def load_index_investment(self, path):
    index = AnnoyIndexer()
    for parent, dirnames, filenames in os.walk(path):
        for filename in filenames:
            # the generated B.ind.d file cannot be loaded; only B.ind can be loaded
            if len(filename.split('.')) == 2:
                logger.info(u'filename: %s, path: %s' % (str(filename.split('.')[0]), os.path.join(parent, filename)))
                index = AnnoyIndexer()
                index.load(os.path.join(parent, filename))
    return index
def get_annoy(w2v, embedding_type='w2v'):
    dims = 100
    annoy_file_name = data_dir + '/annoy_index_' + '_' + str(dims) + '_' + embedding_type + '_' + str(len(w2v.vocab))
    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        # note: the second AnnoyIndexer argument is the number of trees, not the vector dimensionality
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
def __init__(self, vec_file, pap, pat, pro):
    # self.wm = gensim.models.KeyedVectors.load_word2vec_format(vec_file, binary=True)
    self.wm = gensim.models.word2vec.Word2Vec.load_word2vec_format(
        vec_file, binary=True)
    self.paper_index = AnnoyIndexer()
    self.paper_index.load(pap)
    self.patent_index = AnnoyIndexer()
    self.patent_index.load(pat)
    self.project_index = AnnoyIndexer()
    self.project_index.load(pro)
    self.t2v = Convert2Vec(self.wm)
    self.cuttor = FilterCut()
    self.db = DB()
    self.featureIndex = self.buildFeatureIndex()
def index_vector(self, dimensions=300, save=False):
    '''
    Build the AnnoyIndexer used by 'is_word_pairs_similar'.
    With the Annoy index, execution may be slower than with the normal (exact) index.
    '''
    path = Path.cwd().parent.joinpath('preprocessed/annoy.index')
    if path.exists():
        annoy_index = AnnoyIndexer()
        annoy_index.load(str(path))
        annoy_index.model = self.embedding
    else:
        # note: the second AnnoyIndexer argument is the number of trees
        annoy_index = AnnoyIndexer(self.embedding, dimensions)
        if save:
            annoy_index.save(str(path))
    return annoy_index
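# Usage sketch (not from the original code): the indexer returned by index_vector is
# handed to gensim's most_similar via the `indexer` argument, which is how a helper
# like 'is_word_pairs_similar' would swap the exact search for the approximate Annoy
# search. `helper` and the query word are placeholders, and the sketch assumes
# self.embedding is a gensim Word2Vec model.
annoy_index = helper.index_vector(dimensions=300, save=True)
neighbours = helper.embedding.wv.most_similar('bank', topn=10, indexer=annoy_index)
for word, score in neighbours:
    print(word, score)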
def get_similarity(model, args):
    indexer = AnnoyIndexer(model, 10)
    i = 0
    chunk_i = 0
    item = []
    similarity = []
    with tqdm.tqdm(desc="get_similarity", total=len(model.wv.vectors)) as progress:
        for word in np.sort(list(model.wv.vocab.keys())):
            item.append(word)
            similarity.append([
                '{}={}'.format(cword, cscore)
                for cword, cscore in model.wv.most_similar(word, topn=args.k, indexer=indexer)
            ])
            i += 1
            if i % args.save_one_time == 0:
                print("save to csv chunk no: {}".format(chunk_i))
                topk_df = pd.DataFrame({'item': item, 'topk': similarity})
                topk_df.to_csv(args.output_file, mode='a', header=False, index=False)
                i = 0
                chunk_i += 1
                item = []
                similarity = []
            progress.update(1)
    # flush the remaining rows that did not fill a full chunk
    if i > 0:
        print("save to csv chunk no: {}".format(chunk_i))
        topk_df = pd.DataFrame({'item': item, 'topk': similarity})
        topk_df.to_csv(args.output_file, mode='a', header=False, index=False)
    return similarity
def load_index(self, path):
    index = {}
    nn = 0
    for field in field_list:
        logger.info(u'---------field:' + field)
        index[field] = {}
        for unit_type in unit_types:
            index[field][unit_type] = {}
            for parent, dirnames, filenames in os.walk(path + field + '/' + unit_type + '/'):
                for filename in filenames:
                    # if len(filename.split('.')) == 2 and (
                    #         'A5' in os.path.join(parent, filename) or 'project' in os.path.join(parent, filename)):
                    if len(filename.split('.')) == 2:
                        nn = nn + 1
                        logger.info(
                            u'Creating AnnoyIndexer %s: field=%s, unit_type=%s, province=%s' %
                            (nn, field, unit_type, filename.split('.')[0]))
                        index[field][unit_type][str(
                            filename.split('.')[0])] = AnnoyIndexer()
                        index[field][unit_type][str(
                            filename.split('.')[0])].load(
                                os.path.join(parent, filename))
    return index
def _load_classifier(self, **kwargs):
    if self.classifier_type == 'ann':
        for f in list_files(self.s3_conn, self.s3_path):
            filepath = os.path.join(self.temporary_directory, f)
            if not os.path.exists(filepath):
                logging.warning('calling download from %s to %s', self.s3_path + f, filepath)
                download(self.s3_conn, filepath, os.path.join(self.s3_path, f))
        ann_index = AnnoyIndexer()
        ann_index.load(
            os.path.join(self.temporary_directory, self.classifier_id + '.index'))
        return NearestNeighbors(s3_conn=self.s3_conn, indexer=ann_index, **kwargs)
    elif self.classifier_type == 'knn':
        return NearestNeighbors(s3_conn=self.s3_conn, indexed=False, **kwargs)
    else:
        print('Not implemented yet!')
        return None
def build_graph(filename, TOPN, A_name, indice2word_name, annoy=False, dim=100, tree_num=20):
    """Build a sparse top-N word-similarity graph from a word2vec model and save it to disk."""
    model = read_w2v(filename, dim)
    V = len(model.wv.vocab)
    print("Num. vocab = %i" % V)
    word_indice_dic = {word: i for i, word in enumerate(model.wv.vocab)}
    indice2word = {i: word for word, i in word_indice_dic.items()}
    A = dok_matrix((V, V), dtype=np.float32)
    if annoy:
        print("Using ANNOY...")
        from gensim.similarities.index import AnnoyIndexer
        annoy_index = AnnoyIndexer(model, tree_num)
        add_neighbors(A, TOPN, model, word_indice_dic, annoy_index=annoy_index)
    else:
        add_neighbors(A, TOPN, model, word_indice_dic)
    save_sparse_csr(A_name, A.tocsr())
    pickle.dump(indice2word, open(indice2word_name, "wb"))
def _initialize_nns_method(self, nnsmethod, annoymodelpath):
    if self.nns_method_init_dict[nnsmethod]:
        return
    if nnsmethod == NNSMethod.KD_TREE:
        print("Building KD tree..")
        self.tree = cKDTree(self.model.docvecs.vectors_docs)
        print("Finished building KD tree.")
        self.keys = list(self.model.docvecs.doctags.keys())
    elif nnsmethod == NNSMethod.ANNOY:
        if not os.path.isfile(annoymodelpath):
            print("Generating annoy index...")
            self.annoy_indexer = AnnoyIndexer(self.model, 50)
            print("Finished generating annoy index.")
            self.annoy_indexer.save(annoymodelpath)
        else:
            self.annoy_indexer = AnnoyIndexer()
            self.annoy_indexer.load(annoymodelpath)
            self.annoy_indexer.model = self.model
def create_sim_dict(file, vectors, min_sim=0.55, topn=10, num_trees=200):
    indexer = AnnoyIndexer(vectors, num_trees=num_trees)
    sim_dict = dict()
    for w in messages.pbar(vectors.vocab):
        sim = indexer.most_similar(vectors.get_vector(w), topn)
        sim_dict[w] = [s for s in sim if s[1] > min_sim]
    with open(file, 'wb') as fileout:
        pickle.dump(sim_dict, fileout)
def similar_augment(texts, labels, n_increase, n_word_replace, model_path,
                    similar_threshold=0.5, use_annoy=True, annoy_path=None):
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
    texts_long = []
    labels_long = []
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)
    for ind in range(len(texts)):
        if len(texts[ind]) >= n_word_replace:
            texts_long.append(texts[ind])
            labels_long.append(labels[ind])
    shuffle_ind = np.random.choice(len(texts_long), size=n_increase)
    for ind in shuffle_ind:
        text_copy = copy.deepcopy(texts_long[ind])
        # if is_hier:
        replace_inds = np.random.choice(text_copy.shape[-1], size=n_word_replace, replace=False)
        for word_ind in replace_inds:
            word = text_copy[word_ind]
            try:
                closest, score = w2v.wv.most_similar(
                    word, topn=2, indexer=indexer if use_annoy else None)[1]
                if score > similar_threshold:
                    text_copy[word_ind] = closest
            except KeyError:
                # skip words that are not in the embedding vocabulary
                continue
        texts.append(text_copy)
        labels = np.append(labels, [labels_long[ind]])
    return texts, labels
def testAnnoyIndexingOfKeyedVectors(self):
    from gensim.similarities.index import AnnoyIndexer
    keyVectors_file = datapath('lee_fasttext.vec')
    model = KeyedVectors.load_word2vec_format(keyVectors_file)
    index = AnnoyIndexer(model, 10)
    self.assertEqual(index.num_trees, 10)
    self.assertVectorIsSimilarToItself(model, index)
    self.assertApproxNeighborsMatchExact(model, model, index)
def load_indexer(self, model_name):
    if self.full_log is not None:
        self.full_log.info("Loading word model indexer...")
    indexer = None
    if self.settings["use_annoy_indexer"]:
        if model_name == "glove-twitter-100":
            if self.full_log is not None:
                self.full_log.debug("Loading Twitter 100")
            indexer = AnnoyIndexer()
            indexer.load(
                r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440" +
                r"\MajorProject\models\glove-twitter-100-5-trees.ann")
        elif model_name == "glove-twitter-200":
            if self.full_log is not None:
                self.full_log.debug("Loading Twitter 200")
            indexer = AnnoyIndexer()
            indexer.load(
                r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440" +
                r"\MajorProject\models\glove-twitter-200-5-trees.ann")
        elif model_name == "glove-wiki-300":
            if self.full_log is not None:
                self.full_log.debug("Loading Wiki 300")
            indexer = AnnoyIndexer()
            indexer.load(
                r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440" +
                r"\MajorProject\models\glove-wiki-300-5-trees.ann")
        elif model_name == "glove-wiki-100":
            if self.full_log is not None:
                self.full_log.debug("Loading Wiki 100")
            indexer = AnnoyIndexer()
            indexer.load(
                r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440" +
                r"\MajorProject\models\glove-wiki-100-5-trees.ann")
        if self.full_log is not None:
            self.full_log.info("Done loading model indexer")
    else:
        if self.full_log is not None:
            self.full_log.warning("No indexer selected, using default")
    return indexer
def get_indexer(fpath, model, room_id):
    if os.path.exists(fpath):
        logging.info("Use annoy_index :: room_id:%s", room_id)
        annoy_index = AnnoyIndexer()
        annoy_index.load(fpath)
        annoy_index.model = model
        return annoy_index
    else:
        # indexer: default is None
        return None
def f(process_id):
    print('Process Id: {}'.format(os.getpid()))
    process = psutil.Process(os.getpid())
    new_model = Word2Vec.load('/tmp/mymodel.pkl')
    vector = new_model.wv["science"]
    annoy_index = AnnoyIndexer(new_model, 100)
    approximate_neighbors = new_model.wv.most_similar([vector], topn=5, indexer=annoy_index)
    print('\nMemory used by process {}: {}\n---'.format(
        os.getpid(), process.memory_info()))
def train_with_annoy(self):
    self.status = 3
    push.push_to_rtx(push.generate_rtx_markdown("Started building the Annoy vector space"))
    self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
    fname = 'tc_index_genoy.index'
    self.annoy_index.save(fname)
    # Export the built index; later it can simply be loaded:
    # annoy_index = AnnoyIndexer()
    # annoy_index.load(fname)
    # annoy_index.model = tc_wv_model
    push.push_to_rtx(push.generate_rtx_markdown("Finished building the Annoy vector space"))
    self.status = 4
def assertLoadedIndexEqual(self, index, model):
    from gensim.similarities.index import AnnoyIndexer

    index.save('index')
    index2 = AnnoyIndexer()
    index2.load('index')
    index2.model = model
    self.assertEqual(index.index.f, index2.index.f)
    self.assertEqual(index.labels, index2.labels)
    self.assertEqual(index.num_trees, index2.num_trees)
def testSaveLoad(self):
    from gensim.similarities.index import AnnoyIndexer

    self.index.save('index')
    self.index2 = AnnoyIndexer()
    self.index2.load('index')
    self.index2.model = self.model
    self.assertEqual(self.index.index.f, self.index2.index.f)
    self.assertEqual(self.index.labels, self.index2.labels)
    self.assertEqual(self.index.num_trees, self.index2.num_trees)
def __init__(self, path_to_dictionary="mipt_vecs.w2v", indexer=None, cache_dict=False, partition=0.5):
    # if the dictionary file is missing, it should be downloaded first
    self.model = gensim.models.KeyedVectors.load_word2vec_format(
        path_to_dictionary, binary=True, unicode_errors="ignore")
    self.annoy_index = (AnnoyIndexer(self.model, num_trees=10)
                        if (indexer == "annoy") else None)
    self.replace_dict = dict() if cache_dict else None
    self.partition = partition
def setUp(self):
    try:
        import annoy
    except ImportError:
        raise unittest.SkipTest("Annoy library is not available")

    from gensim.similarities.index import AnnoyIndexer

    self.model = doc2vec.Doc2Vec(sentences, min_count=1)
    self.model.init_sims()
    self.index = AnnoyIndexer(self.model, 300)
    self.vector = self.model.docvecs.doctag_syn0norm[0]
def setUp(self):
    try:
        import annoy
    except ImportError:
        raise unittest.SkipTest("Annoy library is not available")

    from gensim.similarities.index import AnnoyIndexer

    self.model = word2vec.Word2Vec(texts, min_count=1)
    self.model.init_sims()
    self.index = AnnoyIndexer(self.model, 10)
    self.vector = self.model.syn0norm[0]
def assertLoadedIndexEqual(self, index, model):
    from gensim.similarities.index import AnnoyIndexer

    fname = get_tmpfile('gensim_similarities.tst.pkl')
    index.save(fname)
    index2 = AnnoyIndexer()
    index2.load(fname)
    index2.model = model
    self.assertEqual(index.index.f, index2.index.f)
    self.assertEqual(index.labels, index2.labels)
    self.assertEqual(index.num_trees, index2.num_trees)
def predict(text):
    model = doc2vec.Doc2Vec.load('../models/doc2vec.model')
    indexer = AnnoyIndexer()
    indexer.load('../models/dv_index')
    indexer.model = model
    # print(indexer.labels)
    new_vec = []
    for word in transform_text(text, strip=False):
        new_vec.append(model[word])
    print(new_vec)
    sv = model.infer_vector(transform_text(text, strip=False))
    print(sv)
    print(indexer.most_similar(sv, 2))
def testSaveLoad(self):
    from gensim.similarities.index import AnnoyIndexer

    fname = get_tmpfile('gensim_similarities.tst.pkl')
    self.index.save(fname)
    self.index2 = AnnoyIndexer()
    self.index2.load(fname)
    self.index2.model = self.model
    self.assertEqual(self.index.index.f, self.index2.index.f)
    self.assertEqual(self.index.labels, self.index2.labels)
    self.assertEqual(self.index.num_trees, self.index2.num_trees)
def load_w2v():
    print("Loading gensim pre-trained model")
    # model = KeyedVectors.load_word2vec_format("SO_vectors_200.bin", binary=True)
    # The line above is intolerably slow and large; the vectors were normed and re-saved
    # following https://stackoverflow.com/a/56963501
    model = KeyedVectors.load("SO_vectors_normed", mmap='r')
    # Use this to load the provided AnnoyIndex
    annoy_index = AnnoyIndexer()
    annoy_index.load('SO_vectors_normed_annoy_index')
    # Use this instead to build a new AnnoyIndexer in RAM; the number is the tree count
    # (2 seems to work best here)
    # annoy_index = AnnoyIndexer(model, 3)
    # return both the vectors and the Annoy index so callers can query with most_similar()
    return model, annoy_index
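# Usage sketch (an assumption, not part of the original snippet): load_w2v above returns
# the normed KeyedVectors together with the pre-built AnnoyIndexer, so a caller can run
# approximate nearest-neighbour queries directly; the query word is a placeholder.
model, annoy_index = load_w2v()
print(model.most_similar("java", topn=5, indexer=annoy_index))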
def create_sim_dict(word_map, model_path, similar_threshold=0.5, use_annoy=True, annoy_path=None):
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)
    sim_dict = dict()
    for word in word_map:
        try:
            closest, score = w2v.wv.most_similar(
                word, topn=2, indexer=indexer if use_annoy else None)[1]
            if score > similar_threshold and closest in word_map:
                sim_dict[word_map[word]] = word_map[closest]
        except KeyError:
            # skip words that are not in the embedding vocabulary
            continue
    return sim_dict
def _ann_indexer(self):
    """This function should be in the training process. It's here for temporary usage.

    Annoy is an open source library to search for points in space that are close to a
    given query point. It also creates large read-only file-based data structures that
    are mmapped into memory so that many processes may share the same data. For our
    purpose, it is used to find similarity between words or documents in a vector space.

    Returns:
        Annoy index object if self.indexed is True. None if we want to use gensim's
        built-in index.
    """
    logging.info('indexing the model %s', self.model_name)
    self.model.init_sims()
    annoy_index = AnnoyIndexer(self.model, 200)
    return annoy_index
def train_item2vec(df=None, sessions=None, samples=None):
    if df is None and samples is None:
        raise NotImplementedError(
            ">>> No items specified: `df` and `samples` cannot both be None"  # noqa
        )
    if samples is None:
        gen_rooms = RoomsGenerator(df, sessions)
    else:
        gen_rooms = samples

    start_ = time.time()
    model_i2v_path = os.path.join(st.BASE_MODEL, "{}.model".format(st.ITEM2VEC_KEY))
    if os.path.exists(model_i2v_path):
        logging.info("Load pre-trained model")
        model = Word2Vec.load(model_i2v_path)
        logging.info("Vocabulary before re-training: %d", len(model.wv.vocab))
        model.build_vocab(gen_rooms, update=True)
        logging.info("Vocabulary after re-training: %d", len(model.wv.vocab))
        model.train(gen_rooms, total_examples=model.corpus_count,
                    epochs=model.iter, callbacks=())
        logging.info("Re-training the pre-trained model took %.2f secs", time.time() - start_)
    else:
        model = Word2Vec(gen_rooms, sg=st.SG, size=st.I2V_DIM, window=st.WINDOWS,
                         min_count=st.MIN_COUNT, workers=st.WORKERS, iter=st.EPOCHS,
                         sample=st.SAMPLE, negative=st.NS, compute_loss=st.COMPUTE_LOSS,
                         callbacks=[Timer(start_)])
    logging.info("Saving item2vec model")
    model.save(model_i2v_path)
    logging.info("Build annoy index for item2vec model")
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(
        os.path.join(st.BASE_MODEL, "{}.model".format(st.ANNOY_INDEX_KEY)))
def build_ann_indexer(self, num_trees=100):
    """Build an Annoy index over the model's vectors.

    Annoy is an open source library to search for points in space that are close to a
    given query point. It also creates large read-only file-based data structures that
    are mmapped into memory so that many processes may share the same data. For our
    purpose, it is used to find similarity between words or documents in a vector space.

    Args:
        num_trees (int): A positive integer which affects the build time and the index
            size. A larger value gives more accurate results, but a larger index.
            (https://github.com/spotify/annoy)

    Returns:
        Annoy index object
    """
    logging.info('indexing the model %s', self.model_name)
    self.model.init_sims()
    annoy_index = AnnoyIndexer(self.model, num_trees)
    self.indexer = annoy_index
    return annoy_index
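# Usage sketch (hypothetical; `embedder` stands in for whatever object defines
# build_ann_indexer() and holds a gensim Word2Vec in self.model, and the query word is
# a placeholder). It contrasts the exact brute-force neighbours with the approximate
# ones returned through the Annoy indexer built above.
ann_index = embedder.build_ann_indexer(num_trees=100)
exact = embedder.model.wv.most_similar("python", topn=5)
approx = embedder.model.wv.most_similar("python", topn=5, indexer=ann_index)
print(exact)
print(approx)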