def load_annoy(annoypath, model):
    """Return an Annoy indexer for ``model``, building and caching it if needed.

    If ``annoypath`` does not exist, a new ``AnnoyIndexer`` is built from
    ``model`` with 200 trees and saved to ``annoypath``; progress and timings
    are printed. Otherwise the index is loaded from ``annoypath`` and
    ``model`` is attached to it, so both branches return an indexer in the
    same state.

    :param annoypath: path of the persisted Annoy index
    :type annoypath: str
    :param model: model the index is built from / attached to
    :type model: Word2Vec
    :return: the built or loaded indexer
    :rtype: AnnoyIndexer
    """
    if os.path.exists(annoypath):
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        # An indexer created with AnnoyIndexer() has no model of its own;
        # re-attach it so the loaded object matches the freshly built one.
        aindex.model = model
        return aindex

    print("开始构建annoy索引:当前时间 : " + time.asctime(time.localtime(time.time())))
    starttime12 = time.time()
    aindex = AnnoyIndexer(model, 200)
    print("构建索引完毕 %.2f secs" % (time.time() - starttime12))

    # Persist the freshly built index so later calls can load it.
    print("开始保存annoy索引")
    starttime13 = time.time()
    aindex.save(annoypath)
    print("保存索引完毕 %.2f secs" % (time.time() - starttime13))
    return aindex
def _load_classifier(self, **kwargs):
    """Create the nearest-neighbour classifier named by ``self.classifier_type``.

    ``'knn'`` returns a ``NearestNeighbors`` built with ``indexed=False``.
    ``'ann'`` first downloads every file listed under ``self.s3_path`` that
    is missing from ``self.temporary_directory``, then loads
    ``<classifier_id>.index`` into an ``AnnoyIndexer`` and passes it to
    ``NearestNeighbors``. Any other type prints a notice and returns ``None``.

    :param kwargs: forwarded unchanged to ``NearestNeighbors``
    :return: a ``NearestNeighbors`` instance, or ``None`` for unknown types
    """
    kind = self.classifier_type

    if kind == 'knn':
        return NearestNeighbors(s3_conn=self.s3_conn, indexed=False, **kwargs)

    if kind != 'ann':
        print('Not implemented yet!')
        return None

    # Fetch only the files that are not already present locally.
    for name in list_files(self.s3_conn, self.s3_path):
        local_path = os.path.join(self.temporary_directory, name)
        if os.path.exists(local_path):
            continue
        logging.warning('calling download from %s to %s',
                        self.s3_path + name, local_path)
        download(self.s3_conn, local_path, os.path.join(self.s3_path, name))

    index_path = os.path.join(self.temporary_directory,
                              self.classifier_id + '.index')
    ann_index = AnnoyIndexer()
    ann_index.load(index_path)
    return NearestNeighbors(s3_conn=self.s3_conn, indexer=ann_index, **kwargs)
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Tests for an ``AnnoyIndexer`` built over a small Doc2Vec model."""

    def setUp(self):
        """Train and index a Doc2Vec model; skip the test if annoy is missing."""
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0."""
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]
        self.assertEqual(doc, 0)
        # The similarity is computed in floating point from an approximate
        # index, so compare with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact searches return the same top-5 documents."""
        approx_neighbors = self.model.docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both ``fname`` and ``fname + '.d'``."""
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved and reloaded index keeps its size, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Tests for an ``AnnoyIndexer`` built over a small Doc2Vec model."""

    def setUp(self):
        """Train and index a Doc2Vec model; skip the test if annoy is missing."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0."""
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]
        self.assertEqual(doc, 0)
        # The similarity is computed in floating point from an approximate
        # index, so compare with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact searches return the same top-5 documents."""
        approx_neighbors = self.model.docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both ``fname`` and ``fname + '.d'``."""
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved and reloaded index keeps its size, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Tests for an ``AnnoyIndexer`` built over a small Word2Vec model."""

    def setUp(self):
        """Train and index a Word2Vec model; skip the test if annoy is missing."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.wv.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        # The similarity is computed in floating point from an approximate
        # index, so compare with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact searches return the same top-5 words."""
        approx_neighbors = self.model.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both ``index`` and ``index.d``."""
        self.index.save('index')
        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved and reloaded index keeps its size, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Tests for an ``AnnoyIndexer`` built over a small Word2Vec model."""

    def setUp(self):
        """Train and index a Word2Vec model; skip the test if annoy is missing."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        # The similarity is computed in floating point from an approximate
        # index, so compare with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact searches return the same top-5 words."""
        approx_neighbors = self.model.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both ``index`` and ``index.d``."""
        self.index.save('index')
        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved and reloaded index keeps its size, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
def get_indexer(fpath, model, room_id):
    """Load the Annoy index stored at ``fpath`` and attach ``model`` to it.

    :param fpath: path of the saved index
    :param model: model assigned to the loaded indexer's ``model`` attribute
    :param room_id: identifier used only in the log message
    :return: the loaded ``AnnoyIndexer``, or ``None`` when ``fpath`` does not
        exist
    """
    if not os.path.exists(fpath):
        # No saved index: the caller receives None as its indexer.
        return None

    logging.info("Use annoy_index :: room_id:%s", room_id)
    indexer = AnnoyIndexer()
    indexer.load(fpath)
    indexer.model = model
    return indexer
def assertLoadedIndexEqual(self, index, model):
    """Assert that ``index`` survives a save/load round trip.

    ``index`` is saved to the file ``'index'``, loaded into a fresh
    ``AnnoyIndexer`` to which ``model`` is attached, and the two indexers are
    compared on ``index.f``, ``labels`` and ``num_trees``.
    """
    from gensim.similarities.index import AnnoyIndexer

    index.save('index')

    reloaded = AnnoyIndexer()
    reloaded.load('index')
    reloaded.model = model

    self.assertEqual(index.index.f, reloaded.index.f)
    for attribute in ('labels', 'num_trees'):
        self.assertEqual(getattr(index, attribute),
                         getattr(reloaded, attribute))
def load_index_investment(self, path):
    """Walk ``path`` and load Annoy index files found beneath it.

    Only file names made of exactly two dot-separated parts are loaded (for
    example ``B.ind``; the companion ``B.ind.d`` is skipped). Each matching
    file replaces the previously loaded indexer, so the last match found by
    ``os.walk`` is returned. With no match, an empty ``AnnoyIndexer`` is
    returned.
    """
    index = AnnoyIndexer()
    for parent, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            parts = filename.split('.')
            # The generated B.ind.d cannot be loaded; only B.ind can.
            if len(parts) != 2:
                continue
            full_path = os.path.join(parent, filename)
            logger.info(u'文件名为%s ,路径为:%s' % (str(parts[0]), full_path))
            index = AnnoyIndexer()
            index.load(full_path)
    return index
def predict(text):
    """Print the two indexed documents most similar to ``text``.

    Loads the Doc2Vec model and its Annoy index from ``../models``, prints the
    per-token vectors, the inferred document vector, and finally the two
    nearest neighbours reported by the indexer. Nothing is returned.
    """
    model = doc2vec.Doc2Vec.load('../models/doc2vec.model')

    indexer = AnnoyIndexer()
    indexer.load('../models/dv_index')
    indexer.model = model

    # Per-token vectors, printed for inspection only.
    token_vectors = [model[token] for token in transform_text(text, strip=False)]
    print(token_vectors)

    inferred = model.infer_vector(transform_text(text, strip=False))
    print(inferred)
    print(indexer.most_similar(inferred, 2))
def f(process_id):
    """Load the model and index in this process, query once, report memory.

    ``process_id`` is not used in the body; the function prints the OS
    process id and, after one Annoy-backed ``most_similar`` query, the
    ``memory_info()`` of the current process.
    """
    print('Process Id: {}'.format(os.getpid()))
    process = psutil.Process(os.getpid())

    new_model = Word2Vec.load('/tmp/mymodel.pkl')
    vector = new_model.wv["science"]

    annoy_index = AnnoyIndexer()
    annoy_index.load('/tmp/mymodel.index')
    annoy_index.model = new_model

    # The result is discarded: the query is issued only so that the index is
    # actually exercised before memory is measured.
    new_model.wv.most_similar([vector], topn=5, indexer=annoy_index)

    report = '\nMemory used by process {}: {}\n---'
    print(report.format(os.getpid(), process.memory_info()))
def assertLoadedIndexEqual(self, index, model):
    """Assert that ``index`` survives a save/load round trip.

    ``index`` is saved to a temporary file, loaded into a fresh
    ``AnnoyIndexer`` to which ``model`` is attached, and the two indexers are
    compared on ``index.f``, ``labels`` and ``num_trees``.
    """
    from gensim.similarities.index import AnnoyIndexer

    fname = get_tmpfile('gensim_similarities.tst.pkl')
    index.save(fname)

    reloaded = AnnoyIndexer()
    reloaded.load(fname)
    reloaded.model = model

    self.assertEqual(index.index.f, reloaded.index.f)
    for attribute in ('labels', 'num_trees'):
        self.assertEqual(getattr(index, attribute),
                         getattr(reloaded, attribute))
def get_annoy(w2v, embedding_type='w2v'):
    """Load the cached Annoy index for ``w2v`` or build and cache a new one.

    The cache file lives under ``data_dir`` and its name encodes ``dims``,
    ``embedding_type`` and the vocabulary size of ``w2v``.

    :param w2v: embedding the index is built from and attached to; must
        expose ``vocab``
    :param embedding_type: label used only in the cache file name
    :return: an ``AnnoyIndexer`` bound to ``w2v``
    """
    # Passed as AnnoyIndexer's second argument.
    # NOTE(review): gensim documents that argument as the number of trees,
    # not the vector dimensionality -- confirm the intended value.
    dims = 100
    annoy_file_name = (data_dir + '/annoy_index_' + '_' + str(dims) + '_'
                       + embedding_type + '_' + str(len(w2v.vocab)))

    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        # Bind the index to the embedding that was passed in; the cache file
        # name is derived from this same object.
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
def load_w2v():
    """Load the pre-normed vectors and their saved Annoy index.

    :return: ``Word2Vec(model, index=annoy_index)``.
        NOTE(review): ``Word2Vec`` is called here with a ``KeyedVectors``
        object and an ``index`` keyword, so it is presumably a project
        wrapper rather than gensim's class -- confirm.
    """
    print("Loading gensim pre-trained model")

    # The vectors are stored pre-normed and opened memory-mapped. Loading
    # "SO_vectors_200.bin" with KeyedVectors.load_word2vec_format(...,
    # binary=True) was found intolerably slow and large; the norming approach
    # comes from https://stackoverflow.com/a/56963501
    vectors = KeyedVectors.load("SO_vectors_normed", mmap='r')

    # Use the Annoy index shipped alongside the vectors. To build a new one
    # in RAM instead, construct AnnoyIndexer(vectors, <n>).
    indexer = AnnoyIndexer()
    indexer.load('SO_vectors_normed_annoy_index')

    return Word2Vec(vectors, index=indexer)
def index_vector(self, dimensions=300, save=False):
    """Return the Annoy index used by ``is_word_pairs_similar``.

    The index is loaded from ``../preprocessed/annoy.index`` (relative to the
    current working directory) when that file exists; otherwise it is built
    from ``self.embedding``. Queries through the Annoy index may be slower
    than with the normal index.

    :param dimensions: second argument given to ``AnnoyIndexer`` when building
    :param save: when true, a newly built index is written to the path above
    :return: the loaded or newly built ``AnnoyIndexer``
    """
    index_path = Path.cwd().parent.joinpath('preprocessed/annoy.index')

    if not index_path.exists():
        built = AnnoyIndexer(self.embedding, dimensions)
        if save:
            built.save(str(index_path))
        return built

    loaded = AnnoyIndexer()
    loaded.load(str(index_path))
    loaded.model = self.embedding
    return loaded
def similar_augment(texts, labels, n_increase, n_word_replace, model_path,
                    similar_threshold=0.5, use_annoy=True, annoy_path=None):
    """Augment ``texts`` with copies in which some words are replaced.

    ``n_increase`` texts are sampled (with replacement) from those having at
    least ``n_word_replace`` entries. In each copy, ``n_word_replace``
    distinct positions are chosen and every chosen word is replaced by its
    second ``most_similar`` hit when that hit's score exceeds
    ``similar_threshold``. Words for which the lookup fails are left as is.

    :param texts: list of texts; new copies are appended to this same list.
        Each text must support ``.shape[-1]`` and item assignment.
    :param labels: labels parallel to ``texts``
    :param n_increase: number of augmented copies to add
    :param n_word_replace: number of positions to try replacing per copy
    :param model_path: binary word2vec file loaded with ``KeyedVectors``
    :param similar_threshold: minimum score for a replacement
    :param use_annoy: query through an ``AnnoyIndexer`` when true
    :param annoy_path: saved index to load; a 100-tree index is built when
        ``None``
    :return: ``(texts, labels)`` where ``labels`` is the ``np.append`` result
    """
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)

    # Always bound, so the query below never depends on use_annoy to avoid
    # touching an undefined name.
    indexer = None
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)

    texts_long = []
    labels_long = []
    for text, label in zip(texts, labels):
        if len(text) >= n_word_replace:
            texts_long.append(text)
            labels_long.append(label)

    shuffle_ind = np.random.choice(len(texts_long), size=n_increase)
    for ind in shuffle_ind:
        text_copy = copy.deepcopy(texts_long[ind])
        replace_inds = np.random.choice(text_copy.shape[-1],
                                        size=n_word_replace, replace=False)
        for word_ind in replace_inds:
            word = text_copy[word_ind]
            try:
                # Hit [0] is skipped; the second hit is the candidate.
                closest, score = w2v.wv.most_similar(
                    word, topn=2, indexer=indexer)[1]
                if score > similar_threshold:
                    text_copy[word_ind] = closest
            except Exception:
                # Best effort: a failed lookup leaves the word unchanged.
                # Unlike a bare ``except``, this does not swallow
                # KeyboardInterrupt or SystemExit.
                continue
        texts.append(text_copy)
        labels = np.append(labels, [labels_long[ind]])
    return texts, labels
class WordNeighbors:
    """Nearest-word lookup backed by Word2Vec + Annoy, with FastText fallback."""

    def __init__(self, model_dir):
        """Load ``ft_model``, ``w2v_model`` and ``annoy_model`` from ``model_dir``."""
        def in_model_dir(name):
            return os.path.join(model_dir, name)

        self.ft_model = FastText.load(in_model_dir('ft_model'))
        self.w2v_model = Word2Vec.load(in_model_dir('w2v_model'))
        self.annoy_index = AnnoyIndexer()
        self.annoy_index.load(in_model_dir('annoy_model'))

    def query(self, w, topn):
        """Return up to ``topn`` neighbours of ``w``.

        Words present in the Word2Vec model are looked up through the Annoy
        index. Other words go to the FastText model; if that raises
        ``KeyError`` an empty list is returned.
        """
        if w in self.w2v_model:
            return self.w2v_model.most_similar(
                [self.w2v_model[w]], topn=topn, indexer=self.annoy_index)

        try:
            return self.ft_model.most_similar(w, topn=topn)
        except KeyError:
            return []
class Predictor:
    """Word-association helper using a Word2Vec model queried through Annoy."""

    def __init__(self):
        """Load the model and its Annoy index from the ``data`` directory."""
        self.model = Word2Vec.load('data/word2vec.model')
        self.vocab = self.model.wv.vocab

        indexer = AnnoyIndexer()
        indexer.load('data/word2vec_idx.ann')
        indexer.model = self.model
        self.annoy_index = indexer

    def explain(self, word, n_words):
        """Return ``n_words`` words similar to ``word``.

        One extra hit is requested and the first hit is dropped. The
        lemmatized form of each returned word is printed. Returns the string
        ``'Wrong word'`` when the lookup raises ``KeyError``.
        """
        try:
            hits = self.model.wv.most_similar(
                positive=[lemmatize_stemming(word)],
                topn=n_words + 1,
                indexer=self.annoy_index)
            neighbours = hits[1:]
            print([lemmatize_stemming(hit[0]) for hit in neighbours])
            return [hit[0] for hit in neighbours]
        except KeyError:
            return 'Wrong word'

    def guess(self, words, n_words):
        """Return ``n_words`` words similar to all of ``words`` together.

        Only words whose lemmatized form is in the vocabulary are used. For a
        single input word one extra hit is requested and the first hit is
        dropped. Returns the string ``'Wrong word'`` on ``ValueError``.
        """
        try:
            # 1 for a single input word (drop the first hit), otherwise 0.
            skip = 0 if len(words) != 1 else 1
            known = [
                lemmatize_stemming(candidate) for candidate in words
                if lemmatize_stemming(candidate) in self.vocab
            ]
            hits = self.model.wv.most_similar(
                positive=known,
                topn=n_words + skip,
                indexer=self.annoy_index)
            return [hit[0] for hit in hits[skip:]]
        except ValueError:
            return 'Wrong word'
def create_sim_dict(word_map, model_path, similar_threshold=0.5,
                    use_annoy=True, annoy_path=None):
    """Map each word's id to the id of its closest in-vocabulary neighbour.

    For every word in ``word_map`` the second ``most_similar`` hit is taken;
    when its score exceeds ``similar_threshold`` and the hit is itself a key
    of ``word_map``, ``word_map[word] -> word_map[hit]`` is recorded. Words
    for which the lookup fails are skipped.

    :param word_map: mapping from word to id
    :param model_path: binary word2vec file loaded with ``KeyedVectors``
    :param similar_threshold: minimum score for a pair to be recorded
    :param use_annoy: query through an ``AnnoyIndexer`` when true
    :param annoy_path: saved index to load; a 100-tree index is built when
        ``None``
    :return: dict from id to the id of the similar word
    """
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)

    # Always bound, so the query below never depends on use_annoy to avoid
    # touching an undefined name.
    indexer = None
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)

    sim_dict = {}
    for word in word_map:
        try:
            # Hit [0] is skipped; the second hit is the candidate.
            closest, score = w2v.wv.most_similar(
                word, topn=2, indexer=indexer)[1]
            if score > similar_threshold and closest in word_map:
                sim_dict[word_map[word]] = word_map[closest]
        except Exception:
            # Best effort: skip words whose lookup fails. Unlike a bare
            # ``except``, this does not swallow KeyboardInterrupt/SystemExit.
            continue
    return sim_dict
def load_indexer(self, model_name):
    """Load the saved Annoy index that belongs to ``model_name``.

    Nothing is loaded when ``self.settings["use_annoy_indexer"]`` is falsy or
    when ``model_name`` is not one of the four known GloVe models; ``None``
    is returned in both cases. Progress is reported on ``self.full_log``
    whenever it is not ``None``.

    :param model_name: e.g. ``"glove-twitter-100"``
    :return: the loaded ``AnnoyIndexer`` or ``None``
    """
    def log(level, message):
        if self.full_log is not None:
            getattr(self.full_log, level)(message)

    models_dir = (
        r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440"
        + r"\MajorProject\models"
    )
    # model name -> (debug message, index file inside models_dir)
    known_models = {
        "glove-twitter-100": ("Loading Twitter 100",
                              "glove-twitter-100-5-trees.ann"),
        "glove-twitter-200": ("Loading Twitter 200",
                              "glove-twitter-200-5-trees.ann"),
        "glove-wiki-300": ("Loading Wiki 300", "glove-wiki-300-5-trees.ann"),
        "glove-wiki-100": ("Loading Wiki 100", "glove-wiki-100-5-trees.ann"),
    }

    log("info", "Loading word model indexer...")
    indexer = None

    if not self.settings["use_annoy_indexer"]:
        log("warning", "No indexer selected, using default")
        return indexer

    if model_name in known_models:
        debug_message, file_name = known_models[model_name]
        log("debug", debug_message)
        indexer = AnnoyIndexer()
        indexer.load(models_dir + "\\" + file_name)

    log("info", "Done loading model indexer")
    return indexer
# -*- coding: utf-8 -*- from gensim.models import Word2Vec from gensim.similarities.index import AnnoyIndexer import os from app.common.config import model_path from app.util.pre_model import W2V_VOCABULARY_SET, VOCABULARY_SET from app.util.remove_utils import remove_not_sports # 加载model目录下指定模型 path_to_model = os.path.join(model_path, 'word2vec') model = Word2Vec.load(path_to_model) # 从disk加载annoy indexer path_to_indexer = os.path.join(model_path, 'annoy_indexer_100') annoy_indexer_100 = AnnoyIndexer() annoy_indexer_100.load(path_to_indexer) annoy_indexer_100.model = model # 使用模型计算输入词组 # 2018-03-08 使用indexer解决cpu占用问题 def associate_words(words, cont_type, with_model=model, top_n=10): words = [w.strip() for w in words] words = list(filter(lambda x: x in VOCABULARY_SET, words)) res = {} tops = [] if words is None or len(words) == 0: return res for i in range(len(words)): try: tops = (with_model.most_similar(positive=words[0:(len(words) - i)],
class Doc2VecModel:
    """Doc2Vec model over Wikipedia pages with three neighbour-search back-ends.

    Document tags are used as Wikipedia page ids: the ids returned by a
    neighbour search are resolved to page titles through the MediaWiki API.
    """

    BASE_WIKI_QUERY = "https://en.wikipedia.org/w/api.php?action=query&format=json&pageids="
    stopword_list = stopwords.words('english')

    def __init__(self, modelname):
        """Load the model stored at ``modelname``, or prepare a new one.

        :param modelname: path of a saved Doc2Vec model; when ``None`` a new
            model is created and its vocabulary built from the training
            corpus (it is not trained here).
        """
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        self.modelname = modelname
        if self.modelname is None:
            train_corpus = self._get_training_iterator()
            self.model = Doc2Vec(vector_size=100, min_count=5, workers=7)
            self.model.build_vocab(train_corpus, progress_per=10000)
        else:
            self.model = Doc2Vec.load(self.modelname)

        # Which back-ends are ready to use. Brute force needs no set-up.
        self.nns_method_init_dict = {
            NNSMethod.BRUTE: True,
            NNSMethod.KD_TREE: False,
            NNSMethod.ANNOY: False
        }

    def infer_file(self, filename, n=10):
        """Run :meth:`infer` on the contents of ``filename``."""
        with open(filename, 'r') as f:
            text = ' '.join(f.readlines())
        return self.infer(text, n)

    def infer(self, string, n=10, nnsmethod=NNSMethod.ANNOY,
              annoymodelpath="gensim_annoy"):
        """Return the titles and scores of the ``n`` documents nearest to ``string``.

        :param string: raw text to compare against the corpus
        :param n: number of neighbours
        :param nnsmethod: ``NNSMethod`` member selecting the search back-end
        :param annoymodelpath: file used to save/load the Annoy index; only
            read the first time the Annoy back-end is initialised
        :return: ``(titles, dists)``
        """
        self._initialize_nns_method(nnsmethod, annoymodelpath)
        words = self._preprocess(string)
        # Fix the random seed so the inferred vector is deterministic.
        self.model.random = np.random.mtrand.RandomState(1337)
        inferred_vector = self.model.infer_vector(words)
        ids, dists = self._calculate_most_similar(inferred_vector, n, nnsmethod)
        titles = self._get_title_from_pageids(ids)
        return titles, dists

    def train(self, epochs):
        """Train for ``epochs`` epochs and save the model to ``self.modelname``."""
        train_corpus = self._get_training_iterator()
        self.model.train(train_corpus,
                         total_examples=self.model.corpus_count,
                         epochs=epochs,
                         report_delay=10)
        self.model.save(self.modelname)

    def _initialize_nns_method(self, nnsmethod, annoymodelpath):
        """Build or load the structure ``nnsmethod`` needs, once per instance."""
        if self.nns_method_init_dict[nnsmethod]:
            return

        if nnsmethod == NNSMethod.KD_TREE:
            print("Building KD tree..")
            self.tree = cKDTree(self.model.docvecs.vectors_docs)
            print("Finished building KD tree.")
            self.keys = list(self.model.docvecs.doctags.keys())
        elif nnsmethod == NNSMethod.ANNOY:
            if not os.path.isfile(annoymodelpath):
                print("Generating annoy index...")
                self.annoy_indexer = AnnoyIndexer(self.model, 50)
                print("Finished generating annoy index.")
                self.annoy_indexer.save(annoymodelpath)
            else:
                self.annoy_indexer = AnnoyIndexer()
                self.annoy_indexer.load(annoymodelpath)
                self.annoy_indexer.model = self.model

        # Record completion so the early return above actually takes effect;
        # without it every query rebuilt the tree / reloaded the index.
        self.nns_method_init_dict[nnsmethod] = True

    def _calculate_most_similar(self, vector, n, nnsmethod):
        """Return ``(ids, dists)`` of the ``n`` nearest documents.

        For BRUTE and ANNOY, ``ids`` and ``dists`` are the first and second
        elements of the tuples returned by ``most_similar``. For KD_TREE,
        ``dists`` are the distances reported by ``cKDTree.query``.
        """
        # time.clock() no longer exists on Python 3.8+.
        start_time = time.perf_counter()
        if nnsmethod == NNSMethod.BRUTE:
            tops = self.model.docvecs.most_similar([vector], topn=n)
            ids, dists = [t[0] for t in tops], [t[1] for t in tops]
        if nnsmethod == NNSMethod.KD_TREE:
            dists, positions = self.tree.query(vector, k=n)
            ids = [self.keys[i] for i in positions]
        if nnsmethod == NNSMethod.ANNOY:
            tops = self.model.docvecs.most_similar([vector], topn=n,
                                                   indexer=self.annoy_indexer)
            ids, dists = [t[0] for t in tops], [t[1] for t in tops]
        print(f"Time using {nnsmethod} - {time.perf_counter() - start_time}")
        # Ids first, matching how infer() unpacks the result for every
        # back-end (the KD-tree branch used to come back swapped).
        return ids, dists

    def _preprocess(self, string):
        """Lower-case, keep only ``a-z`` and whitespace, tokenize, drop stopwords."""
        string = string.lower()
        string = re.sub(r'[^a-z\s]+', '', string)
        words = nltk.word_tokenize(string)
        return [word for word in words if word not in self.stopword_list]

    def _get_training_iterator(self):
        """Return a ``TaggedDocumentGenerator`` over ``~/Documents/text/**/wiki_*``."""
        home = os.path.expanduser("~")
        # Data is assumed to be in ~/Documents/text
        path = os.path.join(home, "Documents", "text")
        files = glob.glob(os.path.join(path, "**/wiki_*"), recursive=True)
        return TaggedDocumentGenerator(files)

    def _get_title_from_pageids(self, ids):
        """Resolve page ids to titles with one MediaWiki API request.

        Pages without a ``title`` are reported as ``"PageId: <id>"``.
        NOTE(review): titles follow the order of the API response, which is
        not shown to match the order of ``ids`` -- confirm before pairing
        them positionally with distances.
        """
        query = self.BASE_WIKI_QUERY + '|'.join(str(page_id) for page_id in ids)
        with urlopen(query) as response:
            dic = json.loads(response.read())
        return [
            v['title'] if 'title' in v else "PageId: " + str(v['pageid'])
            for v in dic['query']['pages'].values()
        ]
import codecs, json from collections import defaultdict import numpy as np from matplotlib import pyplot as plt from sklearn.manifold import TSNE from sklearn.cluster import KMeans from gensim.models import KeyedVectors from gensim.similarities.index import AnnoyIndexer wv_ent = KeyedVectors.load_word2vec_format('entity2vec.bin', binary=True) annoy_index_ent = AnnoyIndexer() annoy_index_ent.load('entity2vec.index') annoy_index_ent.model = wv_ent wv_rel = KeyedVectors.load_word2vec_format('relation2vec.bin', binary=True) annoy_index_rel = AnnoyIndexer() annoy_index_rel.load('relation2vec.index') annoy_index_rel.model = wv_rel def tsne_vis(X, labels, name): tsne = TSNE(n_components=2).fit_transform(X) plt.figure(figsize=(50, 50)) for i, label in enumerate(labels): x, y = tsne[i, :] plt.scatter(x, y) plt.annotate(label, xy=(x, y),
annoy_index.save(annoy_file) # extend_glove() # build_word2vec() info('loading model') model = KeyedVectors.load(w2v_model) info(model) info('init sims') model.init_sims() # build_annoy(model) info('loading annoy indexer') annoy_index = AnnoyIndexer() annoy_index.load(annoy_file) annoy_index.model = model noise = np.random.random([DIM]) noise = np.zeros(DIM) info('querying with Annoy') with DisableLogger(): val = model.most_similar([noise, noise], topn=3, indexer=annoy_index) info(val) info('querying with gensim') with DisableLogger(): val = model.most_similar([noise, noise], topn=1) info(val)
class Recommander(object):
    """Recommend papers, patents, projects and experts for a piece of text.

    Three Annoy indexes (paper / patent / project) provide the nearest
    documents; a feature index loaded from tab-separated files allows the
    results to be filtered by field, type, province and unit.
    """

    def __init__(self, vec_file, pap, pat, pro):
        """Load the word vectors, the three Annoy indexes and the feature index.

        :param vec_file: binary word2vec file
        :param pap: path of the saved paper index
        :param pat: path of the saved patent index
        :param pro: path of the saved project index
        """
        self.wm = gensim.models.word2vec.Word2Vec.load_word2vec_format(
            vec_file, binary=True)
        self.paper_index = AnnoyIndexer()
        self.paper_index.load(pap)
        self.patent_index = AnnoyIndexer()
        self.patent_index.load(pat)
        self.project_index = AnnoyIndexer()
        self.project_index.load(pro)
        self.t2v = Convert2Vec(self.wm)
        self.cuttor = FilterCut()
        self.db = DB()
        self.featureIndex = self.buildFeatureIndex()

    def buildFeatureIndex(self):
        """Load the paper, patent and project feature files.

        :return: dict mapping ``'paper'``/``'patent'``/``'project'`` to the
            index built by :meth:`loadFeature`
        """
        sources = (
            ('paper',
             "/testdata400/data/recommender/data0828/feature/paper_feature.txt"),
            ('patent',
             "/testdata400/data/recommender/data0828/feature/patent_feature.txt"),
            ('project',
             "/testdata400/data/recommender/data0828/feature/project_feature.txt"),
        )
        featureIndex = {}
        for typee, path in sources:
            # ``with`` closes each file even if parsing fails.
            with open(path, 'r') as featureFile:
                featureIndex[typee] = self.loadFeature(featureFile)
        return featureIndex

    def loadFeature(self, file):
        """Build the inverted feature index from an open feature file.

        Every line is split on tabs; column 0 is the document id and columns
        1-4 are its field, type, province and unit.

        :param file: object providing ``readlines()``
        :return: ``{'field': {value: [doc ids]}, 'type': ..., 'province': ...,
            'unit': ...}``
        """
        columns = (('field', 1), ('type', 2), ('province', 3), ('unit', 4))
        index = dict((name, {}) for name, _ in columns)
        for line in file.readlines():
            feature = line.split('\t')
            for name, position in columns:
                index[name].setdefault(feature[position], []).append(feature[0])
        return index

    def filter(self, typee, topDocs, filterParams, topN):
        """Filter papers, projects or patents by the requested feature values.

        :param typee: ``'paper'``, ``'patent'`` or ``'project'``
        :param topDocs: sequence of ``(doc id, similarity)`` pairs
        :param filterParams: ``[field, type, province, unit]``; an empty
            string or ``'-1'`` disables that filter. The field filter is
            never applied to projects.
        :param topN: stop after this many matches (assumed to be >= 1)
        :return: matching entries of ``topDocs`` in their original order; a
            filter value that is absent from the feature index matches no
            document
        """
        allowed = set(docId for docId, _ in topDocs)
        for position, name in enumerate(('field', 'type', 'province', 'unit')):
            value = filterParams[position]
            if value == '' or value == '-1':
                continue
            if name == 'field' and typee == 'project':
                continue
            # .get(): an unknown value yields an empty result instead of the
            # KeyError the direct lookup used to raise.
            allowed &= set(self.featureIndex[typee][name].get(value, ()))

        result = []
        for doc in topDocs:
            if doc[0] in allowed:
                result.append(doc)
                if len(result) == topN:
                    break
        return result

    def most_similar_paper(self, text, topn=10):
        """Return the ``topn`` papers nearest to ``text``."""
        vec = self.t2v.text2v(text, self.cuttor)
        return self.paper_index.most_similar(vec, topn)

    def most_similar_patent(self, text, topn=10):
        """Return the ``topn`` patents nearest to ``text``."""
        vec = self.t2v.text2v(text, self.cuttor)
        return self.patent_index.most_similar(vec, topn)

    def most_similar_project(self, text, topn=10):
        """Return the ``topn`` projects nearest to ``text``."""
        vec = self.t2v.text2v(text, self.cuttor)
        return self.project_index.most_similar(vec, topn)

    def getSimExpertsIds(self, topDocs):
        """Collect per-expert scores from the authors of the top documents.

        Only the first four authors of a document are considered; their
        share of the document's similarity is 1.0, 0.85, 0.7 and 0.5.

        :param topDocs: ``{type: [(doc id, similarity), ...]}``
        :return: ``(expertMap, expertInfoOut)`` where ``expertMap`` maps an
            expert id to a list of weighted similarities and
            ``expertInfoOut`` maps it to
            ``[type + rank of the doc, weighted similarity, author position]``
            entries
        """
        expertInfoOut = {}
        expertMap = {}
        authorSeqWeiht = [1.0, 0.85, 0.7, 0.5]
        for typee in topDocs:
            docs = topDocs[typee]
            rank = {}
            similarity = {}
            for position, (docId, docSim) in enumerate(docs):
                rank[docId] = position
                # Keep the similarity of the first occurrence of an id.
                similarity.setdefault(docId, docSim)

            docExpertIds = self.db.getAuthors(typee, [docId for docId, _ in docs])
            for docId in docExpertIds:
                if not self.db.idInDB(typee, docId):
                    print("docId:" + docId + "is not in db")
                    continue
                sim = similarity[docId] * 1.0 if docId in similarity else 1.0
                # Consider at most four authors per document.
                for position, expertId in enumerate(docExpertIds[docId][:4]):
                    weighted = sim * authorSeqWeiht[position]
                    expertInfoOut.setdefault(expertId, []).append(
                        [typee + str(rank[docId]), weighted, position])
                    expertMap.setdefault(expertId, []).append(weighted)
        return expertMap, expertInfoOut

    # NOTE(review): the original design note for the method below described a
    # two-stage scheme (extract experts from documents that are NOT filtered
    # by province, filter the experts by province afterwards, and fall back
    # to province-filtered documents when fewer than topN remain). The code
    # here applies all filters to the documents directly. The author also
    # noted that the method needs refactoring but that requirements were
    # likely to change.
    def most_similar_expert(self, topPapers, topPatents, topProjects,
                            filterParams, expertTopN):
        """Rank experts from the filtered top papers, patents and projects.

        Reads ``len`` and ``coe`` from the ``[global]`` section of
        ``config.ini``. An expert's score is the best weighted similarity
        plus ``coe`` times each of the next best ones, using at most ``len``
        similarities in total. A per-expert breakdown is written to a log
        file by :meth:`printOut`.

        :return: list of ``(expert id, score)``, best first, at most
            ``expertTopN`` long
        """
        config = ConfigParser.ConfigParser()
        with open("config.ini", 'r') as configFile:
            config.readfp(configFile)
        LEN = int(config.get('global', 'len'))    # results counted per expert
        COE = float(config.get('global', 'coe'))  # weight of 2nd and later results

        topDocs = {
            'paper': self.filter('paper', topPapers, filterParams, 50),
            'patent': self.filter('patent', topPatents, filterParams, 50),
            'project': self.filter('project', topProjects, filterParams, 15),
        }
        # expertMap: expert id -> list of similarities of their results
        expertMap, expertInfoOut = self.getSimExpertsIds(topDocs)

        expertScoreMap = {}  # expert id -> score
        for expert, scores in expertMap.items():
            scores.sort(reverse=True)
            sim = scores[0]
            for extra in scores[1:max(LEN, 1)]:
                sim = sim + COE * extra
            expertScoreMap[expert] = sim

        result = sorted(expertScoreMap.items(),
                        key=lambda item: item[1],
                        reverse=True)[0:expertTopN]
        out = [{expertId: expertInfoOut[expertId]}
               for expertId, _ in result if expertId in expertInfoOut]
        self.printOut(out, LEN)
        return result

    def printOut(self, out, l):
        """Write the per-expert breakdown to ``log/<timestamp>.txt``.

        For each expert four lines are written: the expert id, the document
        tags, their similarities and the author positions, followed by a
        blank line. Each expert's entries are first sorted by similarity
        (descending) and cut to ``l`` items, in place in ``out``.
        """
        name = str('log/' + time.strftime("%Y-%m-%d %H-%M-%S" + ".txt",
                                          time.localtime()))
        print(name)

        for expert in out:
            for expertId in expert:
                expert[expertId] = sorted(expert[expertId],
                                          key=lambda doc: doc[1],
                                          reverse=True)[0:l]

        with open(name, 'w') as output:
            for expert in out:
                for expertId in expert:
                    docs = expert[expertId]
                    output.write(expertId + '\n')
                    output.write(''.join(doc[0] + ' ' for doc in docs) + '\n')
                    output.write(''.join(str(doc[1]) + ' ' for doc in docs) + '\n')
                    output.write(''.join(str(doc[2]) + ' ' for doc in docs) + '\n')
                    output.write("\n")

    def get_model(self):
        """Return the loaded word-vector model."""
        return self.wm

    def get_cuttor(self):
        """Return the ``FilterCut`` instance created by the constructor."""
        return self.cuttor
# # You can save and load your indexes from/to disk to prevent having to # construct them each time. This will create two files on disk, *fname* and # *fname.d*. Both files are needed to correctly restore all attributes. Before # loading an index, you will have to create an empty AnnoyIndexer object. # fname = '/tmp/mymodel.index' # Persist index to disk annoy_index.save(fname) # Load index back import os.path if os.path.exists(fname): annoy_index2 = AnnoyIndexer() annoy_index2.load(fname) annoy_index2.model = model # Results should be identical to above vector = model.wv["science"] approximate_neighbors2 = model.wv.most_similar([vector], topn=11, indexer=annoy_index2) for neighbor in approximate_neighbors2: print(neighbor) assert approximate_neighbors == approximate_neighbors2 ############################################################################### # Be sure to use the same model at load that was used originally, otherwise you # will get unexpected behaviors.
from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer

# The disabled snippets below are kept for reference.

# Convert the text-format numberbatch vectors to a word2vec binary file:
#wv = KeyedVectors.load_word2vec_format('numberbatch-en.txt', binary=False)
#wv.save_word2vec_format('numberbatch-en.bin',binary=True)

# Build an Annoy index over the binary vectors and save it:
#wv = KeyedVectors.load_word2vec_format('numberbatch-en.bin',binary=True)
#annoy_index = AnnoyIndexer(wv,200)
#annoy_index.save('numberbatch-en.index')

# Same loading steps as the active code below, for the numberbatch vectors:
# wv = KeyedVectors.load_word2vec_format('numberbatch-en.bin', binary=True)
# annoy_index = AnnoyIndexer()
# annoy_index.load('numberbatch-en.index')
# annoy_index.model = wv

# Active configuration: GloVe vectors (glove.6B.300d) and their saved index.
wv = KeyedVectors.load_word2vec_format('glove.6B.300d.bin', binary=True)
annoy_index = AnnoyIndexer()
annoy_index.load('glove.6B.300d.index')
annoy_index.model = wv

# Two example queries through the Annoy index. The results are not assigned
# or printed here.
wv.most_similar(positive=['football','win','organization'], topn=10, indexer=annoy_index)
wv.most_similar(positive=['football','win','nationality'], topn=10, indexer=annoy_index)
# # 创建annoy indexer并关联word2vec模型 # # t0 = time.time() # # annoy_indexer_100 = AnnoyIndexer(model, 300) # # t1 = time.time() # # print("Create AnnoyIndexer: {}ms".format(1000*(t1-t0))) # # # # # 保存annoy indexer到disk # # annoy_indexer_100.save('annoy_indexer_100') # # t2 = time.time() # # print("Save AnnoyIndexer to File : {}ms".format(1000*(t2-t1))) # 从disk加载annoy indexer t1 = time.time() annoy_indexer_100 = AnnoyIndexer() annoy_indexer_100.load('annoy_indexer_100') annoy_indexer_100.model = model t2 = time.time() print("从disk加载annoy indexer : {}ms".format(1000 * (t2 - t1))) # 计算相似度 pos1 = ['上海队', '梅开二度', 'vs', '发球', '开局', '得分', '接连', '直接', '北京', '上海'] pos2 = ['火箭', '英语', 'NBA', 'VS'] pos3 = ['回放', '森林狼', '18', '赛季', '135', 'NBA', '121', '奇才', '19', '常规赛'] pos4 = ['威斯布鲁克', '雷霆队', '凯尔特人队'] pos5 = ['领衔', '球员', '十佳', '英超', '半场', '进球', '体育', '资讯', '吊射'] t2 = time.time() print(model.most_similar(positive=pos1, topn=25, indexer=annoy_indexer_100)) t3 = time.time() print("First Query : {}ms".format(1000 * (t3 - t2))) print(model.most_similar(positive=pos2, topn=6, indexer=annoy_indexer_100))