Code Example #1
def load_annoy(annoypath, model):
    '''
    Build an Annoy index for the given model, or load it from disk if one
    already exists at annoypath.

    :param annoypath: path of the Annoy index file on disk
    :type annoypath: str
    :param model: trained word2vec model to index
    :type model: Word2Vec
    :return: the built or loaded indexer
    :rtype: AnnoyIndexer
    '''
    if not os.path.exists(annoypath):
        print("Start building annoy index, current time: " +
              time.asctime(time.localtime(time.time())))
        starttime12 = time.time()
        aindex = AnnoyIndexer(model, 200)
        print("Index built in %.2f secs" % (time.time() - starttime12))
        # save the annoy index to disk
        print("Start saving annoy index")
        starttime13 = time.time()
        aindex.save(annoypath)
        print("Index saved in %.2f secs" % (time.time() - starttime13))
    else:
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        # re-attach the model; it is not persisted together with the index
        aindex.model = model
    return aindex
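
A minimal usage sketch for the helper above; the model file, index path, and query word are illustrative placeholders, not taken from the original project.

# Hypothetical usage of load_annoy (paths and query word are placeholders)
from gensim.models import Word2Vec

model = Word2Vec.load("word2vec.model")        # an already trained model
indexer = load_annoy("word2vec.annoy", model)  # builds on first run, loads afterwards
print(model.wv.most_similar("science", topn=5, indexer=indexer))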
Code Example #2
    def _load_classifier(self, **kwargs):
        if self.classifier_type == 'ann':
            for f in list_files(self.s3_conn, self.s3_path):
                filepath = os.path.join(self.temporary_directory, f)
                if not os.path.exists(filepath):
                    logging.warning('calling download from %s to %s',
                                    self.s3_path + f, filepath)
                    download(self.s3_conn, filepath,
                             os.path.join(self.s3_path, f))
            ann_index = AnnoyIndexer()
            ann_index.load(
                os.path.join(self.temporary_directory,
                             self.classifier_id + '.index'))
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexer=ann_index,
                                    **kwargs)

        elif self.classifier_type == 'knn':
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexed=False,
                                    **kwargs)

        else:
            print('Not implemented yet!')
            return None
Code Example #3
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    def setUp(self):
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]

        self.assertEqual(doc, 0)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        approx_neighbors = self.model.docvecs.most_similar([self.vector],
                                                           topn=5,
                                                           indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
Code Example #4
File: test_similarities.py Project: JKamlah/gensim
class TestDoc2VecAnnoyIndexer(unittest.TestCase):

    def setUp(self):
        try:
            import annoy
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]

        self.assertEqual(doc, 0)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        approx_neighbors = self.model.docvecs.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
Code Example #5
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    def setUp(self):
        try:
            import annoy
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.wv.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        approx_neighbors = self.model.most_similar([self.vector],
                                                   topn=5,
                                                   indexer=self.index)
        exact_neighbors = self.model.most_similar(positive=[self.vector],
                                                  topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        self.index.save('index')
        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
Code Example #6
class TestWord2VecAnnoyIndexer(unittest.TestCase):

    def setUp(self):
        try:
            import annoy
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        approx_neighbors = self.model.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        self.index.save('index')
        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
Code Example #7
def get_indexer(fpath, model, room_id):
    if os.path.exists(fpath):
        logging.info("Use annoy_index :: room_id:%s", room_id)
        annoy_index = AnnoyIndexer()
        annoy_index.load(fpath)
        annoy_index.model = model

        return annoy_index
    else:
        # indexer: default is None
        return None
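
A hedged usage sketch for get_indexer; the model file, index path, and room id below are assumptions for illustration. Note that a None return is safe to pass on, since most_similar treats indexer=None as a request for the exact search.

# Hypothetical usage (model file, index path, and room_id are placeholders)
from gensim.models import Word2Vec

model = Word2Vec.load("room_42.w2v")
indexer = get_indexer("room_42.annoy", model, room_id=42)
# indexer=None simply makes most_similar fall back to exact search
print(model.wv.most_similar("hello", topn=10, indexer=indexer))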
Code Example #8
    def assertLoadedIndexEqual(self, index, model):
        from gensim.similarities.index import AnnoyIndexer

        index.save('index')

        index2 = AnnoyIndexer()
        index2.load('index')
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
Code Example #9
 def load_index_investment(self, path):
     index = AnnoyIndexer()
     for parent, dirnames, filenames in os.walk(path):
         for filename in filenames:
             # the generated B.ind.d cannot be loaded directly; only B.ind can be loaded
             if len(filename.split('.')) == 2:
                 logger.info(u'filename: %s, path: %s' %
                             (str(filename.split('.')[0]),
                              os.path.join(parent, filename)))
                 index = AnnoyIndexer()
                 index.load(os.path.join(parent, filename))
     return index
Code Example #10
File: test_similarities.py Project: leahic/gensim
    def assertLoadedIndexEqual(self, index, model):
        from gensim.similarities.index import AnnoyIndexer

        index.save('index')

        index2 = AnnoyIndexer()
        index2.load('index')
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
Code Example #11
File: similarity.py Project: zhaoqinghai/harvester
def predict(text):
    model = doc2vec.Doc2Vec.load('../models/doc2vec.model')
    indexer = AnnoyIndexer()
    indexer.load('../models/dv_index')
    indexer.model = model
    # print(indexer.labels)
    new_vec = []
    for word in transform_text(text, strip=False):
        new_vec.append(model[word])
    print(new_vec)
    sv = model.infer_vector(transform_text(text, strip=False))
    print(sv)
    print(indexer.most_similar(sv, 2))
Code Example #12
def f(process_id):
    print('Process Id: {}'.format(os.getpid()))
    process = psutil.Process(os.getpid())
    new_model = Word2Vec.load('/tmp/mymodel.pkl')
    vector = new_model.wv["science"]
    annoy_index = AnnoyIndexer()
    annoy_index.load('/tmp/mymodel.index')
    annoy_index.model = new_model
    approximate_neighbors = new_model.wv.most_similar([vector],
                                                      topn=5,
                                                      indexer=annoy_index)
    print('\nMemory used by process {}: {}\n---'.format(
        os.getpid(), process.memory_info()))
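
The function above mirrors the memory-usage demo from gensim's Annoy tutorial; a minimal driver sketch, assuming the /tmp model and index files from the snippet already exist:

# Run f in a separate process to observe its memory usage in isolation
from multiprocessing import Process

p = Process(target=f, args=('1',))
p.start()
p.join()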
Code Example #13
    def assertLoadedIndexEqual(self, index, model):
        from gensim.similarities.index import AnnoyIndexer

        fname = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(fname)

        index2 = AnnoyIndexer()
        index2.load(fname)
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
Code Example #14
    def assertLoadedIndexEqual(self, index, model):
        from gensim.similarities.index import AnnoyIndexer

        fname = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(fname)

        index2 = AnnoyIndexer()
        index2.load(fname)
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
Code Example #15
def get_annoy(w2v, embedding_type='w2v'):
    dims = 100
    annoy_file_name = data_dir + '/annoy_index_' + '_' + str(dims) + '_' + embedding_type + '_' + str(len(w2v.vocab))
    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        # note: AnnoyIndexer's second argument is the number of trees
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
Code Example #16
def load_w2v():
    print("Loading gensim pre-trained model")
    # model = KeyedVectors.load_word2vec_format("SO_vectors_200.bin", binary=True)
    # Above is intolerably slow and large, normed by code found here: https://stackoverflow.com/a/56963501
    model = KeyedVectors.load("SO_vectors_normed", mmap='r')

    # Use this to load the provided AnnoyIndex
    annoy_index = AnnoyIndexer()
    annoy_index.load('SO_vectors_normed_annoy_index')

    # Use this to generate a new AnnoyIndexer in RAM; the number is the Annoy
    # tree count, not an n-gram size (2 seemed to work best here)
    # annoy_index = AnnoyIndexer(model, 3)

    return Word2Vec(model, index=annoy_index)
Code Example #17
 def index_vector(self, dimensions=300, save=False):
     '''
     Build the annoy_index used by the function 'is_word_pairs_similar'.
     Using annoy_index, execution may be slower than with the normal index.
     '''
     path = Path.cwd().parent.joinpath('preprocessed/annoy.index')
     if path.exists():
         annoy_index = AnnoyIndexer()
         annoy_index.load(str(path))
         annoy_index.model = self.embedding
     else:
         # note: AnnoyIndexer's second argument is the number of trees, so
         # 'dimensions' here actually sets the Annoy tree count
         annoy_index = AnnoyIndexer(self.embedding, dimensions)
         if save:
             annoy_index.save(str(path))
     return annoy_index
Code Example #18
File: augment.py Project: nguyenvulebinh/vlsp-hsd
def similar_augment(texts,
                    labels,
                    n_increase,
                    n_word_replace,
                    model_path,
                    similar_threshold=0.5,
                    use_annoy=True,
                    annoy_path=None):
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
    texts_long = []
    labels_long = []
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)

    for ind in range(len(texts)):
        if len(texts[ind]) >= n_word_replace:
            texts_long.append(texts[ind])
            labels_long.append(labels[ind])

    shuffle_ind = np.random.choice(len(texts_long), size=n_increase)
    for ind in shuffle_ind:
        text_copy = copy.deepcopy(texts_long[ind])
        # if is_hier:

        replace_inds = np.random.choice(text_copy.shape[-1],
                                        size=n_word_replace,
                                        replace=False)
        for word_ind in replace_inds:
            word = text_copy[word_ind]
            try:

                closest, score = w2v.wv.most_similar(
                    word, topn=2, indexer=indexer if use_annoy else None)[1]
                if score > similar_threshold:
                    text_copy[word_ind] = closest
            except (KeyError, IndexError):  # OOV word or too few neighbors
                continue

        texts.append(text_copy)
        labels = np.append(labels, [labels_long[ind]])

    return texts, labels
Code Example #19
class WordNeighbors:
    def __init__(self, model_dir):
        self.ft_model = FastText.load(os.path.join(model_dir, 'ft_model'))
        self.w2v_model = Word2Vec.load(os.path.join(model_dir, 'w2v_model'))
        self.annoy_index = AnnoyIndexer()
        self.annoy_index.load(os.path.join(model_dir, 'annoy_model'))

    def query(self, w, topn):
        if w in self.w2v_model:
            vector = self.w2v_model[w]
            neighbors = self.w2v_model.most_similar([vector],
                                                    topn=topn,
                                                    indexer=self.annoy_index)
        else:
            try:
                neighbors = self.ft_model.most_similar(w, topn=topn)
            except KeyError:
                neighbors = []
        return neighbors
Code Example #20
File: ml.py Project: kepolol/Alias_game_bot
class Predictor:
    def __init__(self):
        self.model = Word2Vec.load('data/word2vec.model')
        self.vocab = self.model.wv.vocab
        self.annoy_index = AnnoyIndexer()
        self.annoy_index.load('data/word2vec_idx.ann')
        self.annoy_index.model = self.model

    def explain(self, word, n_words):
        try:
            ans_words = self.model.wv.most_similar(
                positive=[lemmatize_stemming(word)],
                topn=n_words + 1,
                indexer=self.annoy_index)
            print([lemmatize_stemming(word[0]) for word in ans_words[1:]])
            return [word[0] for word in ans_words[1:]]
        except KeyError:
            return 'Wrong word'

    def guess(self, words, n_words):
        try:
            if len(words) != 1:
                ans_words = self.model.wv.most_similar(
                    positive=[
                        lemmatize_stemming(word) for word in words
                        if lemmatize_stemming(word) in self.vocab
                    ],
                    topn=n_words,
                    indexer=self.annoy_index)
                return [word[0] for word in ans_words]
            else:
                ans_words = self.model.wv.most_similar(
                    positive=[
                        lemmatize_stemming(word) for word in words
                        if lemmatize_stemming(word) in self.vocab
                    ],
                    topn=n_words + 1,
                    indexer=self.annoy_index)
                return [word[0] for word in ans_words[1:]]
        except ValueError:
            return 'Wrong word'
Code Example #21
File: augment.py Project: nguyenvulebinh/vlsp-hsd
def create_sim_dict(word_map,
                    model_path,
                    similar_threshold=0.5,
                    use_annoy=True,
                    annoy_path=None):
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)

    sim_dict = dict()
    for word in word_map:
        try:
            closest, score = w2v.wv.most_similar(
                word, topn=2, indexer=indexer if use_annoy else None)[1]
            if score > similar_threshold and closest in word_map:
                sim_dict[word_map[word]] = word_map[closest]
        except (KeyError, IndexError):  # OOV word or too few neighbors
            continue

    return sim_dict
Code Example #22
    def load_indexer(self, model_name):
        if self.full_log is not None:
            self.full_log.info("Loading word model indexer...")

        indexer = None

        if self.settings["use_annoy_indexer"]:
            models_dir = (r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440"
                          r"\MajorProject\models")
            # map each supported model to its log message and prebuilt index file
            index_files = {
                "glove-twitter-100": ("Loading Twitter 100", r"\glove-twitter-100-5-trees.ann"),
                "glove-twitter-200": ("Loading Twitter 200", r"\glove-twitter-200-5-trees.ann"),
                "glove-wiki-300": ("Loading Wiki 300", r"\glove-wiki-300-5-trees.ann"),
                "glove-wiki-100": ("Loading Wiki 100", r"\glove-wiki-100-5-trees.ann"),
            }
            if model_name in index_files:
                debug_msg, index_file = index_files[model_name]
                if self.full_log is not None:
                    self.full_log.debug(debug_msg)
                indexer = AnnoyIndexer()
                indexer.load(models_dir + index_file)
            if self.full_log is not None:
                self.full_log.info("Done loading model indexer")
        else:
            if self.full_log is not None:
                self.full_log.warning("No indexer selected, using default")

        return indexer
Code Example #23
File: w2v_fcst.py Project: MathewXJ/PycharmProjects
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.similarities.index import AnnoyIndexer
import os
from app.common.config import model_path
from app.util.pre_model import W2V_VOCABULARY_SET, VOCABULARY_SET
from app.util.remove_utils import remove_not_sports

# Load the specified model from the model directory
path_to_model = os.path.join(model_path, 'word2vec')
model = Word2Vec.load(path_to_model)

# Load the annoy indexer from disk
path_to_indexer = os.path.join(model_path, 'annoy_indexer_100')
annoy_indexer_100 = AnnoyIndexer()
annoy_indexer_100.load(path_to_indexer)
annoy_indexer_100.model = model


# Use the model to compute associations for the input words
# 2018-03-08: use the indexer to fix the high CPU usage problem
def associate_words(words, cont_type, with_model=model, top_n=10):
    words = [w.strip() for w in words]
    words = list(filter(lambda x: x in VOCABULARY_SET, words))
    res = {}
    tops = []
    if words is None or len(words) == 0:
        return res
    for i in range(len(words)):
        try:
            tops = (with_model.most_similar(positive=words[0:(len(words) - i)],
Code Example #24
class Doc2VecModel:

    BASE_WIKI_QUERY = "https://en.wikipedia.org/w/api.php?action=query&format=json&pageids="
    stopword_list = stopwords.words('english')

    def __init__(self, modelname):
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        self.modelname = modelname
        if self.modelname is None:
            train_corpus = self._get_training_iterator()
            self.model = Doc2Vec(vector_size=100, min_count=5, workers=7)
            self.model.build_vocab(train_corpus, progress_per=10000)
        else:
            self.model = Doc2Vec.load(self.modelname)

        self.nns_method_init_dict = {
            NNSMethod.BRUTE: True,
            NNSMethod.KD_TREE: False,
            NNSMethod.ANNOY: False
        }

    def infer_file(self, filename, n=10):
        with open(filename, 'r') as f:
            lines = f.readlines()
        lines = ' '.join(lines)
        return self.infer(lines, n)

    def infer(self,
              string,
              n=10,
              nnsmethod=NNSMethod.ANNOY,
              annoymodelpath="gensim_annoy"):
        self._initialize_nns_method(nnsmethod, annoymodelpath)
        words = self._preprocess(string)
        # Set the random seed to make the inferred vector deterministic
        self.model.random = np.random.mtrand.RandomState(1337)
        inferred_vector = self.model.infer_vector(words)
        ids, dists = self._calculate_most_similar(inferred_vector, n,
                                                  nnsmethod)
        titles = self._get_title_from_pageids(ids)
        return titles, dists

    def train(self, epochs):
        train_corpus = self._get_training_iterator()
        self.model.train(train_corpus,
                         total_examples=self.model.corpus_count,
                         epochs=epochs,
                         report_delay=10)
        self.model.save(self.modelname)

    def _initialize_nns_method(self, nnsmethod, annoymodelpath):
        if self.nns_method_init_dict[nnsmethod]: return
        if nnsmethod == NNSMethod.KD_TREE:
            print("Building KD tree..")
            self.tree = cKDTree(self.model.docvecs.vectors_docs)
            print("Finished building KD tree.")
            self.keys = list(self.model.docvecs.doctags.keys())
        elif nnsmethod == NNSMethod.ANNOY:
            if not os.path.isfile(annoymodelpath):
                print("Generating annoy index...")
                self.annoy_indexer = AnnoyIndexer(self.model, 50)
                print("Finished generating annoy index.")
                self.annoy_indexer.save(annoymodelpath)
            else:
                self.annoy_indexer = AnnoyIndexer()
                self.annoy_indexer.load(annoymodelpath)
                self.annoy_indexer.model = self.model
        # mark this method as initialized so repeated calls skip the rebuild
        self.nns_method_init_dict[nnsmethod] = True

    def _calculate_most_similar(self, vector, n, nnsmethod):
        # time.clock() was removed in Python 3.8; use perf_counter() instead
        start_time = time.perf_counter()
        if nnsmethod == NNSMethod.BRUTE:
            tops = self.model.docvecs.most_similar([vector], topn=n)
            ids, dists = [t[0] for t in tops], [t[1] for t in tops]
        if nnsmethod == NNSMethod.KD_TREE:
            dists, indices = self.tree.query(vector, k=n)
            ids = [self.keys[i] for i in indices]
        if nnsmethod == NNSMethod.ANNOY:
            tops = self.model.docvecs.most_similar([vector],
                                                   topn=n,
                                                   indexer=self.annoy_indexer)
            ids, dists = [t[0] for t in tops], [t[1] for t in tops]
        print(f"Time using {nnsmethod} - {time.perf_counter() - start_time}")
        # return ids first, then distances, consistently for all three methods
        return ids, dists

    def _preprocess(self, string):
        string = string.lower()
        string = re.sub(r'[^a-z\s]+', '', string)
        words = nltk.word_tokenize(string)
        return [word for word in words if word not in self.stopword_list]

    def _get_training_iterator(self):
        home = os.path.expanduser("~")
        path = os.path.join(
            home, "Documents",
            "text")  # Data is assumed to be in ~/Documents/text
        files = glob.glob(os.path.join(path, "**/wiki_*"), recursive=True)
        return TaggedDocumentGenerator(files)

    def _get_title_from_pageids(self, ids):
        ids = '|'.join(ids)
        query = self.BASE_WIKI_QUERY + ids
        response = urlopen(query)
        dic = json.loads(response.read())
        return [
            v['title'] if 'title' in v else "PageId: " + str(v['pageid'])
            for v in dic['query']['pages'].values()
        ]
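
A hedged usage sketch for Doc2VecModel; the saved model file and the query string are illustrative assumptions, not from the source project.

# Hypothetical usage (model file and query text are placeholders)
model = Doc2VecModel("wiki_doc2vec.model")
titles, dists = model.infer("neural networks in biology", n=5)
for title, dist in zip(titles, dists):
    print(title, dist)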
Code Example #25
import codecs, json
from collections import defaultdict

import numpy as np
from matplotlib import pyplot as plt

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer

wv_ent = KeyedVectors.load_word2vec_format('entity2vec.bin', binary=True)
annoy_index_ent = AnnoyIndexer()
annoy_index_ent.load('entity2vec.index')
annoy_index_ent.model = wv_ent

wv_rel = KeyedVectors.load_word2vec_format('relation2vec.bin', binary=True)
annoy_index_rel = AnnoyIndexer()
annoy_index_rel.load('relation2vec.index')
annoy_index_rel.model = wv_rel


def tsne_vis(X, labels, name):
    tsne = TSNE(n_components=2).fit_transform(X)
    plt.figure(figsize=(50, 50))
    for i, label in enumerate(labels):
        x, y = tsne[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
Code Example #26
    annoy_index.save(annoy_file)


# extend_glove()
# build_word2vec()

info('loading model')
model = KeyedVectors.load(w2v_model)
info(model)

info('init sims')
model.init_sims()

# build_annoy(model)
info('loading annoy indexer')
annoy_index = AnnoyIndexer()
annoy_index.load(annoy_file)
annoy_index.model = model

noise = np.random.random([DIM])
noise = np.zeros(DIM)
info('querying with Annoy')
with DisableLogger():
    val = model.most_similar([noise, noise], topn=3, indexer=annoy_index)
info(val)

info('querying with gensim')
with DisableLogger():
    val = model.most_similar([noise, noise], topn=1)
info(val)
Code Example #27
class Recommander(object):
    def __init__(self, vec_file, pap, pat, pro):
        # self.wm = gensim.models.KeyedVectors.load_word2vec_format(vec_file,binary=True)
        self.wm = gensim.models.word2vec.Word2Vec.load_word2vec_format(
            vec_file, binary=True)
        self.paper_index = AnnoyIndexer()
        self.paper_index.load(pap)
        self.patent_index = AnnoyIndexer()
        self.patent_index.load(pat)
        self.project_index = AnnoyIndexer()
        self.project_index.load(pro)
        self.t2v = Convert2Vec(self.wm)
        self.cuttor = FilterCut()
        self.db = DB()
        self.featureIndex = self.buildFeatureIndex()

    def buildFeatureIndex(self):
        paperFeature = open(
            "/testdata400/data/recommender/data0828/feature/paper_feature.txt",
            'r')
        patentFeature = open(
            "/testdata400/data/recommender/data0828/feature/patent_feature.txt",
            'r')
        projectFeature = open(
            "/testdata400/data/recommender/data0828/feature/project_feature.txt",
            'r')
        featureIndex = {}
        featureIndex['paper'] = self.loadFeature(paperFeature)
        featureIndex['patent'] = self.loadFeature(patentFeature)
        featureIndex['project'] = self.loadFeature(projectFeature)
        return featureIndex

    def loadFeature(self, file):
        file = file.readlines()
        index = {}
        index['field'] = {}
        index['type'] = {}
        index['province'] = {}
        index['unit'] = {}
        for line in file:
            feature = line.split('\t')
            if feature[1] not in index['field']:
                index['field'][feature[1]] = []
            index['field'][feature[1]].append(feature[0])
            if feature[2] not in index['type']:
                index['type'][feature[2]] = []
            index['type'][feature[2]].append(feature[0])
            if feature[3] not in index['province']:
                index['province'][feature[3]] = []
            index['province'][feature[3]].append(feature[0])
            if feature[4] not in index['unit']:
                index['unit'][feature[4]] = []
            index['unit'][feature[4]].append(feature[0])
        return index

    # Filter papers, projects, and patents
    def filter(self, typee, topDocs, filterParams, topN):
        topDocIds = [i for i, j in topDocs]
        if not (filterParams[0] == '' or filterParams[0] == '-1' or typee
                == 'project'):  # field; projects have no type and need no filtering; '' or '-1' means no filtering
            if filterParams[0] not in self.featureIndex[typee]['field']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['field'][filterParams[0]]))
        if not (filterParams[1] == '' or filterParams[1] == '-1'):  # type
            if filterParams[1] not in self.featureIndex[typee]['type']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['type'][filterParams[1]]))
        if not (filterParams[2] == '' or filterParams[2] == '-1'):  # province
            if filterParams[2] not in self.featureIndex[typee]['province']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['province'][filterParams[2]]))
        if not (filterParams[3] == '' or filterParams[3] == '-1'):  # unit
            if filterParams[3] not in self.featureIndex[typee]['unit']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['unit'][filterParams[3]]))
        result = []
        for i in topDocs:
            if i[0] in topDocIds:
                result.append(i)
            if len(result) == topN:
                break
        return result

    # Do not filter by region, and return all matching documents rather than just the top-n
    # def filterForExpert(self, typee, topDocs, filterParams):
    #     topDocIds = [i for i,j in topDocs]
    #     if not (filterParams[0] == '' or filterParams[
    #         0] == '-1' or typee == 'project'):  # field; projects have no type and need no filtering; '' or '-1' means no filtering
    #         if filterParams[0] not in self.featureIndex[typee]['field']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['field'][filterParams[0]]))
    #     if not (filterParams[1] == '' or filterParams[1] == '-1'):  # type
    #         if filterParams[1] not in self.featureIndex[typee]['type']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['type'][filterParams[1]]))
    #     if not (filterParams[3] == '' or filterParams[3] == '-1'):  # unit
    #         if filterParams[3] not in self.featureIndex[typee]['unit']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['unit'][filterParams[3]]))
    #     result = []
    #
    #     topDocsMap = {}
    #     for i in range(len(topDocs)):
    #         topDocsMap[topDocs[i][0]]=topDocs[i][1]
    #     for id in topDocIds:
    #         listTemp = [id,topDocsMap[id]]
    #         result.append(listTemp)
    #     return result

    def most_similar_paper(self, text, topn=10):
        vec = self.t2v.text2v(text, self.cuttor)
        return self.paper_index.most_similar(vec, topn)

    def most_similar_patent(self, text, topn=10):
        vec = self.t2v.text2v(text, self.cuttor)
        return self.patent_index.most_similar(vec, topn)

    def most_similar_project(self, text, topn=10):
        vec = self.t2v.text2v(text, self.cuttor)
        return self.project_index.most_similar(vec, topn)

    def getSimExpertsIds(self, topDocs):
        expertInfoOut = {}
        expertMap = {}
        authorSeqWeight = [1.0, 0.85, 0.7, 0.5]
        for typee in topDocs:
            order = {}
            order[typee] = {}
            k = 0
            for i, j in topDocs[typee]:
                order[typee][i] = k
                k = k + 1
            ids = [i for i, j in topDocs[typee]]
            docExpertIds = self.db.getAuthors(typee, ids)
            for id in docExpertIds:
                if not self.db.idInDB(typee, id):
                    print "docId:" + id + "is not in db"
                    continue
                expertIds = docExpertIds[id]
                qs = 1.0
                sim = qs
                for i, j in topDocs[typee]:
                    if i == id:
                        sim = j * sim
                        break
                for i in range(len(expertIds)):
                    if i >= 4:  # consider at most 4 authors per work
                        break
                    if expertIds[i] not in expertInfoOut:
                        expertInfoOut[expertIds[i]] = []
                    expertInfoOut[expertIds[i]].append([
                        typee + str(order[typee][id]), sim * authorSeqWeight[i],
                        i
                    ])
                    if expertIds[i] not in expertMap:
                        expertMap[expertIds[i]] = []
                    expertMap[expertIds[i]].append(sim * authorSeqWeight[i])
        return expertMap, expertInfoOut

    # Experts are extracted from works. Some experts rank high without province
    # filtering but fall behind after it; to avoid this, first extract experts
    # from works that are not region-filtered, then filter those experts by
    # region, and if fewer than topN remain, take the rest from the
    # region-filtered works.
    #
    # This function needs refactoring, but the requirements will most likely
    # change, so it is left alone for now.
    def most_similar_expert(self, topPapers, topPatents, topProjects,
                            filterParams, expertTopN):
        file = open("config.ini", 'r')
        config = ConfigParser.ConfigParser()
        config.readfp(file)
        LEN = int(config.get('global', 'len'))  # how many of an expert's works to consider
        COE = float(config.get('global', 'coe'))  # coefficient applied to an expert's similarities from the second work on
        topDocs = {}
        topDocs['paper'] = self.filter('paper', topPapers, filterParams, 50)
        topDocs['patent'] = self.filter('patent', topPatents, filterParams, 50)
        topDocs['project'] = self.filter('project', topProjects, filterParams,
                                         15)
        expertMap, expertInfoOut = self.getSimExpertsIds(
            topDocs)  # keyed by expert id; value is the similarity list of their works
        expertScoreMap = {}  # keyed by expert; value is the score
        for expert in expertMap:
            expertMap[expert].sort(reverse=True)
            sim = expertMap[expert][0]
            for i in range(1, len(expertMap[expert])):
                if i >= LEN:
                    break
                sim = sim + COE * expertMap[expert][i]
            expertScoreMap[expert] = sim
        result = sorted(expertScoreMap.items(),
                        key=lambda item: item[1],
                        reverse=True)[0:expertTopN]
        out = []
        for i in result:
            if i[0] in expertInfoOut:
                out.append({i[0]: expertInfoOut[i[0]]})
                # out[i[0]]=expertInfoOut[i[0]]
        self.printOut(out, LEN)
        return result

    def printOut(self, out, l):
        name = str('log/' + time.strftime("%Y-%m-%d %H-%M-%S" +
                                          ".txt", time.localtime()))
        print name
        output = open(name, 'w')
        for expert in out:
            for i in expert:
                list = expert[i]
                expert[i] = sorted(list, key=lambda doc: doc[1],
                                   reverse=True)[0:l]
        for expert in out:
            for i in expert:
                # print i  # author id
                output.write(i + '\n')
                list = expert[i]  # list holds the doc info
                docOrder = ''
                for j in list:
                    docOrder = docOrder + j[0] + '                  '
                # print docOrder
                output.write(docOrder + '\n')
                sim = ''
                for j in list:
                    sim = sim + str(j[1]) + '             '
                # print sim
                output.write(sim + '\n')
                expertOrder = ''
                for j in list:
                    expertOrder = expertOrder + str(
                        j[2]) + '                            '
                # print expertOrder
                output.write(expertOrder + '\n')
                output.write("\n")
        output.close()

    # def most_similar_expert(self, text, topDocs):
    #     expertMap = self.getSimExpertsIds(topDocs)  # keyed by expert id; value is the similarity list of their works
    #     expertScoreMap = {}  # keyed by expert; value is the score
    #     for expert in expertMap:
    #         expertMap[expert].sort(reverse=True)
    #         sim = expertMap[expert][0]
    #         for i in range(1, len(expertMap[expert])):
    #             if i >= 4:
    #                 break
    #             sim = sim + 0.04 * expertMap[expert][i]
    #         expertScoreMap[expert] = sim
    #     return sorted(expertScoreMap.items(), key=lambda item: item[1], reverse=True)

    def get_model(self):
        return self.wm

    def get_cuttor(self):
        return self.cuttor
Code Example #28
#
# You can save and load your indexes from/to disk to prevent having to
# construct them each time. This will create two files on disk, *fname* and
# *fname.d*. Both files are needed to correctly restore all attributes. Before
# loading an index, you will have to create an empty AnnoyIndexer object.
#
fname = '/tmp/mymodel.index'

# Persist index to disk
annoy_index.save(fname)

# Load index back
import os.path
if os.path.exists(fname):
    annoy_index2 = AnnoyIndexer()
    annoy_index2.load(fname)
    annoy_index2.model = model

# Results should be identical to above
vector = model.wv["science"]
approximate_neighbors2 = model.wv.most_similar([vector],
                                               topn=11,
                                               indexer=annoy_index2)
for neighbor in approximate_neighbors2:
    print(neighbor)

assert approximate_neighbors == approximate_neighbors2

###############################################################################
# Be sure to use the same model at load that was used originally, otherwise you
# will get unexpected behaviors.
Code Example #29
from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer

# word2vec bin
#wv = KeyedVectors.load_word2vec_format('numberbatch-en.txt', binary=False)
#wv.save_word2vec_format('numberbatch-en.bin',binary=True)

# annoy index
#wv = KeyedVectors.load_word2vec_format('numberbatch-en.bin',binary=True)
#annoy_index = AnnoyIndexer(wv,200)
#annoy_index.save('numberbatch-en.index')

# wv = KeyedVectors.load_word2vec_format('numberbatch-en.bin', binary=True)
# annoy_index = AnnoyIndexer()
# annoy_index.load('numberbatch-en.index')
# annoy_index.model = wv

wv = KeyedVectors.load_word2vec_format('glove.6B.300d.bin', binary=True)
annoy_index = AnnoyIndexer()
annoy_index.load('glove.6B.300d.index')
annoy_index.model = wv

wv.most_similar(positive=['football','win','organization'], topn=10, indexer=annoy_index)
wv.most_similar(positive=['football','win','nationality'], topn=10, indexer=annoy_index)
Code Example #30
File: test_ut.py Project: MathewXJ/PycharmProjects
# # Create the annoy indexer and attach the word2vec model
# # t0 = time.time()
# # annoy_indexer_100 = AnnoyIndexer(model, 300)
# # t1 = time.time()
# # print("Create AnnoyIndexer: {}ms".format(1000*(t1-t0)))
# #
# # # Save the annoy indexer to disk
# # annoy_indexer_100.save('annoy_indexer_100')
# # t2 = time.time()
# # print("Save AnnoyIndexer to File : {}ms".format(1000*(t2-t1)))

# Load the annoy indexer from disk
t1 = time.time()
annoy_indexer_100 = AnnoyIndexer()
annoy_indexer_100.load('annoy_indexer_100')
annoy_indexer_100.model = model
t2 = time.time()
print("从disk加载annoy indexer : {}ms".format(1000 * (t2 - t1)))

# Compute similarities
pos1 = ['上海队', '梅开二度', 'vs', '发球', '开局', '得分', '接连', '直接', '北京', '上海']
pos2 = ['火箭', '英语', 'NBA', 'VS']
pos3 = ['回放', '森林狼', '18', '赛季', '135', 'NBA', '121', '奇才', '19', '常规赛']
pos4 = ['威斯布鲁克', '雷霆队', '凯尔特人队']
pos5 = ['领衔', '球员', '十佳', '英超', '半场', '进球', '体育', '资讯', '吊射']
t2 = time.time()
print(model.most_similar(positive=pos1, topn=25, indexer=annoy_indexer_100))
t3 = time.time()
print("First Query : {}ms".format(1000 * (t3 - t2)))
print(model.most_similar(positive=pos2, topn=6, indexer=annoy_indexer_100))