Beispiel #1
0
    def _load_classifier(self, **kwargs):
        """Build the nearest-neighbour classifier selected by ``self.classifier_type``.

        For ``'ann'``, every file listed under ``self.s3_path`` that is not
        already present in ``self.temporary_directory`` is downloaded first,
        then the Annoy index ``<classifier_id>.index`` is loaded from that
        directory. For ``'knn'`` no index is loaded.

        Args:
            **kwargs: Forwarded unchanged to ``NearestNeighbors``.

        Returns:
            A ``NearestNeighbors`` instance, or ``None`` (after printing a
            notice) for any other classifier type.
        """
        if self.classifier_type == 'ann':
            for f in list_files(self.s3_conn, self.s3_path):
                filepath = os.path.join(self.temporary_directory, f)
                if not os.path.exists(filepath):
                    # Build the remote path once so the log line names exactly
                    # the object that is downloaded.
                    remote_path = os.path.join(self.s3_path, f)
                    logging.warning('calling download from %s to %s',
                                    remote_path, filepath)
                    download(self.s3_conn, filepath, remote_path)
            ann_index = AnnoyIndexer()
            ann_index.load(
                os.path.join(self.temporary_directory,
                             self.classifier_id + '.index'))
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexer=ann_index,
                                    **kwargs)

        elif self.classifier_type == 'knn':
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexed=False,
                                    **kwargs)

        else:
            print('Not implemented yet!')
            return None
Beispiel #2
0
def create_sim_dict(file, vectors, min_sim=0.55, topn=10, num_trees=200):
    """Pickle a word -> neighbours mapping computed with an Annoy index.

    An ``AnnoyIndexer`` is built over ``vectors`` with ``num_trees`` trees.
    For every key of ``vectors.vocab`` the indexer is asked for ``topn``
    results, and only the result pairs whose second element is strictly
    greater than ``min_sim`` are kept. The mapping is written to ``file``
    with ``pickle.dump``.
    """
    annoy_indexer = AnnoyIndexer(vectors, num_trees=num_trees)

    def neighbours_of(word):
        candidates = annoy_indexer.most_similar(vectors.get_vector(word), topn)
        return [pair for pair in candidates if pair[1] > min_sim]

    similar_words = {
        word: neighbours_of(word) for word in messages.pbar(vectors.vocab)
    }
    with open(file, 'wb') as sink:
        pickle.dump(similar_words, sink)
Beispiel #3
0
def get_indexer(fpath, model, room_id):
    """Load the Annoy index stored at ``fpath`` and attach ``model`` to it.

    Returns ``None`` when ``fpath`` does not exist. ``room_id`` is only used
    in the log message.
    """
    if not os.path.exists(fpath):
        # No index file on disk: None is the default "no indexer" value.
        return None

    logging.info("Use annoy_index :: room_id:%s", room_id)
    loaded = AnnoyIndexer()
    loaded.load(fpath)
    loaded.model = model
    return loaded
Beispiel #4
0
    def setUp(self):
        """Skip without annoy; otherwise train a Doc2Vec model and index it (300 trees)."""
        try:
            import annoy  # noqa: F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        trained = doc2vec.Doc2Vec(sentences, min_count=1)
        trained.init_sims()
        self.model = trained
        self.index = AnnoyIndexer(trained, 300)
        # First document vector of the model, used as the query in the tests.
        self.vector = trained.docvecs.doctag_syn0norm[0]
Beispiel #5
0
    def assertLoadedIndexEqual(self, index, model):
        """Save ``index`` to ``'index'``, reload it, and compare both indexers."""
        from gensim.similarities.index import AnnoyIndexer

        saved_as = 'index'
        index.save(saved_as)

        reloaded = AnnoyIndexer()
        reloaded.load(saved_as)
        reloaded.model = model

        # Vector size, labels and tree count must survive the round trip.
        self.assertEqual(index.index.f, reloaded.index.f)
        self.assertEqual(index.labels, reloaded.labels)
        self.assertEqual(index.num_trees, reloaded.num_trees)
Beispiel #6
0
    def testSaveLoad(self):
        """A saved index reloads with the same ``f``, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        saved_as = 'index'
        original = self.index
        original.save(saved_as)

        self.index2 = reloaded = AnnoyIndexer()
        reloaded.load(saved_as)
        reloaded.model = self.model

        self.assertEqual(original.index.f, reloaded.index.f)
        self.assertEqual(original.labels, reloaded.labels)
        self.assertEqual(original.num_trees, reloaded.num_trees)
Beispiel #7
0
    def assertLoadedIndexEqual(self, index, model):
        """Round-trip ``index`` through the file ``'index'`` and check it is unchanged."""
        from gensim.similarities.index import AnnoyIndexer

        index.save('index')

        restored = AnnoyIndexer()
        restored.load('index')
        restored.model = model

        self.assertEqual(index.index.f, restored.index.f)
        self.assertEqual(index.labels, restored.labels)
        self.assertEqual(index.num_trees, restored.num_trees)
Beispiel #8
0
    def setUp(self):
        """Skip without annoy; otherwise train a Word2Vec model and index it (10 trees)."""
        try:
            import annoy  # noqa: F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        trained = word2vec.Word2Vec(texts, min_count=1)
        trained.init_sims()
        self.model = trained
        self.index = AnnoyIndexer(trained, 10)
        # First normalised vector of the model, used as the query in the tests.
        self.vector = trained.syn0norm[0]
Beispiel #9
0
 def train_with_annoy(self):
     """Build an Annoy index (200 trees) over ``self.tc_wv_model`` and save it.

     ``self.status`` is 3 while the index is being built and 4 once it has
     been saved; a notification is pushed before and after.
     """
     self.status = 3
     push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间开始注水"))
     index_path = 'tc_index_genoy.index'
     self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
     self.annoy_index.save(index_path)
     # The exported result can be reused later without rebuilding: create an
     # empty AnnoyIndexer, load() this file, then assign the model to it.
     push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间注水完毕"))
     self.status = 4
    def testSaveLoad(self):
        """A saved index reloads from a temp file with identical metadata."""
        from gensim.similarities.index import AnnoyIndexer

        tmp_path = get_tmpfile('gensim_similarities.tst.pkl')
        original = self.index
        original.save(tmp_path)

        self.index2 = reloaded = AnnoyIndexer()
        reloaded.load(tmp_path)
        reloaded.model = self.model

        self.assertEqual(original.index.f, reloaded.index.f)
        self.assertEqual(original.labels, reloaded.labels)
        self.assertEqual(original.num_trees, reloaded.num_trees)
    def assertLoadedIndexEqual(self, index, model):
        """Save ``index`` to a temp file, reload it, and compare both indexers."""
        from gensim.similarities.index import AnnoyIndexer

        tmp_path = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(tmp_path)

        reloaded = AnnoyIndexer()
        reloaded.load(tmp_path)
        reloaded.model = model

        # Vector size, labels and tree count must survive the round trip.
        self.assertEqual(index.index.f, reloaded.index.f)
        self.assertEqual(index.labels, reloaded.labels)
        self.assertEqual(index.num_trees, reloaded.num_trees)
Beispiel #12
0
def f(process_id):
    """Load the saved model and Annoy index, run one query, and print memory use.

    ``process_id`` is accepted but not used; the OS pid of the current
    process is printed instead. The query result is discarded.
    """
    pid = os.getpid()
    print('Process Id: {}'.format(pid))
    process = psutil.Process(pid)

    new_model = Word2Vec.load('/tmp/mymodel.pkl')
    vector = new_model.wv["science"]

    annoy_index = AnnoyIndexer()
    annoy_index.load('/tmp/mymodel.index')
    annoy_index.model = new_model

    # Run the query only for its side effects (touching model and index).
    new_model.wv.most_similar([vector], topn=5, indexer=annoy_index)

    print('\nMemory used by process {}: {}\n---'.format(
        pid, process.memory_info()))
    def assertLoadedIndexEqual(self, index, model):
        """Round-trip ``index`` through a temp file and check it is unchanged."""
        from gensim.similarities.index import AnnoyIndexer

        target = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(target)

        restored = AnnoyIndexer()
        restored.load(target)
        restored.model = model

        self.assertEqual(index.index.f, restored.index.f)
        self.assertEqual(index.labels, restored.labels)
        self.assertEqual(index.num_trees, restored.num_trees)
def load_w2v():
    """Load the normalised vectors and the saved Annoy index, then wrap both.

    Returns ``Word2Vec(vectors, index=indexer)``.
    NOTE(review): ``Word2Vec`` is called with an ``index`` keyword here, so it
    is presumably a project wrapper rather than gensim's class -- confirm.
    """
    print("Loading gensim pre-trained model")
    # The memory-mapped, pre-normalised file is used because
    # KeyedVectors.load_word2vec_format("SO_vectors_200.bin", binary=True)
    # was intolerably slow and large; the vectors were normalised with the
    # code found at https://stackoverflow.com/a/56963501
    vectors = KeyedVectors.load("SO_vectors_normed", mmap='r')

    # Load the provided Annoy index. To generate a new one in RAM instead,
    # use AnnoyIndexer(vectors, 3).
    # NOTE(review): the original note called that number the n-gram size and
    # recommended 2; elsewhere the second AnnoyIndexer argument is the number
    # of trees -- confirm which is meant.
    indexer = AnnoyIndexer()
    indexer.load('SO_vectors_normed_annoy_index')

    return Word2Vec(vectors, index=indexer)
Beispiel #15
0
def load_annoy(annoypath, model):
    '''
    Return an Annoy index for ``model``, building and saving it on first use.

    If ``annoypath`` does not exist, a new index with 200 trees is built from
    ``model`` and saved there (timings are printed). Otherwise the saved index
    is loaded and ``model`` is attached to it, so both branches return an
    indexer that carries its model.

    :param annoypath: path of the saved Annoy index file
    :type annoypath: str
    :param model: model the index is built from or attached to
    :type model: Word2Vec
    :return: the freshly built or loaded indexer
    :rtype: AnnoyIndexer
    '''
    if not os.path.exists(annoypath):
        print("开始构建annoy索引:当前时间 : " +
              time.asctime(time.localtime(time.time())))
        starttime12 = time.time()
        aindex = AnnoyIndexer(model, 200)
        print("构建索引完毕 %.2f secs" % (time.time() - starttime12))
        # Persist the index so later calls can load it instead of rebuilding.
        print("开始保存annoy索引")
        starttime13 = time.time()
        aindex.save(annoypath)
        print("保存索引完毕 %.2f secs" % (time.time() - starttime13))
    else:
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        # load() is not given the model, so attach it explicitly; previously
        # the ``model`` argument was ignored on this branch.
        aindex.model = model
    return aindex
Beispiel #16
0
 def load_index(self, path):
     """Load every saved Annoy index below ``path`` into a nested dict.

     The directory ``path + field + '/' + unit_type + '/'`` is walked for each
     combination of ``field_list`` and ``unit_types``. Files whose name has
     exactly one dot are loaded, keyed by the part before the dot (logged as
     the province).

     Returns:
         ``{field: {unit_type: {name: AnnoyIndexer}}}``
     """
     index = {}
     loaded = 0
     for field in field_list:
         logger.info(u'---------field:' + field)
         per_field = index[field] = {}
         for unit_type in unit_types:
             per_unit = per_field[unit_type] = {}
             root = path + field + '/' + unit_type + '/'
             for parent, _dirnames, filenames in os.walk(root):
                 for filename in filenames:
                     parts = filename.split('.')
                     # Only "<name>.<ext>" files are index files.
                     if len(parts) != 2:
                         continue
                     loaded += 1
                     province = parts[0]
                     logger.info(
                         u'创建AnnoyIndexer %s:field=%s,unit_type=%s,province=%s'
                         % (loaded, field, unit_type, province))
                     indexer = AnnoyIndexer()
                     per_unit[str(province)] = indexer
                     indexer.load(os.path.join(parent, filename))
     return index
def build_graph(filename,
                TOPN,
                A_name,
                indice2word_name,
                annoy=False,
                dim=100,
                tree_num=20):
    """Build a sparse neighbour matrix over a word2vec vocabulary and save it.

    The model is read with ``read_w2v(filename, dim)``. Each vocabulary word
    gets an integer index, and ``add_neighbors`` fills a ``V x V`` float32
    matrix (optionally through an Annoy index). The matrix is saved in CSR
    form and the index -> word mapping is pickled.

    Args:
        filename: Passed to ``read_w2v`` to load the model.
        TOPN: Forwarded to ``add_neighbors``.
        A_name: Destination passed to ``save_sparse_csr`` for the matrix.
        indice2word_name: Path of the pickle file for the index -> word dict.
        annoy: When true, build an ``AnnoyIndexer`` and hand it to
            ``add_neighbors``.
        dim: Passed to ``read_w2v``.
        tree_num: Number of trees for the Annoy index (only used if ``annoy``).
    """
    model = read_w2v(filename, dim)
    V = len(model.wv.vocab)
    print("Num. vocab = %i" % V)
    word_indice_dic = {word: i for i, word in enumerate(model.wv.vocab)}
    indice2word = {i: word for word, i in word_indice_dic.items()}
    A = dok_matrix((V, V), dtype=np.float32)
    if annoy:
        print("Using ANNOY...")
        from gensim.similarities.index import AnnoyIndexer
        annoy_index = AnnoyIndexer(model, tree_num)
        add_neighbors(A, TOPN, model, word_indice_dic, annoy_index=annoy_index)
    else:
        add_neighbors(A, TOPN, model, word_indice_dic)

    save_sparse_csr(A_name, A.tocsr())
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of leaking the handle.
    with open(indice2word_name, "wb") as fout:
        pickle.dump(indice2word, fout)
Beispiel #18
0
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Checks AnnoyIndexer results against a small Doc2Vec model."""

    def setUp(self):
        """Skip without annoy; otherwise train the model and build a 300-tree index."""
        try:
            import annoy  # noqa: F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0 itself."""
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]

        self.assertEqual(doc, 0)
        # The score is a float coming from an approximate index, so compare
        # with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same labels in order."""
        approx_neighbors = self.model.docvecs.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)
Beispiel #19
0
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Checks AnnoyIndexer results against a small Word2Vec model."""

    def setUp(self):
        """Skip without annoy; otherwise train the model and build a 10-tree index."""
        try:
            import annoy  # noqa: F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        # The score is a float coming from an approximate index, so compare
        # with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same words in order."""
        approx_neighbors = self.model.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)
Beispiel #20
0
def get_similarity(model, args):
    """Append each word's top-k Annoy neighbours to a CSV file in chunks.

    A 10-tree ``AnnoyIndexer`` is built over ``model``. Words are visited in
    sorted order; each result pair from ``most_similar`` is rendered as
    ``"<first>=<second>"``. Every ``args.save_one_time`` words the buffered
    rows are appended to ``args.output_file`` (no header, no index column),
    and a final partial chunk is written at the end.

    Returns:
        The neighbour lists still buffered when the loop ends: the last
        partial chunk, or an empty list if everything was flushed in the loop.
    """
    indexer = AnnoyIndexer(model, 10)
    pending = 0
    chunk_no = 0
    items = []
    topk = []

    def dump_chunk():
        # Reads the current buffers of the enclosing call at call time.
        print("save to csv chunk no: {}".format(chunk_no))
        frame = pd.DataFrame({'item': items, 'topk': topk})
        frame.to_csv(args.output_file, mode='a', header=False, index=False)

    with tqdm.tqdm(desc="get_similarity",
                   total=len(model.wv.vectors)) as progress:
        for word in np.sort(list(model.wv.vocab.keys())):
            items.append(word)
            neighbours = model.wv.most_similar(word,
                                               topn=args.k,
                                               indexer=indexer)
            topk.append(
                ['{}={}'.format(first, second) for first, second in neighbours])
            pending += 1
            if pending % args.save_one_time == 0:
                dump_chunk()
                pending = 0
                chunk_no += 1
                items = []
                topk = []
            progress.update(1)
        if pending > 0:
            dump_chunk()
    return topk
 def _initialize_nns_method(self, nnsmethod, annoymodelpath):
     """Prepare the data structure for the given nearest-neighbour method.

     Does nothing if ``self.nns_method_init_dict[nnsmethod]`` is truthy.
     For ``KD_TREE`` a ``cKDTree`` is built over the document vectors and the
     doctag keys are stored. For ``ANNOY`` the index is loaded from
     ``annoymodelpath`` if that file exists; otherwise it is built with 50
     trees and saved there.
     """
     if self.nns_method_init_dict[nnsmethod]:
         return

     if nnsmethod == NNSMethod.KD_TREE:
         print("Building KD tree..")
         self.tree = cKDTree(self.model.docvecs.vectors_docs)
         print("Finished building KD tree.")
         self.keys = list(self.model.docvecs.doctags.keys())
     elif nnsmethod == NNSMethod.ANNOY:
         if os.path.isfile(annoymodelpath):
             # Reuse the saved index and re-attach the model to it.
             self.annoy_indexer = AnnoyIndexer()
             self.annoy_indexer.load(annoymodelpath)
             self.annoy_indexer.model = self.model
         else:
             print("Generating annoy index...")
             self.annoy_indexer = AnnoyIndexer(self.model, 50)
             print("Finished generating annoy index.")
             self.annoy_indexer.save(annoymodelpath)
def train_item2vec(df=None, sessions=None, samples=None):
    """Train (or continue training) the item2vec model and rebuild its Annoy index.

    The training corpus is ``samples`` when given, otherwise
    ``RoomsGenerator(df, sessions)``. If a saved model exists its vocabulary
    is updated and training continues; otherwise a new ``Word2Vec`` model is
    trained with the settings from ``st``. The model is saved, then a
    100-tree Annoy index is built from it and saved too.

    Raises:
        NotImplementedError: if both ``df`` and ``samples`` are ``None``.
    """
    if df is None and samples is None:
        raise NotImplementedError(
            ">>> Must be specific no items. Can not set `df` and `samples` to None"  # noqa
        )

    gen_rooms = RoomsGenerator(df, sessions) if samples is None else samples

    start_ = time.time()
    model_i2v_path = os.path.join(st.BASE_MODEL,
                                  "{}.model".format(st.ITEM2VEC_KEY))
    if not os.path.exists(model_i2v_path):
        # No saved model yet: train from scratch.
        model = Word2Vec(gen_rooms,
                         sg=st.SG,
                         size=st.I2V_DIM,
                         window=st.WINDOWS,
                         min_count=st.MIN_COUNT,
                         workers=st.WORKERS,
                         iter=st.EPOCHS,
                         sample=st.SAMPLE,
                         negative=st.NS,
                         compute_loss=st.COMPUTE_LOSS,
                         callbacks=[Timer(start_)])
    else:
        logging.info("Load pre-train model")
        model = Word2Vec.load(model_i2v_path)
        logging.info("Vocabulary before re-training: %d", len(model.wv.vocab))

        # Extend the vocabulary with the new corpus before training on it.
        model.build_vocab(gen_rooms, update=True)
        logging.info("Vocabulary after re-training: %d", len(model.wv.vocab))
        model.train(gen_rooms,
                    total_examples=model.corpus_count,
                    epochs=model.iter,
                    callbacks=())
        logging.info("Pre-train model took %d's'", time.time() - start_)

    logging.info("Saving item2vec model")
    model.save(model_i2v_path)

    logging.info("Build annoy index for item2vec model")
    annoy_path = os.path.join(st.BASE_MODEL,
                              "{}.model".format(st.ANNOY_INDEX_KEY))
    AnnoyIndexer(model, 100).save(annoy_path)
Beispiel #23
0
def get_annoy(w2v, embedding_type='w2v'):
    """Return an Annoy index for ``w2v``, loading a saved one when available.

    The index file lives in ``data_dir`` and its name encodes ``dims``, the
    embedding type and the vocabulary size. If the file exists it is loaded
    and ``w2v`` is attached to it; otherwise a new index is built from
    ``w2v`` and saved.

    Args:
        w2v: Word vectors to index; must expose ``vocab``.
        embedding_type: Label used only in the index file name.

    Returns:
        The loaded or newly built ``AnnoyIndexer``.
    """
    # NOTE(review): ``dims`` is passed as AnnoyIndexer's second positional
    # argument, which other callers use as the number of trees -- confirm
    # the intended meaning.
    dims = 100
    annoy_file_name = data_dir + '/annoy_index_' + '_' + str(dims) + '_' + embedding_type + '_' + str(len(w2v.vocab))
    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        # Use the ``w2v`` argument; the previous code referenced the name
        # ``word_vectors``, which this function never defines.
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
Beispiel #24
0
class WordCalc:
    """Word-vector similarity helper with a staged loading process."""

    def __init__(self):
        """Start in the untrained state and push a startup notification."""
        # Status codes:
        # 0: not trained
        # 1: gensim vectors are being loaded
        # 2: gensim vectors are usable
        # 3: annoy index is being built
        # 4: annoy index is usable
        self.status = 0
        push.push_to_rtx(push.generate_rtx_markdown("wordcalc出仓状态良好"))

    def train_with_gensim(self):
        """Load the Tencent word2vec text file into ``self.tc_wv_model``."""
        self.status = 1
        push.push_to_rtx(push.generate_rtx_markdown("gensim转子引擎开始加热"))
        vectors_path = './Tencent_AILab_ChineseEmbedding.txt'
        self.tc_wv_model = KeyedVectors.load_word2vec_format(vectors_path,
                                                             binary=False)
        push.push_to_rtx(push.generate_rtx_markdown("gensim转子引擎加热完毕"))
        self.status = 2

    def train_with_annoy(self):
        """Build a 200-tree Annoy index over ``self.tc_wv_model`` and save it."""
        self.status = 3
        push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间开始注水"))
        index_path = 'tc_index_genoy.index'
        self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
        self.annoy_index.save(index_path)
        # The exported result can be reused later without rebuilding: create
        # an empty AnnoyIndexer, load() this file, then assign the model to it.
        push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间注水完毕"))
        self.status = 4

    def calc(self, positive_set, negative_set):
        """Return the top-10 words most similar to the positive/negative sets.

        The exact search is used in status 2 or 3 and the Annoy index in
        status 4; any other status returns an empty list.
        """
        if self.status in (2, 3):
            return self.tc_wv_model.most_similar(positive=positive_set,
                                                 negative=negative_set,
                                                 topn=10)
        if self.status == 4:
            return self.tc_wv_model.most_similar(positive=positive_set,
                                                 negative=negative_set,
                                                 indexer=self.annoy_index,
                                                 topn=10)
        return []
    def testAnnoyIndexingOfKeyedVectors(self):
        """An index built from plain KeyedVectors has 10 trees and sane results."""
        from gensim.similarities.index import AnnoyIndexer

        vectors = KeyedVectors.load_word2vec_format(datapath('lee_fasttext.vec'))
        indexer = AnnoyIndexer(vectors, 10)

        self.assertEqual(indexer.num_trees, 10)
        self.assertVectorIsSimilarToItself(vectors, indexer)
        self.assertApproxNeighborsMatchExact(vectors, vectors, indexer)
 def index_vector(self, dimensions=300, save=False):
     '''
     Return the annoy index used in function 'is_word_pairs_similar'.

     The index is loaded from ``../preprocessed/annoy.index`` (relative to the
     current working directory) when that file exists. Otherwise it is built
     from ``self.embedding`` and written to that path only if ``save`` is true.
     Using annoy_index, execution may be slower than normal index.

     NOTE(review): ``dimensions`` is passed as AnnoyIndexer's second
     argument, which other callers use as the number of trees -- confirm.
     '''
     index_file = Path.cwd().parent.joinpath('preprocessed/annoy.index')
     if not index_file.exists():
         built = AnnoyIndexer(self.embedding, dimensions)
         if save:
             built.save(str(index_file))
         return built

     loaded = AnnoyIndexer()
     loaded.load(str(index_file))
     loaded.model = self.embedding
     return loaded
Beispiel #27
0
def main():
    """Train FastText and Word2Vec models on the configured corpus and save them.

    The config file (``--config_file``) supplies the corpus path under
    ``[general]`` and the model directory and hyper-parameters under
    ``[word]``. The model directory is deleted if present and recreated.
    The FastText model, the Word2Vec model and a 100-tree Annoy index of the
    Word2Vec model are written into it.
    """
    cli = argparse.ArgumentParser(description='Trains word embeddings')
    cli.add_argument('--config_file',
                     type=str,
                     default='configs/echoes_local.config',
                     help='location of the configuration file')
    args = cli.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)
    word_cfg = config['word']

    print(word_cfg['model_dir'])

    sentences = Sentences(input_file=config['general']['corpus_file'])
    # Start from an empty model directory; a missing one is fine.
    try:
        shutil.rmtree(word_cfg['model_dir'])
    except FileNotFoundError:
        pass
    os.mkdir(word_cfg['model_dir'])

    def hyperparameters():
        # Both models are trained with the same settings from the config.
        return dict(size=int(word_cfg['size']),
                    window=int(word_cfg['window']),
                    min_count=int(word_cfg['min_count']),
                    iter=int(word_cfg['epochs']),
                    workers=int(word_cfg['workers']))

    logging.info('Building fasttext model...')
    model = FastText(sentences, **hyperparameters())
    model.init_sims()
    model.save(f"{config['word']['model_dir']}/ft_model")
    logging.info(f"Saved fasttext model under {config['word']['model_dir']}")

    logging.info('Building word2vec model...')
    model = Word2Vec(sentences, **hyperparameters())
    model.init_sims()
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(f"{config['word']['model_dir']}/annoy_model")
    model.save(f"{config['word']['model_dir']}/w2v_model")
    logging.info(f"Saved word2vec model under {config['word']['model_dir']}")
Beispiel #28
0
class WordNeighbors:
    """Nearest-neighbour lookup backed by Word2Vec + Annoy, with a FastText fallback."""

    def __init__(self, model_dir):
        """Load ``ft_model``, ``w2v_model`` and ``annoy_model`` from ``model_dir``."""
        def in_model_dir(name):
            return os.path.join(model_dir, name)

        self.ft_model = FastText.load(in_model_dir('ft_model'))
        self.w2v_model = Word2Vec.load(in_model_dir('w2v_model'))
        self.annoy_index = AnnoyIndexer()
        self.annoy_index.load(in_model_dir('annoy_model'))

    def query(self, w, topn):
        """Return the ``topn`` neighbours of ``w``.

        Words known to the Word2Vec model are looked up through the Annoy
        index. Other words go to the FastText model, and a ``KeyError`` from
        it yields an empty list.
        """
        if w not in self.w2v_model:
            try:
                return self.ft_model.most_similar(w, topn=topn)
            except KeyError:
                return []

        vector = self.w2v_model[w]
        return self.w2v_model.most_similar([vector],
                                           topn=topn,
                                           indexer=self.annoy_index)
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Checks AnnoyIndexer queries and persistence against a small Doc2Vec model."""

    def setUp(self):
        """Skip without annoy; otherwise train the model and build a 300-tree index."""
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0 itself."""
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]

        self.assertEqual(doc, 0)
        # The score is a float coming from an approximate index, so compare
        # with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same labels in order."""
        approx_neighbors = self.model.docvecs.most_similar([self.vector],
                                                           topn=5,
                                                           indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving writes both the index file and its ``.d`` companion file."""
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises IOError."""
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved index reloads with the same ``f``, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
Beispiel #30
0
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Checks AnnoyIndexer queries and persistence against a small Word2Vec model."""

    def setUp(self):
        """Skip without annoy; otherwise train the model and build a 10-tree index."""
        try:
            import annoy  # noqa: F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.wv.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        # The score is a float coming from an approximate index, so compare
        # with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same words in order."""
        approx_neighbors = self.model.most_similar([self.vector],
                                                   topn=5,
                                                   indexer=self.index)
        exact_neighbors = self.model.most_similar(positive=[self.vector],
                                                  topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving writes both ``index`` and ``index.d`` in the working directory."""
        self.index.save('index')
        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises IOError."""
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved index reloads with the same ``f``, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
Beispiel #31
0
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Checks AnnoyIndexer queries and persistence against a small Doc2Vec model."""

    def setUp(self):
        """Skip without annoy; otherwise train the model and build a 300-tree index."""
        try:
            import annoy  # noqa: F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0 itself."""
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]

        self.assertEqual(doc, 0)
        # The score is a float coming from an approximate index, so compare
        # with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same labels in order."""
        approx_neighbors = self.model.docvecs.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving writes both the index file and its ``.d`` companion file."""
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises IOError."""
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved index reloads with the same ``f``, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
Beispiel #32
0
def build_index():
    """Build and save a 100-tree Annoy index for every vector file under ``vecs``.

    Each file found while walking the ``vecs`` directories is loaded as a
    text-format word2vec model. Its index is saved to the same path with
    ``'vec_new'`` replaced by ``'ind'`` and everything after the first ``'.'``
    of the path replaced by ``'.ind'``. Target directories are created as
    needed.
    """
    for vec in vecs:
        # os.walk yields: 1. the parent directory, 2. the sub-directory names
        # (without path), 3. the file names.
        for parent, _dirnames, filenames in os.walk(vec):
            target_dir = parent.replace('vec_new', 'ind')
            for filename in filenames:
                origin_path = os.path.join(parent, filename)
                target_path = origin_path.replace(
                    'vec_new', 'ind').split('.')[0] + '.ind'
                # Log which file is converted and where the index goes.
                logger.info('origin path:' + origin_path)
                logger.info('target path:' + target_path)
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
                    origin_path, binary=False)
                AnnoyIndexer(model, 100).save(target_path)
Beispiel #33
0
class Predictor:
    """Word-association helper backed by a Word2Vec model and its Annoy index."""

    def __init__(self):
        """Load the model and the saved Annoy index from the ``data`` directory."""
        self.model = Word2Vec.load('data/word2vec.model')
        self.vocab = self.model.wv.vocab
        self.annoy_index = AnnoyIndexer()
        self.annoy_index.load('data/word2vec_idx.ann')
        self.annoy_index.model = self.model

    def explain(self, word, n_words):
        """Return ``n_words`` neighbours of ``word``, skipping the first hit.

        The lemmatised neighbours are printed. A ``KeyError`` yields the
        string ``'Wrong word'``.
        """
        try:
            hits = self.model.wv.most_similar(
                positive=[lemmatize_stemming(word)],
                topn=n_words + 1,
                indexer=self.annoy_index)
            others = hits[1:]
            print([lemmatize_stemming(hit[0]) for hit in others])
            return [hit[0] for hit in others]
        except KeyError:
            return 'Wrong word'

    def guess(self, words, n_words):
        """Return ``n_words`` words similar to the in-vocabulary ``words``.

        With exactly one input word, one extra result is requested and the
        first hit is dropped. A ``ValueError`` yields ``'Wrong word'``.
        """
        try:
            # Only the single-word case requests and skips an extra hit.
            skip = 1 if len(words) == 1 else 0
            hits = self.model.wv.most_similar(
                positive=[
                    lemmatize_stemming(candidate) for candidate in words
                    if lemmatize_stemming(candidate) in self.vocab
                ],
                topn=n_words + skip,
                indexer=self.annoy_index)
            return [hit[0] for hit in hits[skip:]]
        except ValueError:
            return 'Wrong word'
Beispiel #34
0
    def testSaveLoad(self):
        """Round-trip ``self.index`` through the file ``'index'`` and compare."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = restored = AnnoyIndexer()
        restored.load('index')
        restored.model = self.model

        source = self.index
        self.assertEqual(source.index.f, restored.index.f)
        self.assertEqual(source.labels, restored.labels)
        self.assertEqual(source.num_trees, restored.num_trees)
Beispiel #35
0
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Checks AnnoyIndexer queries and persistence against a small Word2Vec model."""

    def setUp(self):
        """Skip without annoy; otherwise train the model and build a 10-tree index."""
        try:
            import annoy  # noqa: F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        # The score is a float coming from an approximate index, so compare
        # with a tolerance instead of exact equality.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same words in order."""
        approx_neighbors = self.model.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving writes both ``index`` and ``index.d`` in the working directory."""
        self.index.save('index')
        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        """Loading a missing index raises IOError."""
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        """A saved index reloads with the same ``f``, labels and tree count."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
    def testSaveLoad(self):
        """Round-trip ``self.index`` through a temp file and compare metadata."""
        from gensim.similarities.index import AnnoyIndexer

        target = get_tmpfile('gensim_similarities.tst.pkl')
        self.index.save(target)

        self.index2 = restored = AnnoyIndexer()
        restored.load(target)
        restored.model = self.model

        source = self.index
        self.assertEqual(source.index.f, restored.index.f)
        self.assertEqual(source.labels, restored.labels)
        self.assertEqual(source.num_trees, restored.num_trees)