コード例 #1
0
def load_annoy(annoypath, model):
    """Return an Annoy indexer for ``model``, building and saving it if needed.

    When ``annoypath`` does not exist, a new index is built from ``model``,
    timed, and saved to ``annoypath``.  Otherwise the index is loaded from
    ``annoypath`` and ``model`` is attached to it.

    :param annoypath: Path checked with ``os.path.exists`` and passed to
        ``AnnoyIndexer.save`` / ``AnnoyIndexer.load``.
    :param model: Model handed to ``AnnoyIndexer``.
    :type model: Word2Vec
    :return: The built or loaded indexer.
    :rtype: AnnoyIndexer
    """
    if not os.path.exists(annoypath):
        print("开始构建annoy索引:当前时间 : " +
              time.asctime(time.localtime(time.time())))
        build_started = time.time()
        aindex = AnnoyIndexer(model, 200)
        print("构建索引完毕 %.2f secs" % (time.time() - build_started))
        # Save the annoy index.
        print("开始保存annoy索引")
        save_started = time.time()
        aindex.save(annoypath)
        print("保存索引完毕 %.2f secs" % (time.time() - save_started))
    else:
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        # AnnoyIndexer() is created here without a model, so attach the
        # caller's model explicitly; otherwise the returned indexer differs
        # from the one produced by the build branch.
        aindex.model = model
    return aindex
コード例 #2
0
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Exercise AnnoyIndexer when it is built from a Doc2Vec model."""

    def setUp(self):
        # Skip the whole test case when the optional annoy package is missing.
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]

        self.assertEqual(doc, 0)
        # The similarity is a float computed by the index; comparing it to
        # 1.0 with exact equality is brittle, so allow a small tolerance.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        # Query the same vector with and without the Annoy indexer and
        # compare only the returned labels (first item of each result).
        approx_neighbors = self.model.docvecs.most_similar([self.vector],
                                                           topn=5,
                                                           indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_docs = [neighbor[0] for neighbor in approx_neighbors]
        exact_docs = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_docs, exact_docs)

    def testSave(self):
        # Saving is expected to produce both ``fname`` and ``fname + '.d'``.
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        # An empty indexer is created first, then populated from disk; the
        # model has to be re-attached by hand after loading.
        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
コード例 #3
0
ファイル: test_similarities.py プロジェクト: JKamlah/gensim
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Check AnnoyIndexer behaviour on top of a small Doc2Vec model."""

    def setUp(self):
        # The annoy package is optional: skip instead of failing without it.
        try:
            import annoy
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]

        self.assertEqual(doc, 0)
        # Exact float equality against 1.0 is brittle for a value computed
        # by the index, so compare with a tolerance instead.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        # Same query with and without the indexer; only labels are compared.
        approx_neighbors = self.model.docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        approx_docs = [neighbor[0] for neighbor in approx_neighbors]
        exact_docs = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_docs, exact_docs)

    def testSave(self):
        # Saving is expected to create ``fname`` and ``fname + '.d'``.
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        # Load into a fresh, empty indexer and re-attach the model by hand.
        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
コード例 #4
0
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Exercise AnnoyIndexer when it is built from a Word2Vec model."""

    def setUp(self):
        # Skip the whole test case when the optional annoy package is missing.
        try:
            import annoy
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.wv.syn0norm[0]

    def _index_path(self):
        # Return a save path inside a fresh temporary directory that is
        # removed when the test finishes, so no 'index' / 'index.d' files
        # are left behind in the current working directory.
        import shutil
        import tempfile

        tmpdir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        return os.path.join(tmpdir, 'index')

    def testVectorIsSimilarToItself(self):
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        # The similarity is a float computed by the index; comparing it to
        # 1.0 with exact equality is brittle, so allow a small tolerance.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        # Query the same vector with and without the Annoy indexer and
        # compare only the returned words (first item of each result).
        approx_neighbors = self.model.most_similar([self.vector],
                                                   topn=5,
                                                   indexer=self.index)
        exact_neighbors = self.model.most_similar(positive=[self.vector],
                                                  topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        # Saving is expected to produce both ``fname`` and ``fname + '.d'``.
        fname = self._index_path()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        fname = self._index_path()
        self.index.save(fname)

        # Load into a fresh, empty indexer and re-attach the model by hand.
        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
コード例 #5
0
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Check AnnoyIndexer behaviour on top of a small Word2Vec model."""

    def setUp(self):
        # The annoy package is optional: skip instead of failing without it.
        try:
            import annoy
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def _index_path(self):
        # Return a save path inside a fresh temporary directory that is
        # removed when the test finishes, so no 'index' / 'index.d' files
        # are left behind in the current working directory.
        import shutil
        import tempfile

        tmpdir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        return os.path.join(tmpdir, 'index')

    def testVectorIsSimilarToItself(self):
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        # Exact float equality against 1.0 is brittle for a value computed
        # by the index, so compare with a tolerance instead.
        self.assertAlmostEqual(similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        # Same query with and without the indexer; only words are compared.
        approx_neighbors = self.model.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]

        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        # Saving is expected to create ``fname`` and ``fname + '.d'``.
        fname = self._index_path()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()

        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer

        fname = self._index_path()
        self.index.save(fname)

        # Load into a fresh, empty indexer and re-attach the model by hand.
        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
コード例 #6
0
def get_annoy(w2v, embedding_type='w2v'):
    """Return an Annoy indexer for ``w2v``, cached on disk under ``data_dir``.

    The cache file name is derived from ``dims``, ``embedding_type`` and the
    vocabulary size ``len(w2v.vocab)``.  If that file exists the index is
    loaded and ``w2v`` is attached to it; otherwise a new index is built from
    ``w2v`` and saved.

    :param w2v: Object exposing ``.vocab`` that is handed to ``AnnoyIndexer``.
    :param embedding_type: Label embedded in the cache file name.
    :return: The loaded or newly built ``AnnoyIndexer``.
    """
    # NOTE(review): this value is passed as AnnoyIndexer's second argument,
    # which gensim documents as the number of trees - confirm the naming.
    dims = 100
    annoy_file_name = (data_dir + '/annoy_index_' + '_' + str(dims) + '_'
                       + embedding_type + '_' + str(len(w2v.vocab)))
    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        # Use the function's own argument; the previous code referred to a
        # name (`word_vectors`) that is not defined inside this function.
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
コード例 #7
0
 def index_vector(self, dimensions=300, save=False):
     """Return the Annoy index used by 'is_word_pairs_similar'.

     The index is loaded from ``preprocessed/annoy.index`` (resolved against
     the parent of the current working directory) when that file exists;
     otherwise it is built from ``self.embedding`` and, if ``save`` is true,
     written to that path.

     The original author notes that execution through this index may be
     slower than with the normal index.
     """
     index_file = Path.cwd().parent.joinpath('preprocessed/annoy.index')

     if not index_file.exists():
         # No cached index on disk: build one, optionally persisting it.
         built = AnnoyIndexer(self.embedding, dimensions)
         if save:
             built.save(str(index_file))
         return built

     # Cached index found: load it and attach the embedding by hand.
     loaded = AnnoyIndexer()
     loaded.load(str(index_file))
     loaded.model = self.embedding
     return loaded
コード例 #8
0
def train_item2vec(df=None, sessions=None, samples=None):
    """Train or update the item2vec model, save it, and save an Annoy index.

    The corpus is ``samples`` when given, otherwise a ``RoomsGenerator`` built
    from ``df`` and ``sessions``.  If a saved model already exists its
    vocabulary is updated and training continues; otherwise a new ``Word2Vec``
    model is trained with the settings read from ``st``.

    :raises NotImplementedError: when both ``df`` and ``samples`` are None.
    """
    if df is None and samples is None:
        raise NotImplementedError(
            ">>> Must be specific no items. Can not set `df` and `samples` to None"  # noqa
        )

    corpus = RoomsGenerator(df, sessions) if samples is None else samples

    started_at = time.time()
    model_path = os.path.join(st.BASE_MODEL,
                              "{}.model".format(st.ITEM2VEC_KEY))

    if not os.path.exists(model_path):
        # No saved model yet: train from scratch.
        model = Word2Vec(
            corpus,
            sg=st.SG,
            size=st.I2V_DIM,
            window=st.WINDOWS,
            min_count=st.MIN_COUNT,
            workers=st.WORKERS,
            iter=st.EPOCHS,
            sample=st.SAMPLE,
            negative=st.NS,
            compute_loss=st.COMPUTE_LOSS,
            callbacks=[Timer(started_at)],
        )
    else:
        # Saved model found: extend its vocabulary and keep training.
        logging.info("Load pre-train model")
        model = Word2Vec.load(model_path)
        logging.info("Vocabulary before re-training: %d", len(model.wv.vocab))

        model.build_vocab(corpus, update=True)
        logging.info("Vocabulary after re-training: %d", len(model.wv.vocab))
        model.train(
            corpus,
            total_examples=model.corpus_count,
            epochs=model.iter,
            callbacks=(),
        )
        logging.info("Pre-train model took %d's'", time.time() - started_at)

    logging.info("Saving item2vec model")
    model.save(model_path)

    logging.info("Build annoy index for item2vec model")
    index_path = os.path.join(st.BASE_MODEL,
                              "{}.model".format(st.ANNOY_INDEX_KEY))
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(index_path)
コード例 #9
0
ファイル: wordcalc.py プロジェクト: juvu3/WordCalc
class WordCalc:
    """Word-vector calculator that can answer queries with or without Annoy.

    ``status`` tracks how far loading has progressed:
        0: not trained
        1: the gensim version is being trained
        2: the gensim version is available
        3: the annoy version is being trained
        4: the annoy version is available
    """

    def __init__(self):
        self.status = 0
        self._notify("wordcalc出仓状态良好")

    def _notify(self, text):
        # Wrap ``text`` as RTX markdown and push it.
        push.push_to_rtx(push.generate_rtx_markdown(text))

    def train_with_gensim(self):
        """Load the word vectors from the text file and mark status 2."""
        self.status = 1
        self._notify("gensim转子引擎开始加热")
        self.tc_wv_model = KeyedVectors.load_word2vec_format(
            './Tencent_AILab_ChineseEmbedding.txt', binary=False)
        self._notify("gensim转子引擎加热完毕")
        self.status = 2

    def train_with_annoy(self):
        """Build and save the Annoy index for the loaded vectors (status 4)."""
        self.status = 3
        self._notify("annoy向量空间开始注水")
        self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
        fname = 'tc_index_genoy.index'
        self.annoy_index.save(fname)
        # The training result is exported here; later it can simply be
        # loaded instead of rebuilt:
        # annoy_index = AnnoyIndexer()
        # annoy_index.load(fname)
        # annoy_index.model = tc_wv_model
        self._notify("annoy向量空间注水完毕")
        self.status = 4

    def calc(self, positive_set, negative_set):
        """Return the top-10 ``most_similar`` results, or [] if not ready.

        The Annoy index is used only once status 4 is reached; in states 2
        and 3 the plain model lookup is used.
        """
        if self.status == 4:
            return self.tc_wv_model.most_similar(positive=positive_set,
                                                 negative=negative_set,
                                                 indexer=self.annoy_index,
                                                 topn=10)
        if self.status in (2, 3):
            return self.tc_wv_model.most_similar(positive=positive_set,
                                                 negative=negative_set,
                                                 topn=10)
        return []
コード例 #10
0
def main():
    """Train FastText and Word2Vec embeddings and save them with an Annoy index.

    Reads ``--config_file`` (an INI file), wipes and recreates the configured
    ``model_dir``, then trains both models on the configured corpus and saves
    ``ft_model``, ``annoy_model`` and ``w2v_model`` inside that directory.
    """
    parser = argparse.ArgumentParser(description='Trains word embeddings')
    parser.add_argument('--config_file',
                        type=str,
                        default='configs/echoes_local.config',
                        help='location of the configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)

    word_cfg = config['word']
    model_dir = word_cfg['model_dir']
    print(model_dir)

    sentences = Sentences(input_file=config['general']['corpus_file'])

    # Start from an empty output directory.
    try:
        shutil.rmtree(model_dir)
    except FileNotFoundError:
        pass
    # makedirs (unlike mkdir) also creates missing parent directories, so a
    # nested model_dir works on a fresh checkout.
    os.makedirs(model_dir)

    # Both models are trained with the same hyper-parameters.
    hyperparams = dict(
        size=int(word_cfg['size']),
        window=int(word_cfg['window']),
        min_count=int(word_cfg['min_count']),
        iter=int(word_cfg['epochs']),
        workers=int(word_cfg['workers']),
    )

    logging.info('Building fasttext model...')
    model = FastText(sentences, **hyperparams)
    model.init_sims()
    model.save(f"{model_dir}/ft_model")
    logging.info("Saved fasttext model under %s", model_dir)

    logging.info('Building word2vec model...')
    model = Word2Vec(sentences, **hyperparams)
    model.init_sims()
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(f"{model_dir}/annoy_model")
    model.save(f"{model_dir}/w2v_model")
    logging.info("Saved word2vec model under %s", model_dir)
コード例 #11
0
def build_index():
    """Build and save an Annoy index for every vector file under ``vecs``.

    Each directory in ``vecs`` is walked recursively.  For every file found,
    the word2vec text file is loaded and its index is written to the mirrored
    location where 'vec_new' in the path is replaced by 'ind' and the file
    name is cut at its first dot and given the '.ind' suffix.
    """
    # Earlier variant, kept for reference:
    # for v, i in zip(vecs, indx):
    #     model = gensim.models.word2vec.Word2Vec.load_word2vec_format(v, binary=False)
    #     index = AnnoyIndexer(model, 100)
    #     index.save(i)
    for vec in vecs:
        # os.walk yields three values: 1. the parent directory, 2. the names
        # of all sub-directories (without path), 3. the names of all files.
        for parent, dirnames, filenames in os.walk(vec):
            target_dir = parent.replace('vec_new', 'ind')
            for filename in filenames:
                origin_path = os.path.join(parent, filename)
                # Cut at the first dot of the FILE NAME only.  Splitting the
                # whole path breaks as soon as a directory contains a dot
                # (e.g. './vec_new/a.vec' used to become '.ind').
                stem = filename.replace('vec_new', 'ind').split('.')[0]
                target_path = os.path.join(target_dir, stem + '.ind')
                logger.info('origin path:' + origin_path)
                logger.info('target path:' + target_path)
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
                    origin_path, binary=False)
                index = AnnoyIndexer(model, 100)
                index.save(target_path)
コード例 #12
0
ファイル: similarity.py プロジェクト: zhaoqinghai/harvester
def train():
    """Train a Doc2Vec model from the tax CSV and save it with an Annoy index.

    Every CSV row is printed, its second column is tokenised with
    ``transform_text`` and tagged with its first column.  The model is saved
    to '../models/doc2vec.model' and the index to '../models/dv_index'.
    """
    documents = []

    with open('/home/ycw/tax_data.csv', 'r') as csv_file:
        for row in csv.reader(csv_file, dialect='excel', delimiter=','):
            print(row)
            tokens = transform_text(row[1].strip(), strip=False)
            documents.append(doc2vec.LabeledSentence(tokens, [row[0]]))

    model = Doc2Vec(
        documents,
        dm=1,
        size=DIMENSION,
        window=5,
        negative=5,
        min_count=2,
        workers=4,
    )
    model.save('../models/doc2vec.model')

    AnnoyIndexer(model, 2).save('../models/dv_index')
コード例 #13
0
def build_annoy(w2v):
    """Build an Annoy index for ``w2v`` and write it to ``annoy_file``.

    Progress is reported through ``info`` before building and before saving.
    """
    info('building index')
    indexer = AnnoyIndexer(w2v, 500)

    info('saving index')
    indexer.save(annoy_file)
コード例 #14
0
#    for more details.
#

###############################################################################
# 5. Persisting indices to disk
# -----------------------------
#
# You can save and load your indexes from/to disk to prevent having to
# construct them each time. This will create two files on disk, *fname* and
# *fname.d*. Both files are needed to correctly restore all attributes. Before
# loading an index, you will have to create an empty AnnoyIndexer object.
#
# Location of the saved index; per the section text above, a companion
# file named fname + '.d' is written next to it.
fname = '/tmp/mymodel.index'

# Persist index to disk
annoy_index.save(fname)

# Load index back
import os.path
# NOTE(review): annoy_index2 is only bound when fname exists; the query
# below would raise NameError otherwise. It was saved just above, so this
# guard is expected to pass.
if os.path.exists(fname):
    # Start from an empty indexer, fill it from disk, then re-attach the
    # model by hand.
    annoy_index2 = AnnoyIndexer()
    annoy_index2.load(fname)
    annoy_index2.model = model

# Results should be identical to above
vector = model.wv["science"]
# Query through the reloaded indexer and print every returned neighbour.
approximate_neighbors2 = model.wv.most_similar([vector],
                                               topn=11,
                                               indexer=annoy_index2)
for neighbor in approximate_neighbors2:
    print(neighbor)
コード例 #15
0
#!/usr/bin/env python3
import sys
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer

# Exactly two arguments are expected after the script name: the source
# word2vec-format model and the destination path of the Annoy index.
try:
    _, srcmdl, dstmdl = sys.argv
except ValueError:
    sys.stderr.write("local/prepare_words_embedding.py <src-mdl> <dest-mdl>\n")
    sys.exit(1)

# Load the vectors, index them with Annoy, and write the index out.
model = KeyedVectors.load_word2vec_format(srcmdl)
annoy_index = AnnoyIndexer(model, 200)
annoy_index.save(dstmdl)
コード例 #16
0
ファイル: cluster.py プロジェクト: rameshb/email-alerts
#from gensim.models.keyedvectors import KeyedVectors
import gensim

# Load a previously saved Word2Vec model.
# NOTE(review): `file` is not assigned anywhere in the visible code - confirm
# where the model path is supposed to come from.
model = gensim.models.Word2Vec.load(file)
#model = gensim.models.KeyedVectors.load_word2vec_format(file, binary=True)

# NOTE(review): this overrides the loaded model's vector_size before the
# index is built - confirm 1000 really matches the stored vectors.
model.vector_size = 1000
from gensim.similarities.index import AnnoyIndexer
# 100 trees are being used in this example
annoy_index = AnnoyIndexer(model,100)


# File name the index is written to (relative to the working directory).
fname = 'index.ann'

# Persist index to disk
annoy_index.save(fname)

# Load index back
#if os.path.exists(fname):
#   annoy_index2 = AnnoyIndexer()
#   annoy_index2.load(fname)
#   annoy_index2.model = model
#

# Derive the vector for the word "science" in our model
#vector = model["science"]
# The AnnoyIndexer instance created above is passed as the `indexer` argument
#approximate_neighbors = model.most_similar([vector], topn=5, indexer=annoy_index)
# Neatly print the approximate_neighbors and their corresponding cosine similarity values
#for neighbor in approximate_neighbors:
#  print(neighbor)
コード例 #17
0
    return ans


if __name__ == '__main__':
    data = api.load("20-newsgroups", return_path=False)

    # Step 1: split every document's text into sentences.
    start = time()
    all_sentences = [
        sentence
        for text in data
        for sentence in text2sentences(text['data'])
    ]
    print('Text to sentenses step complited for {}s'.format(time()-start))

    # Step 2: turn each sentence into tokens.
    start = time()
    preprocessed_sentences = [preprocess(sentence) for sentence in all_sentences]
    print('Sentenses to tokens step complited for {}s'.format(time() - start))

    # Step 3: build the vocabulary and train Word2Vec.
    print('Learning...')
    start = time()
    epoch_num = 60
    w2v_model = Word2Vec(
        min_count=20,
        window=5,
        size=100,
        sample=6e-5,
        alpha=0.03,
        min_alpha=0.0007,
        negative=20,
        workers=cpu_count(),
    )
    w2v_model.build_vocab(preprocessed_sentences, progress_per=1)
    w2v_model.train(
        preprocessed_sentences,
        total_examples=w2v_model.corpus_count,
        epochs=epoch_num,
        report_delay=1.0,
    )
    print('Learning complited for {}s'.format(time() - start))
    w2v_model.init_sims(replace=True)
    w2v_model.save('data/word2vec.model')

    # Step 4: build the Annoy index for the trained model and save it.
    start = time()
    print('Annoy indexing...')
    ann_model = AnnoyIndexer(w2v_model, 1000)
    ann_model.save('data/word2vec_idx.ann')
    print('Annoy indexing complited for {}s'.format(time() - start))
コード例 #18
0
class Doc2VecModel:
    """Doc2Vec wrapper offering brute-force, KD-tree and Annoy neighbour search.

    ``infer`` turns free text into a vector, finds the ``n`` nearest training
    documents with the chosen search method, and resolves the resulting ids to
    Wikipedia page titles.
    """

    BASE_WIKI_QUERY = "https://en.wikipedia.org/w/api.php?action=query&format=json&pageids="
    stopword_list = stopwords.words('english')

    def __init__(self, modelname):
        """Load the model stored at ``modelname`` or prepare a new one.

        :param modelname: Path given to ``Doc2Vec.load``.  When ``None`` a new
            model is created and its vocabulary is built from the training
            corpus.
        """
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        self.modelname = modelname
        if self.modelname is None:
            train_corpus = self._get_training_iterator()
            self.model = Doc2Vec(vector_size=100, min_count=5, workers=7)
            self.model.build_vocab(train_corpus, progress_per=10000)
        else:
            self.model = Doc2Vec.load(self.modelname)

        # Which search back ends are ready to use; brute force needs no setup.
        self.nns_method_init_dict = {
            NNSMethod.BRUTE: True,
            NNSMethod.KD_TREE: False,
            NNSMethod.ANNOY: False
        }
        # Path of the Annoy index currently held in self.annoy_indexer.
        self._annoy_index_path = None

    def infer_file(self, filename, n=10):
        """Run ``infer`` on the contents of ``filename`` (lines joined by ' ')."""
        with open(filename, 'r') as f:
            lines = f.readlines()
        lines = ' '.join(lines)
        return self.infer(lines, n)

    def infer(self,
              string,
              n=10,
              nnsmethod=NNSMethod.ANNOY,
              annoymodelpath="gensim_annoy"):
        """Return ``(titles, dists)`` for the ``n`` documents closest to ``string``.

        :param string: Raw text to embed.
        :param n: Number of neighbours to retrieve.
        :param nnsmethod: Search back end, a member of ``NNSMethod``.
        :param annoymodelpath: File used to load/save the Annoy index.
        """
        self._initialize_nns_method(nnsmethod, annoymodelpath)
        words = self._preprocess(string)
        # Set the random seed to make the inferred vector deterministic
        self.model.random = np.random.mtrand.RandomState(1337)
        inferred_vector = self.model.infer_vector(words)
        ids, dists = self._calculate_most_similar(inferred_vector, n,
                                                  nnsmethod)
        titles = self._get_title_from_pageids(ids)
        return titles, dists

    def train(self, epochs):
        """Train the model for ``epochs`` epochs and save it to ``modelname``.

        NOTE(review): when the instance was created with ``modelname=None``
        the save call receives ``None`` - confirm how that case is meant to
        be handled.
        """
        train_corpus = self._get_training_iterator()
        self.model.train(train_corpus,
                         total_examples=self.model.corpus_count,
                         epochs=epochs,
                         report_delay=10)
        self.model.save(self.modelname)

    def _initialize_nns_method(self, nnsmethod, annoymodelpath):
        """Prepare the data structure for ``nnsmethod`` once and remember it."""
        ready = self.nns_method_init_dict[nnsmethod]
        if nnsmethod == NNSMethod.ANNOY:
            # A request for a different index file must still be honoured.
            ready = ready and (
                getattr(self, '_annoy_index_path', None) == annoymodelpath)
        if ready:
            return

        if nnsmethod == NNSMethod.KD_TREE:
            print("Building KD tree..")
            self.tree = cKDTree(self.model.docvecs.vectors_docs)
            print("Finished building KD tree.")
            self.keys = list(self.model.docvecs.doctags.keys())
        elif nnsmethod == NNSMethod.ANNOY:
            if not os.path.isfile(annoymodelpath):
                print("Generating annoy index...")
                self.annoy_indexer = AnnoyIndexer(self.model, 50)
                print("Finished generating annoy index.")
                self.annoy_indexer.save(annoymodelpath)
            else:
                self.annoy_indexer = AnnoyIndexer()
                self.annoy_indexer.load(annoymodelpath)
                self.annoy_indexer.model = self.model
            self._annoy_index_path = annoymodelpath

        # Record completion; without this the tree/index was rebuilt or
        # reloaded on every single call.
        self.nns_method_init_dict[nnsmethod] = True

    def _calculate_most_similar(self, vector, n, nnsmethod):
        """Return ``(ids, dists)`` of the ``n`` nearest documents to ``vector``.

        The order matches how ``infer`` unpacks the result.  Previously the
        KD-tree branch returned the two lists the other way round.

        :raises ValueError: for an unsupported ``nnsmethod``.
        """
        # time.clock() was removed in Python 3.8; perf_counter replaces it.
        start_time = time.perf_counter()
        if nnsmethod == NNSMethod.BRUTE:
            tops = self.model.docvecs.most_similar([vector], topn=n)
            # Each result is treated as an (id, distance/similarity) pair.
            ids, dists = [t[0] for t in tops], [t[1] for t in tops]
        elif nnsmethod == NNSMethod.KD_TREE:
            dists, positions = self.tree.query(vector, k=n)
            # Map row positions in the tree back to document tags.
            ids = [self.keys[i] for i in positions]
        elif nnsmethod == NNSMethod.ANNOY:
            tops = self.model.docvecs.most_similar([vector],
                                                   topn=n,
                                                   indexer=self.annoy_indexer)
            ids, dists = [t[0] for t in tops], [t[1] for t in tops]
        else:
            raise ValueError(f"Unsupported search method: {nnsmethod}")
        print(f"Time using {nnsmethod} - {time.perf_counter() - start_time}")
        return ids, dists

    def _preprocess(self, string):
        """Lower-case, strip non-letters, tokenise and drop stop words."""
        string = string.lower()
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        string = re.sub(r'[^a-z\s]+', '', string)
        words = nltk.word_tokenize(string)
        return [word for word in words if word not in self.stopword_list]

    def _get_training_iterator(self):
        """Return a ``TaggedDocumentGenerator`` over the wiki_* training files."""
        home = os.path.expanduser("~")
        path = os.path.join(
            home, "Documents",
            "text")  # Data is assumed to be in ~/Documents/text
        files = glob.glob(os.path.join(path, "**/wiki_*"), recursive=True)
        return TaggedDocumentGenerator(files)

    def _get_title_from_pageids(self, ids):
        """Look up page titles for ``ids`` through the Wikipedia query API.

        Pages without a title are reported as ``"PageId: <id>"``.
        """
        ids = '|'.join(ids)
        query = self.BASE_WIKI_QUERY + ids
        # Close the HTTP response deterministically once it has been read.
        with urlopen(query) as response:
            dic = json.loads(response.read())
        return [
            v['title'] if 'title' in v else "PageId: " + str(v['pageid'])
            for v in dic['query']['pages'].values()
        ]
コード例 #19
0
        f.write(line[0][0] + ' ' + ' '.join([str(vec)
                                             for vec in line[1]]) + '\n')
# Append the relation embeddings to ./relation2vec.txt: one header line with
# the two shape values, then one line per relation (name followed by its
# vector components, space separated).
with codecs.open('./relation2vec.txt', 'a+', encoding='utf-8') as f:
    header = str(relation2vec.shape[0]) + ' ' + str(relation2vec.shape[1])
    f.write(header + '\n')
    for line in zip(relation2id.items(), relation2vec):
        name = line[0][0]
        components = ' '.join(map(str, line[1]))
        f.write(name + ' ' + components + '\n')

# Entities: convert the text-format vectors to word2vec binary format.
wv_ent = KeyedVectors.load_word2vec_format('./entity2vec.txt', binary=False)
wv_ent.save_word2vec_format('./entity2vec.bin', binary=True)
# Entities: reload from the binary file, build the Annoy index
# (second argument 200) and save it.
wv_ent = KeyedVectors.load_word2vec_format('./entity2vec.bin', binary=True)
annoy_index_ent = AnnoyIndexer(wv_ent, 200)
annoy_index_ent.save('./entity2vec.index')
# Relations: same steps as for entities (text -> binary -> Annoy index).
wv_rel = KeyedVectors.load_word2vec_format('./relation2vec.txt', binary=False)
wv_rel.save_word2vec_format('./relation2vec.bin', binary=True)
wv_rel = KeyedVectors.load_word2vec_format('./relation2vec.bin', binary=True)
annoy_index_rel = AnnoyIndexer(wv_rel, 200)
annoy_index_rel.save('./relation2vec.index')


# tsne-plot
def tsne_vis(X, labels, name):
    tsne = TSNE(n_components=2).fit_transform(X)
    plt.figure(figsize=(50, 50))
    for i, label in enumerate(labels):
        x, y = tsne[i, :]
        plt.scatter(x, y)
コード例 #20
0
ファイル: setup.py プロジェクト: ImperialSquid/MajorProject
# log.info("Done")
# log.info("Saving as binary...")
# word_model.save("word2vec-gnews-300.bin")
# log.info("Done")
# log.info("Loading binary model")
# word_model = KeyedVectors.load(
#     r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440\MajorProject\word2vec-gnews-300.bin")
# log.info("Done!")

# for filename in os.listdir(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440\MajorProject\models"):
#     if filename[-4:] == ".bin" and filename != "word2vec-gnews-300.bin":
#         log.info("Loading {0}".format(filename))
#         wm = KeyedVectors.load(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3" +
#                                r"\CS39440\MajorProject\models\{0}".format(filename))
#         log.info("Loaded, preprocessing L2 norms...")
#         wm.init_sims(replace=True)
#         log.info("Preprocessed, saving")
#         wm.save(filename)

# Build an Annoy index (second argument 5) for every .bin model in the
# models directory, except word2vec-gnews-300.bin.
for filename in os.listdir(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440\MajorProject\models"):
    if not filename.endswith(".bin") or filename == "word2vec-gnews-300.bin":
        continue

    log.info("Loading {0}".format(filename))
    wm = KeyedVectors.load(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3" +
                           r"\CS39440\MajorProject\models\{0}".format(filename))

    log.info("Building annoy indexer")
    indexer = AnnoyIndexer(wm, 5)

    log.info("Saving")
    # The output name uses the part of the file name before its first dot.
    indexer.save(r"models\{0}-5-trees.ann".format(filename.partition(".")[0]))

log.info("All Done")
コード例 #21
0
        epochs=args.epochs,
        hs=args.hs,
        negative=args.negative,
        ns_exponent=args.ns_exponent,
        dbow_words=args.dbow_words,
        workers=multiprocessing.cpu_count()
    )
    end_time = time.perf_counter()
    logging.info('It took {} ms to train word2vec model.'.format(end_time - start_time))
    logging.info('Saving model ...')

    model.save('./models/doc2vec/d2v_{}_d{}_win{}_mc{}_hs{}.bin'.format(
        'dbow' if args.dm == 0 else 'dm',
        args.vector_dim,
        args.window,
        args.min_count,
        args.hs
    ))

    logging.info('Creating Annoy index ...')
    index = AnnoyIndexer(model, 300)

    logging.info('Saving index ...')
    index.save('./models/doc2vec/d2v_{}_d{}_win{}_mc{}_hs{}.idx'.format(
        'dbow' if args.dm == 0 else 'dm',
        args.vector_dim,
        args.window,
        args.min_count,
        args.hs
    ))