def testTransformSerialized(self):
        """Train on a serialized corpus and check the topic distribution of 'jill'.

        Counterpart of testTransform that uses serialized corpora. Training
        sometimes gets stuck at a local minimum, so the model is re-trained
        from scratch (at most 25 attempts), hoping for a better random
        initialization.
        """
        # NOTE: this test may easily fail if the author-topic model is altered in any way. The
        # output is sensitive to many things, such as the scheduling of the updates or the
        # author2id mapping (the random initialization changes when author2id changes). If it
        # does fail, check whether something is really broken or whether the model output just
        # shifted slightly.
        target = sorted([0.91, 0.08])
        passed = False
        for attempt in range(25):
            serialization_path = datapath('testcorpus_serialization.mm')
            # create the transformation model
            model = self.class_(
                id2word=dictionary,
                num_topics=2,
                passes=100,
                random_state=0,
                serialized=True,
                serialization_path=serialization_path
            )
            model.update(self.corpus, author2doc)

            # a dense vector makes the equality test easier
            dense = matutils.sparse2full(model.get_author_topics('jill'), 2)
            observed = sorted(dense)
            # must contain the same values, up to re-ordering
            passed = np.allclose(observed, target, atol=1e-1)

            # Delete the MmCorpus used for serialization inside the author-topic model.
            remove(serialization_path)
            if not passed:
                logging.warning(
                    "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
                    attempt, observed, target
                )
                continue
            break
        self.assertTrue(passed)
    def testLoadOldModel(self):
        """Load fasttext models saved by a previous version and check their array shapes."""

        # the second fixture is the model stored in multiple files
        for model_file in ('fasttext_old', 'fasttext_old_sep'):
            model = FT_gensim.load(datapath(model_file))

            # word-level structures
            self.assertTrue(model.wv.vectors.shape == (12, 100))
            self.assertTrue(len(model.wv.vocab) == 12)
            self.assertTrue(len(model.wv.index2word) == 12)
            expected_syn1neg_shape = (len(model.wv.vocab), model.vector_size)
            self.assertTrue(model.syn1neg.shape == expected_syn1neg_shape)
            self.assertTrue(model.trainables.vectors_lockf.shape == (12, ))
            self.assertTrue(model.vocabulary.cum_table.shape == (12, ))

            # ngram-level structures
            self.assertEqual(len(model.wv.hash2index), 202)
            self.assertTrue(model.wv.vectors_vocab.shape == (12, 100))
            self.assertTrue(model.wv.vectors_ngrams.shape == (202, 100))
 def testLineSentenceWorksWithNormalFile(self):
     """Check that LineSentence accepts an open file object instead of a filename."""
     corpus_path = datapath('head500.noblanks.cor')
     # `reference` is read line by line next to the sentences produced from `fin`
     with utils.smart_open(corpus_path) as reference, utils.smart_open(corpus_path) as fin:
         for tokens in word2vec.LineSentence(fin):
             expected_tokens = utils.to_unicode(reference.readline()).split()
             self.assertEqual(tokens, expected_tokens)
    def testLoadOldModel(self):
        """Load word2vec models saved by previous versions and check they remain usable."""

        # the second fixture is the model stored in multiple files
        for model_file in ('word2vec_old', 'word2vec_old_sep'):
            model = word2vec.Word2Vec.load(datapath(model_file))
            self.assertTrue(model.wv.vectors.shape == (12, 100))
            self.assertTrue(len(model.wv.vocab) == 12)
            self.assertTrue(len(model.wv.index2word) == 12)
            expected_syn1neg_shape = (len(model.wv.vocab), model.wv.vector_size)
            self.assertTrue(model.syn1neg.shape == expected_syn1neg_shape)
            self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
            self.assertTrue(model.vocabulary.cum_table.shape == (12,))

            self.onlineSanity(model, trained_model=True)

        # load really old model
        really_old = word2vec.Word2Vec.load(datapath('w2v-lee-v0.12.0'))
        self.onlineSanity(really_old, trained_model=True)

        # test for max_final_vocab for model saved in 3.3
        model_3_3 = word2vec.Word2Vec.load(datapath('word2vec_3.3'))
        self.assertEqual(model_3_3.max_final_vocab, None)
        self.assertEqual(model_3_3.vocabulary.max_final_vocab, None)

        # Test loading word2vec models from all previous versions
        old_versions = (
            '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
            '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
            '1.0.0', '1.0.1',
            '2.0.0', '2.1.0', '2.2.0', '2.3.0',
            '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0',
        )
        path_template = datapath('old_w2v_models/w2v_{}.mdl')
        for version in old_versions:
            old_model = word2vec.Word2Vec.load(path_template.format(version))
            self.assertTrue(len(old_model.wv.vocab) == 3)
            self.assertTrue(old_model.wv.vectors.shape == (3, 4))
            # check if similarity search and online training works.
            self.assertTrue(len(old_model.wv.most_similar('sentence')) == 2)
            old_model.build_vocab(list_corpus, update=True)
            old_model.train(list_corpus, total_examples=old_model.corpus_count, epochs=old_model.iter)
            # check if online training still works after saving and loading back the model.
            tmpf = get_tmpfile('gensim_word2vec.tst')
            old_model.save(tmpf)
            reloaded = word2vec.Word2Vec.load(tmpf)
            reloaded.build_vocab(list_corpus, update=True)
            # the example count and epochs are taken from the model that was saved
            reloaded.train(list_corpus, total_examples=old_model.corpus_count, epochs=old_model.iter)
    def testSerialized(self):
        """Basic checks plus the update functionality, using serialized corpora."""

        model = self.class_(
            self.corpus,
            author2doc=author2doc,
            id2word=dictionary,
            num_topics=2,
            serialized=True,
            serialization_path=datapath('testcorpus_serialization.mm')
        )

        def dense_topics(author):
            # dense topic vector of one author in the current model state
            return matutils.sparse2full(model.get_author_topics(author), model.num_topics)

        jill_before = dense_topics('jill')
        self.assertTrue(all(jill_before > 0))

        model.update()
        jill_after = dense_topics('jill')

        # Did we learn more about Jill?
        self.assertFalse(all(np.equal(jill_before, jill_after)))

        model.update(corpus_new, author2doc_new)

        # Did we learn something about Sally?
        sally = dense_topics('sally')
        self.assertTrue(all(sally > 0))

        # Delete the MmCorpus used for serialization inside the author-topic model.
        remove(datapath('testcorpus_serialization.mm'))
def get_corpus():
    """Load the 'ldavowpalwabbit' test fixtures.

    Returns a ``(corpus, dictionary)`` pair: the dictionary is read from
    'ldavowpalwabbit.dict.txt', and the corpus holds one ``doc2bow`` result
    per line of 'ldavowpalwabbit.txt' (each line stripped and split on
    whitespace).
    """
    documents_path = datapath('ldavowpalwabbit.txt')
    vocabulary_path = datapath('ldavowpalwabbit.dict.txt')
    dictionary = Dictionary.load_from_text(vocabulary_path)
    corpus = []
    with open(documents_path) as fhandle:
        for raw_line in fhandle:
            tokens = raw_line.strip().split()
            corpus.append(dictionary.doc2bow(tokens))
    return corpus, dictionary
Exemple #7
0
    def testLoadOldModel(self):
        """Load doc2vec models saved by a previous version and check their array shapes."""

        # the second fixture is the model stored in multiple files
        for model_file in ('doc2vec_old', 'doc2vec_old_sep'):
            model = doc2vec.Doc2Vec.load(datapath(model_file))

            # word-level structures
            self.assertTrue(model.wv.vectors.shape == (3955, 100))
            self.assertTrue(len(model.wv.vocab) == 3955)
            self.assertTrue(len(model.wv.index2word) == 3955)
            expected_syn1neg_shape = (len(model.wv.vocab), model.vector_size)
            self.assertTrue(model.syn1neg.shape == expected_syn1neg_shape)
            self.assertTrue(model.trainables.vectors_lockf.shape == (3955, ))
            self.assertTrue(model.vocabulary.cum_table.shape == (3955, ))

            # document-level structures
            self.assertTrue(model.docvecs.vectors_docs.shape == (300, 100))
            self.assertTrue(model.trainables.vectors_docs_lockf.shape == (300, ))
            self.assertTrue(model.docvecs.max_rawint == 299)
            self.assertTrue(model.docvecs.count == 300)
 def setUp(self):
     """Resolve the fasttext binary path from $FT_HOME and load the pre-trained test model."""
     ft_home = os.environ.get('FT_HOME')
     if ft_home:
         self.ft_path = os.path.join(ft_home, 'fasttext')
     else:
         # no FT_HOME configured
         self.ft_path = None
     self.corpus_file = datapath('lee_background.cor')
     self.test_model_file = datapath('lee_fasttext')
     self.test_new_model_file = datapath('lee_fasttext_new')
     # The pre-trained model lets tests run even when the FastText binary
     # isn't available in the test environment.
     self.test_model = fasttext.FastText.load_fasttext_format(self.test_model_file)
Exemple #9
0
 def setUp(self):
     """Load the DTM test corpus and dictionary; skip the test when $DTM_PATH is unset."""
     self.time_slices = [3, 7]
     corpus_path = datapath('dtm_test.mm')
     self.corpus = corpora.mmcorpus.MmCorpus(corpus_path)
     dictionary_path = datapath('dtm_test.dict')
     self.id2word = corpora.Dictionary.load(dictionary_path)
     # the environment variable $DTM_PATH must point at the dtm executable file
     self.dtm_path = os.environ.get('DTM_PATH')
     if not self.dtm_path:
         self.skipTest("$DTM_PATH is not properly set up.")
    def setUp(self):
        """Read the training documents and load the source and target Doc2Vec models."""
        self.train_docs = read_sentiment_docs(datapath("alldata-id-10.txt"))

        self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
        self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")

        # both models are loaded from the fixture paths stored above
        self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
        self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
 def test_type_conversion(self):
     """Load vectors as float16, save them in binary form, reload as float64 and compare."""
     word = "horse.n.01"
     text_path = datapath('high_precision.kv.txt')
     binary_path = datapath('high_precision.kv.bin')

     half_model = KeyedVectors.load_word2vec_format(text_path, datatype=np.float16)
     half_model.save_word2vec_format(binary_path, binary=True)
     double_model = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True)

     # first component of the same word vector in each model
     half_value = half_model[word][0]
     double_value = double_model[word][0]
     self.assertAlmostEqual(half_value, np.float16(double_value))
     self.assertEqual(half_value.dtype, np.float16)
     self.assertEqual(double_value.dtype, np.float64)
Exemple #12
0
 def testPathLineSentences(self):
     """Check that PathLineSentences accepts a directory path argument."""
     corpus_dir = datapath('PathLineSentences')
     first_path = os.path.join(corpus_dir, '1.txt')
     second_path = os.path.join(corpus_dir, '2.txt.bz2')
     with utils.smart_open(first_path) as orig1, utils.smart_open(second_path) as orig2:
         sentences = word2vec.PathLineSentences(corpus_dir)
         reference_lines = orig1.readlines() + orig2.readlines()
         # walk the reference lines in step with the sentences that are yielded
         for position, words in enumerate(sentences):
             self.assertEqual(words, utils.to_unicode(reference_lines[position]).split())
    def test_encoding_handling(self):
        """Check that both a cp852 file and a utf8 file yield the expected relations."""
        # non-utf8 input needs the encoding passed explicitly
        cp852_relations = list(PoincareRelations(datapath('poincare_cp852.tsv'), encoding='cp852'))
        self.assertEqual(len(cp852_relations), 2)
        self.assertEqual(cp852_relations[0], (u'tímto', u'budeš'))

        # utf8 input is read without an encoding argument
        utf8_relations = list(PoincareRelations(datapath('poincare_utf8.tsv')))
        self.assertEqual(len(utf8_relations), 2)
        self.assertEqual(utf8_relations[0], (u'tímto', u'budeš'))
Exemple #14
0
 def testEvaluateWordPairs(self):
     """Check that Pearson/Spearman coefficients and the OOV ratio fall in sane ranges."""
     sentences = word2vec.LineSentence(datapath('head500.noblanks.cor.bz2'))
     model = word2vec.Word2Vec(sentences, min_count=3, iter=10)
     result = model.evaluate_word_pairs(datapath('wordsim353.tsv'))
     # result[0] and result[1] carry the coefficient first; result[2] is the OOV value
     pearson, spearman, oov = result[0][0], result[1][0], result[2]
     self.assertTrue(0.1 < pearson < 1.0)
     self.assertTrue(0.1 < spearman < 1.0)
     self.assertTrue(0.0 <= oov < 90.0)
Exemple #15
0
 def testModelCompatibilityWithPythonVersions(self):
     """Compare the 'ldamodel_python_2_7' and 'ldamodel_python_3_5' fixtures after loading."""
     model_2_7 = self.class_.load(datapath('ldamodel_python_2_7'))
     model_3_5 = self.class_.load(datapath('ldamodel_python_3_5'))

     self.assertEqual(model_2_7.num_topics, model_3_5.num_topics)
     self.assertTrue(np.allclose(model_2_7.expElogbeta, model_3_5.expElogbeta))

     # try projecting an empty vector
     empty_doc = []
     self.assertTrue(np.allclose(model_2_7[empty_doc], model_3_5[empty_doc]))

     # both models must map the same set of ids
     ids_2_7 = set(dict(model_2_7.id2word.iteritems()))
     ids_3_5 = set(dict(model_3_5.id2word.iteritems()))
     self.assertEqual(ids_2_7, ids_3_5)
    def test_persistence(self):
        """Save/load round-trips of TfidfModel, plus comparison against a stored model file."""
        # Test persistence without using `smartirs`
        fname = get_tmpfile('gensim_models.tst')
        saved = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        saved.save(fname)
        restored = tfidfmodel.TfidfModel.load(fname)
        self.assertTrue(saved.idfs == restored.idfs)
        # two corpus documents, then an empty vector
        for doc in (corpus[1], corpus[2], []):
            self.assertTrue(np.allclose(saved[doc], restored[doc]))

        # Test persistence with using `smartirs`
        fname = get_tmpfile('gensim_models_smartirs.tst')
        saved = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
        saved.save(fname)
        restored = tfidfmodel.TfidfModel.load(fname)
        self.assertTrue(saved.idfs == restored.idfs)
        for doc in (corpus[1], corpus[2], []):
            self.assertTrue(np.allclose(saved[doc], restored[doc]))

        # Test persistence between Gensim v3.2.0 and current model.
        fresh = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
        stored = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
        fresh_idfs = [fresh.idfs[key] for key in sorted(fresh.idfs)]
        stored_idfs = [stored.idfs[key] for key in sorted(stored.idfs)]
        self.assertTrue(np.allclose(fresh_idfs, stored_idfs))
        for doc in (corpus[1], corpus[2], []):
            self.assertTrue(np.allclose(fresh[doc], stored[doc]))

        # Test persistence with using pivoted normalization
        fname = get_tmpfile('gensim_models_smartirs.tst')
        saved = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
        saved.save(fname)
        restored = tfidfmodel.TfidfModel.load(fname, mmap=None)
        self.assertTrue(saved.idfs == restored.idfs)
        # the empty vector is not projected in the pivoted cases
        for doc in (corpus[1], corpus[2]):
            self.assertTrue(np.allclose(saved[doc], restored[doc]))

        # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
        fresh = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
        stored = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
        fresh_idfs = [fresh.idfs[key] for key in sorted(fresh.idfs)]
        stored_idfs = [stored.idfs[key] for key in sorted(stored.idfs)]
        self.assertTrue(np.allclose(fresh_idfs, stored_idfs))
        for doc in (corpus[1], corpus[2]):
            self.assertTrue(np.allclose(fresh[doc], stored[doc]))
    def test_ft_kv_backward_compat_w_360(self):
        """Load the 'ft_kv_3.6.0.model.gz' fixture through both KeyedVectors classes."""
        kv = EuclideanKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))
        ft_kv = FastTextKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))

        expected = ['trees', 'survey', 'system', 'graph', 'interface']

        # both loaders must give the same top-5 neighbours of "human"
        for vectors in (kv, ft_kv):
            neighbours = vectors.most_similar("human", topn=5)
            actual = [word for (word, similarity) in neighbours]
            self.assertEqual(actual, expected)
 def setUp(self):
     """Record fixture paths and train a Wordrank test model when $WR_HOME is set."""
     # an unset or empty WR_HOME both end up as None
     self.wr_path = os.environ.get('WR_HOME') or None
     self.corpus_file = datapath('lee.cor')
     self.out_name = 'testmodel'
     self.wr_file = datapath('test_glove.txt')
     if self.wr_path:
         self.test_model = wordrank.Wordrank.train(
             self.wr_path,
             self.corpus_file,
             self.out_name,
             iter=6,
             dump_period=5,
             period=5,
             np=4,
             cleanup_files=True
         )
    def testCompatibilty(self):
        """Check that the stored 3.6.0 Phraser and Phrases models still join 'graph_minors'."""
        phraser = Phraser.load(datapath("phraser-3.6.0.model"))
        phrases = Phrases.load(datapath("phrases-3.6.0.model"))

        sentence = ['trees', 'graph', 'minors']
        expected = ['trees', 'graph_minors']

        # apply both models before asserting on either result
        phraser_result = phraser[sentence]
        phrases_result = phrases[sentence]

        self.assertEqual(phraser_result, expected)
        self.assertEqual(phrases_result, expected)
    def setUp(self):
        """Define the word pairs and load the source (EN) and target (IT) word vectors."""
        self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
        self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")

        # (source word, target word) pairs
        self.word_pairs = [
            ("one", "uno"),
            ("two", "due"),
            ("three", "tre"),
            ("four", "quattro"),
            ("five", "cinque"),
            ("seven", "sette"),
            ("eight", "otto"),
            ("dog", "cane"),
            ("pig", "maiale"),
            ("fish", "cavallo"),
            ("birds", "uccelli"),
            ("apple", "mela"),
            ("orange", "arancione"),
            ("grape", "acino"),
            ("banana", "banana"),
        ]

        self.test_word_pairs = [
            ("ten", "dieci"),
            ("cat", "gatto"),
        ]

        # both vector files are loaded in text (non-binary) format
        self.source_word_vec = KeyedVectors.load_word2vec_format(self.source_word_vec_file, binary=False)
        self.target_word_vec = KeyedVectors.load_word2vec_format(self.target_word_vec_file, binary=False)
 def setUp(self):
     """Load the test corpus, pick one document and build l1 and l2 norm models."""
     self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
     # Document [3] is the one normalized in the tests; it was chosen because it
     # gives different results for l1 and l2 norm.
     # doc is [(1, 1.0), (5, 2.0), (8, 1.0)]
     documents = list(self.corpus)
     self.doc = documents[3]
     self.model_l1 = normmodel.NormModel(self.corpus, norm='l1')
     self.model_l2 = normmodel.NormModel(self.corpus, norm='l2')
Exemple #22
0
 def test_load_model_with_non_ascii_vocab(self):
     """Check that a non-ascii word is in the loaded model and its vector can be read."""
     model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
     word = u'který'
     self.assertTrue(word in model)
     try:
         model[word]
     except UnicodeDecodeError:
         # report a decoding problem as a test failure rather than an error
         self.fail('Unable to access vector for utf8 encoded non-ascii word')
Exemple #23
0
 def test_load_model_non_utf8_encoding(self):
     """Check that a model loaded with encoding='cp852' exposes the expected word."""
     model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
     word = u'který'
     self.assertTrue(word in model)
     try:
         model[word]
     except KeyError:
         # report a missing key as a test failure rather than an error
         self.fail('Unable to access vector for cp-852 word')
Exemple #24
0
    def test_sg_neg_training(self):
        """Train with sg=1, hs=0, negative=5 and sanity-check the neighbours of 'night'."""

        model = FT_gensim(
            size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
            sorted_vocab=1, workers=1, min_alpha=0.0)

        sentences = LineSentence(datapath('lee_background.cor'))
        model.build_vocab(sentences)
        vector_before = np.copy(model.wv.vectors[0])
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        # vector should vary after training
        self.assertFalse((vector_before == model.wv.vectors[0]).all())

        neighbours = model.wv.most_similar('night', topn=10)
        neighbour_words = [word for (word, score) in neighbours]
        expected_words = [
            u'night.', u'night,', u'eight', u'overnight', u'overnight.',
            u'month', u'land', u'firm', u'singles', u'death',
        ]
        # at least two of the expected neighbours must be present
        shared = set(neighbour_words) & set(expected_words)
        self.assertGreaterEqual(len(shared), 2)
Exemple #25
0
 def testPathLineSentencesOneFile(self):
     """Check that PathLineSentences accepts a single file path argument."""
     single_file = os.path.join(datapath('PathLineSentences'), '1.txt')
     with utils.smart_open(single_file) as reference:
         # each yielded sentence must match the next raw line of the same file
         for tokens in word2vec.PathLineSentences(single_file):
             expected_tokens = utils.to_unicode(reference.readline()).split()
             self.assertEqual(tokens, expected_tokens)
Exemple #26
0
    def test_load(self):
        """Load the 'testcorpus' fixture for this corpus class and count its documents."""
        extension = self.file_extension.lstrip('.')
        loaded = self.corpus_class(datapath('testcorpus.' + extension))

        # the deerwester corpus always has nine documents
        self.assertEqual(len(list(loaded)), 9)
 def test_persistence_old_model(self):
     """Check that a model from an older gensim version is loaded correctly."""
     old_model = PoincareModel.load(datapath('poincare_test_3.4.0'))
     vectors = old_model.kv
     self.assertEqual(vectors.syn0.shape, (239, 2))
     self.assertEqual(len(vectors.vocab), 239)
     self.assertEqual(old_model.size, 2)
     self.assertEqual(len(old_model.all_relations), 200)
    def test_line2doc(self):
        """Run the inherited line2doc checks, then check line2doc with metadata=True."""
        # case with metadata=False (by default)
        super(TestMalletCorpus, self).test_line2doc()

        # case with metadata=True
        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
        corpus = self.corpus_class(fname, id2word={1: 'mom', 2: 'window'}, metadata=True)

        cases = (
            # use_wordids off: should return all words in doc
            (False, [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)]),
            # use_wordids on: should return words in word2id
            (True, [(1, 1), (2, 2)]),
        )
        for use_wordids, expected_doc in cases:
            corpus.use_wordids = use_wordids
            doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)

            self.assertEqual(docid, '#3')
            self.assertEqual(doclang, 'lang')
            self.assertEqual(sorted(doc), expected_doc)
Exemple #29
0
 def testSaveLoadNoCommonTerms(self):
     """Ensure backwards compatibility with old versions of Phrases, before common_terms."""
     legacy_bigram = Phrases.load(datapath("phrases-no-common-terms.pkl"))
     self.assertEqual(legacy_bigram.common_terms, frozenset())
     # can make a phraser, cf #1751: neither the construction nor the lookup may raise
     legacy_phraser = Phraser(legacy_bigram)
     sentence = ["human", "interface", "survey"]
     legacy_phraser[sentence]
Exemple #30
0
 def test_closed_file_object(self):
     """Check that MmCorpus does not close a file object handed to it.

     The handle is opened in a ``with`` block so that it is always released
     once the check is done, instead of being left open.
     """
     with open(datapath('testcorpus.mm')) as file_obj:
         closed_before = file_obj.closed
         mmcorpus.MmCorpus(file_obj)
         # sampled while the handle is still inside the ``with`` block
         closed_after = file_obj.closed
     self.assertFalse(closed_before)
     self.assertFalse(closed_after)
Exemple #31
0
# Java-side setup, run once at import time before any analysis below.
# NOTE(review): install_java/initialize are presumably the koalanlp helpers - confirm against the imports.
install_java()
# JVM options: 4 GB maximum heap (-Xmx4g) and UTF-8 file encoding; the keyword
# arguments pin the version requested for each named package.
initialize(java_options="-Xmx4g -Dfile.encoding=utf-8",
           KKMA="2.0.2",
           RHINO="2.0.5",
           EUNJEON="2.0.2",
           ETRI="2.0.2")


def line():
    """Print a horizontal rule of underscores surrounded by blank lines."""
    separator = "\n________________________\n\n"
    print(separator)


# NOTE(review): this message is printed before koala_bow() runs on the next line - confirm
# the cleaned words were really saved earlier in the file.
print("Saving cleaned words complete")
cleaned_koala_words = koala_bow()
# topic counts to train one model for each
topics = [3, 4, 5]
dictionary, vocab = split_train(cleaned_koala_words, 1)
# create model
print("Generating and saving LDA models")
for num_topic in topics:
    # line() prints a separator between the stages of each iteration
    line()
    ldamodel, corpus, id2word = create_lda(num_topic, dictionary)
    line()
    # one saved model file per topic count, named koala_model_<num_topic>
    temp_file = datapath("../models/koala_model_" + str(num_topic))
    ldamodel.save(temp_file)
    line()
    model_evaluate(ldamodel, dictionary, id2word, num_topic)
    line()
    lda_visualize(ldamodel, dictionary, num_topic, "koala")
    line()
Exemple #32
0
 def test_save_load_no_common_terms(self):
     """Ensure backwards compatibility with old versions of FrozenPhrases, before connector_words."""
     legacy_path = datapath("phraser-no-common-terms.pkl")
     legacy_model = FrozenPhrases.load(legacy_path)
     # the loaded legacy model must expose an empty connector_words set
     self.assertEqual(legacy_model.connector_words, frozenset())
Exemple #33
0
# Convert the mallet wrapper model (`ldamallet`) into a regular gensim LDA model
# so we could use it for visualization via pyLDAvis
mallet_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    ldamallet)

# using sort_topics=False will help us to synchronize the results by this visualization
# and the word clouds results
vis = pyLDAvis.gensim.prepare(mallet_lda_model,
                              corpus,
                              id2word,
                              sort_topics=False)
# the output file name embeds the number of topics, e.g. LDA_Mallet_<N>_topics.html
pyLDAvis.save_html(
    vis,
    'LDA_Mallet_' + str(len(mallet_lda_model.get_topics())) + '_topics.html')

# save the converted model to a file so that it can be loaded from other python files
temp_file = datapath("mallet_lda_model")
mallet_lda_model.save(temp_file)

# This LDA model is regular gensim, produced worse results than mallet
# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=14,
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            minimum_probability=0.0,
#                                            per_word_topics=True)
Exemple #34
0
# Update both LDA models with every item of `pd`.
# NOTE(review): `pd` is presumably an iterable of DataFrame-like chunks with a 'message'
# column, and `n_all` a text preprocessing function - confirm against their definitions.
for p in pd:
    processed_docs = p[u'message'].map(n_all)
    # a fresh dictionary is built for each item of `pd`
    dictionary = gensim.corpora.Dictionary(processed_docs)

    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # TF-IDF weighted view of the same bag-of-words corpus
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    # NOTE(review): confirm that update() accepts num_topics/id2word/passes for these model objects.
    lda_model.update(bow_corpus, num_topics=25, id2word=dictionary, passes=2)
    lda_model_tfidf.update(corpus_tfidf, num_topics=25, id2word=dictionary, passes=2)


# Save both models after the loop has finished.
temp_file = datapath("/data/06333/aroraish/models/ModEModelBOW")
lda_model.save(temp_file)

temp_file = datapath("/data/06333/aroraish/models/ModEModelTFIDF")
lda_model_tfidf.save(temp_file)

# Write every topic of the bag-of-words model to a text file.
with open("/data/06333/aroraish/outputs/lda_bag_of_words_overall.txt", 'w') as bw:

    for idx, topic in lda_model.print_topics(-1):
        # NOTE(review): on Python 3, formatting the encoded bytes writes a b'...' repr - confirm the target interpreter.
        bw.write('Topic: {} \nWords: {}\n\n'.format(idx, topic.encode('utf-8')))


# Write every topic of the TF-IDF model to a text file.
with open("/data/06333/aroraish/outputs/lda_tfidf_overall.txt", 'w') as tf:

    for idx, topic in lda_model_tfidf.print_topics(-1):
        tf.write('Topic: {} \nWord: {}\n\n'.format(idx, topic.encode('utf-8')))
Exemple #35
0
 def setUp(self):
     """Load the 'euclidean_vectors.bin' fixture (binary format) with float64 vectors."""
     vectors_path = datapath('euclidean_vectors.bin')
     self.vectors = EuclideanKeyedVectors.load_word2vec_format(
         vectors_path,
         binary=True,
         datatype=np.float64
     )
 def setUp(self):
     texts = [
         [
             u'senior', u'studios', u'studios', u'studios', u'creators',
             u'award', u'mobile', u'currently', u'challenges', u'senior',
             u'summary', u'senior', u'motivated', u'creative', u'senior'
         ],
         [
             u'performs', u'engineering', u'tasks', u'infrastructure',
             u'focusing', u'primarily', u'programming', u'interaction',
             u'designers', u'engineers', u'leadership', u'teams', u'teams',
             u'crews', u'responsibilities', u'engineering', u'quality',
             u'functional', u'functional', u'teams', u'organizing',
             u'prioritizing', u'technical', u'decisions', u'engineering',
             u'participates', u'participates', u'reviews', u'participates',
             u'hiring', u'conducting', u'interviews'
         ],
         [
             u'feedback', u'departments', u'define', u'focusing',
             u'engineering', u'teams', u'crews', u'facilitate',
             u'engineering', u'departments', u'deadlines', u'milestones',
             u'typically', u'spends', u'designing', u'developing',
             u'updating', u'bugs', u'mentoring', u'engineers', u'define',
             u'schedules', u'milestones', u'participating'
         ],
         [
             u'reviews', u'interviews', u'sized', u'teams', u'interacts',
             u'disciplines', u'knowledge', u'skills', u'knowledge',
             u'knowledge', u'xcode', u'scripting', u'debugging', u'skills',
             u'skills', u'knowledge', u'disciplines', u'animation',
             u'networking', u'expertise', u'competencies', u'oral',
             u'skills', u'management', u'skills', u'proven', u'effectively',
             u'teams', u'deadline', u'environment', u'bachelor', u'minimum',
             u'shipped', u'leadership', u'teams', u'location', u'resumes',
             u'jobs', u'candidates', u'openings', u'jobs'
         ],
         [
             u'maryland', u'client', u'producers', u'electricity',
             u'operates', u'storage', u'utility', u'retail', u'customers',
             u'engineering', u'consultant', u'maryland', u'summary',
             u'technical', u'technology', u'departments', u'expertise',
             u'maximizing', u'output', u'reduces', u'operating',
             u'participates', u'areas', u'engineering', u'conducts',
             u'testing', u'solve', u'supports', u'environmental',
             u'understands', u'objectives', u'operates',
             u'responsibilities', u'handles', u'complex', u'engineering',
             u'aspects', u'monitors', u'quality', u'proficiency',
             u'optimization', u'recommendations', u'supports', u'personnel',
             u'troubleshooting', u'commissioning', u'startup', u'shutdown',
             u'supports', u'procedure', u'operating', u'units', u'develops',
             u'simulations', u'troubleshooting', u'tests', u'enhancing',
             u'solving', u'develops', u'estimates', u'schedules', u'scopes',
             u'understands', u'technical', u'management', u'utilize',
             u'routine', u'conducts', u'hazards', u'utilizing', u'hazard',
             u'operability', u'methodologies', u'participates', u'startup',
             u'reviews', u'pssr', u'participate', u'teams', u'participate',
             u'regulatory', u'audits', u'define', u'scopes', u'budgets',
             u'schedules', u'technical', u'management', u'environmental',
             u'awareness', u'interfacing', u'personnel', u'interacts',
             u'regulatory', u'departments', u'input', u'objectives',
             u'identifying', u'introducing', u'concepts', u'solutions',
             u'peers', u'customers', u'coworkers', u'knowledge', u'skills',
             u'engineering', u'quality', u'engineering'
         ],
         [
             u'commissioning', u'startup', u'knowledge', u'simulators',
             u'technologies', u'knowledge', u'engineering', u'techniques',
             u'disciplines', u'leadership', u'skills', u'proven',
             u'engineers', u'oral', u'skills', u'technical', u'skills',
             u'analytically', u'solve', u'complex', u'interpret',
             u'proficiency', u'simulation', u'knowledge', u'applications',
             u'manipulate', u'applications', u'engineering'
         ],
         [
             u'calculations', u'programs', u'matlab', u'excel',
             u'independently', u'environment', u'proven', u'skills',
             u'effectively', u'multiple', u'tasks', u'planning',
             u'organizational', u'management', u'skills', u'rigzone',
             u'jobs', u'developer', u'exceptional', u'strategies',
             u'junction', u'exceptional', u'strategies', u'solutions',
             u'solutions', u'biggest', u'insurers', u'operates',
             u'investment'
         ],
         [
             u'vegas', u'tasks', u'electrical', u'contracting',
             u'expertise', u'virtually', u'electrical', u'developments',
             u'institutional', u'utilities', u'technical', u'experts',
             u'relationships', u'credibility', u'contractors', u'utility',
             u'customers', u'customer', u'relationships', u'consistently',
             u'innovations', u'profile', u'construct', u'envision',
             u'dynamic', u'complex', u'electrical', u'management', u'grad',
             u'internship', u'electrical', u'engineering',
             u'infrastructures', u'engineers', u'documented', u'management',
             u'engineering', u'quality', u'engineering', u'electrical',
             u'engineers', u'complex', u'distribution', u'grounding',
             u'estimation', u'testing', u'procedures', u'voltage',
             u'engineering'
         ],
         [
             u'troubleshooting', u'installation', u'documentation', u'bsee',
             u'certification', u'electrical', u'voltage', u'cabling',
             u'electrical', u'engineering', u'candidates', u'electrical',
             u'internships', u'oral', u'skills', u'organizational',
             u'prioritization', u'skills', u'skills', u'excel', u'cadd',
             u'calculation', u'autocad', u'mathcad', u'skills', u'skills',
             u'customer', u'relationships', u'solving', u'ethic',
             u'motivation', u'tasks', u'budget', u'affirmative',
             u'diversity', u'workforce', u'gender', u'orientation',
             u'disability', u'disabled', u'veteran', u'vietnam', u'veteran',
             u'qualifying', u'veteran', u'diverse', u'candidates',
             u'respond', u'developing', u'workplace', u'reflects',
             u'diversity', u'communities', u'reviews', u'electrical',
             u'contracting', u'southwest', u'electrical', u'contractors'
         ],
         [
             u'intern', u'electrical', u'engineering', u'idexx',
             u'laboratories', u'validating', u'idexx', u'integrated',
             u'hardware', u'entails', u'planning', u'debug', u'validation',
             u'engineers', u'validation', u'methodologies', u'healthcare',
             u'platforms', u'brightest', u'solve', u'challenges',
             u'innovation', u'technology', u'idexx', u'intern', u'idexx',
             u'interns', u'supplement', u'interns', u'teams', u'roles',
             u'competitive', u'interns', u'idexx', u'interns',
             u'participate', u'internships', u'mentors', u'seminars',
             u'topics', u'leadership', u'workshops', u'relevant',
             u'planning', u'topics', u'intern', u'presentations', u'mixers',
             u'applicants', u'ineligible', u'laboratory', u'compliant',
             u'idexx', u'laboratories', u'healthcare', u'innovation',
             u'practicing', u'veterinarians', u'diagnostic', u'technology',
             u'idexx', u'enhance', u'veterinarians', u'efficiency',
             u'economically', u'idexx', u'worldwide', u'diagnostic',
             u'tests', u'tests', u'quality', u'headquartered', u'idexx',
             u'laboratories', u'employs', u'customers', u'qualifications',
             u'applicants', u'idexx', u'interns', u'potential',
             u'demonstrated', u'portfolio', u'recommendation', u'resumes',
             u'marketing', u'location', u'americas', u'verification',
             u'validation', u'schedule', u'overtime', u'idexx',
             u'laboratories', u'reviews', u'idexx', u'laboratories',
             u'nasdaq', u'healthcare', u'innovation', u'practicing',
             u'veterinarians'
         ],
         [
             u'location', u'duration', u'temp', u'verification',
             u'validation', u'tester', u'verification', u'validation',
             u'middleware', u'specifically', u'testing', u'applications',
             u'clinical', u'laboratory', u'regulated', u'environment',
             u'responsibilities', u'complex', u'hardware', u'testing',
             u'clinical', u'analyzers', u'laboratory', u'graphical',
             u'interfaces', u'complex', u'sample', u'sequencing',
             u'protocols', u'developers', u'correction', u'tracking',
             u'tool', u'timely', u'troubleshoot', u'testing', u'functional',
             u'manual', u'automated', u'participate', u'ongoing'
         ],
         [
             u'testing', u'coverage', u'planning', u'documentation',
             u'testing', u'validation', u'corrections', u'monitor',
             u'implementation', u'recurrence', u'operating', u'statistical',
             u'quality', u'testing', u'global', u'multi', u'teams',
             u'travel', u'skills', u'concepts', u'waterfall', u'agile',
             u'methodologies', u'debugging', u'skills', u'complex',
             u'automated', u'instrumentation', u'environment', u'hardware',
             u'mechanical', u'components', u'tracking', u'lifecycle',
             u'management', u'quality', u'organize', u'define',
             u'priorities', u'organize', u'supervision', u'aggressive',
             u'deadlines', u'ambiguity', u'analyze', u'complex',
             u'situations', u'concepts', u'technologies', u'verbal',
             u'skills', u'effectively', u'technical', u'clinical',
             u'diverse', u'strategy', u'clinical', u'chemistry',
             u'analyzer', u'laboratory', u'middleware', u'basic',
             u'automated', u'testing', u'biomedical', u'engineering',
             u'technologists', u'laboratory', u'technology',
             u'availability', u'click', u'attach'
         ],
         [
             u'scientist', u'linux', u'asrc', u'scientist', u'linux',
             u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc',
             u'engineering', u'technology', u'contracts'
         ],
         [
             u'multiple', u'agencies', u'scientists', u'engineers',
             u'management', u'personnel', u'allows', u'solutions',
             u'complex', u'aeronautics', u'aviation', u'management',
             u'aviation', u'engineering', u'hughes', u'technical',
             u'technical', u'aviation', u'evaluation', u'engineering',
             u'management', u'technical', u'terminal', u'surveillance',
             u'programs', u'currently', u'scientist', u'travel',
             u'responsibilities', u'develops', u'technology', u'modifies',
             u'technical', u'complex', u'reviews', u'draft', u'conformity',
             u'completeness', u'testing', u'interface', u'hardware',
             u'regression', u'impact', u'reliability', u'maintainability',
             u'factors', u'standardization', u'skills', u'travel',
             u'programming', u'linux', u'environment', u'cisco',
             u'knowledge', u'terminal', u'environment', u'clearance',
             u'clearance', u'input', u'output', u'digital', u'automatic',
             u'terminal', u'management', u'controller', u'termination',
             u'testing', u'evaluating', u'policies', u'procedure',
             u'interface', u'installation', u'verification',
             u'certification', u'core', u'avionic', u'programs',
             u'knowledge', u'procedural', u'testing', u'interfacing',
             u'hardware', u'regression', u'impact', u'reliability',
             u'maintainability', u'factors', u'standardization',
             u'missions', u'asrc', u'subsidiaries', u'affirmative',
             u'employers', u'applicants', u'disability', u'veteran',
             u'technology', u'location', u'airport', u'bachelor',
             u'schedule', u'travel', u'contributor', u'management', u'asrc',
             u'reviews'
         ],
         [
             u'technical', u'solarcity', u'niche', u'vegas', u'overview',
             u'resolving', u'customer', u'clients', u'expanding',
             u'engineers', u'developers', u'responsibilities', u'knowledge',
             u'planning', u'adapt', u'dynamic', u'environment',
             u'inventive', u'creative', u'solarcity', u'lifecycle',
             u'responsibilities', u'technical', u'analyzing', u'diagnosing',
             u'troubleshooting', u'customers', u'ticketing', u'console',
             u'escalate', u'knowledge', u'engineering', u'timely', u'basic',
             u'phone', u'functionality', u'customer', u'tracking',
             u'knowledgebase', u'rotation', u'configure', u'deployment',
             u'sccm', u'technical', u'deployment', u'deploy', u'hardware',
             u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops',
             u'analytical', u'troubleshooting', u'solving', u'skills',
             u'knowledge', u'databases', u'preferably', u'server',
             u'preferably', u'monitoring', u'suites', u'documentation',
             u'procedures', u'knowledge', u'entries', u'verbal', u'skills',
             u'customer', u'skills', u'competitive', u'solar', u'package',
             u'insurance', u'vacation', u'savings', u'referral',
             u'eligibility', u'equity', u'performers', u'solarcity',
             u'affirmative', u'diversity', u'workplace', u'applicants',
             u'orientation', u'disability', u'veteran', u'careerrookie'
         ],
         [
             u'embedded', u'exelis', u'junction', u'exelis', u'embedded',
             u'acquisition', u'networking', u'capabilities', u'classified',
             u'customer', u'motivated', u'develops', u'tests',
             u'innovative', u'solutions', u'minimal', u'supervision',
             u'paced', u'environment', u'enjoys', u'assignments',
             u'interact', u'multi', u'disciplined', u'challenging',
             u'focused', u'embedded', u'developments', u'spanning',
             u'engineering', u'lifecycle', u'specification', u'enhancement',
             u'applications', u'embedded', u'freescale', u'applications',
             u'android', u'platforms', u'interface', u'customers',
             u'developers', u'refine', u'specifications', u'architectures'
         ],
         [
             u'java', u'programming', u'scripts', u'python', u'debug',
             u'debugging', u'emulators', u'regression', u'revisions',
             u'specialized', u'setups', u'capabilities', u'subversion',
             u'technical', u'documentation', u'multiple', u'engineering',
             u'techexpousa', u'reviews'
         ],
         [
             u'modeler', u'semantic', u'modeling', u'models', u'skills',
             u'ontology', u'resource', u'framework', u'schema',
             u'technologies', u'hadoop', u'warehouse', u'oracle',
             u'relational', u'artifacts', u'models', u'dictionaries',
             u'models', u'interface', u'specifications', u'documentation',
             u'harmonization', u'mappings', u'aligned', u'coordinate',
             u'technical', u'peer', u'reviews', u'stakeholder',
             u'communities', u'impact', u'domains', u'relationships',
             u'interdependencies', u'models', u'define', u'analyze',
             u'legacy', u'models', u'corporate', u'databases',
             u'architectural', u'alignment', u'customer', u'expertise',
             u'harmonization', u'modeling', u'modeling', u'consulting',
             u'stakeholders', u'quality', u'models', u'storage', u'agile',
             u'specifically', u'focus', u'modeling', u'qualifications',
             u'bachelors', u'accredited', u'modeler', u'encompass',
             u'evaluation', u'skills', u'knowledge', u'modeling',
             u'techniques', u'resource', u'framework', u'schema',
             u'technologies', u'unified', u'modeling', u'technologies',
             u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills',
             u'interpersonal', u'skills', u'customers', u'clearance',
             u'applicants', u'eligibility', u'classified', u'clearance',
             u'polygraph', u'techexpousa', u'solutions', u'partnership',
             u'solutions', u'integration'
         ],
         [
             u'technologies', u'junction', u'develops', u'maintains',
             u'enhances', u'complex', u'diverse', u'intensive',
             u'analytics', u'algorithm', u'manipulation', u'management',
             u'documented', u'individually', u'reviews', u'tests',
             u'components', u'adherence', u'resolves', u'utilizes',
             u'methodologies', u'environment', u'input', u'components',
             u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis',
             u'components', u'tasks', u'individually', u'analyzes',
             u'modifies', u'debugs', u'corrects', u'integrates',
             u'operating', u'environments', u'develops', u'queries',
             u'databases', u'repositories', u'recommendations',
             u'improving', u'documentation', u'develops', u'implements',
             u'algorithms', u'functional', u'assists', u'developing',
             u'executing', u'procedures', u'components', u'reviews',
             u'documentation', u'solutions', u'analyzing', u'conferring',
             u'users', u'engineers', u'analyzing', u'investigating',
             u'areas', u'adapt', u'hardware', u'mathematical', u'models',
             u'predict', u'outcome', u'implement', u'complex', u'database',
             u'repository', u'interfaces', u'queries', u'bachelors',
             u'accredited', u'substituted', u'bachelors', u'firewalls',
             u'ipsec', u'vpns', u'technology', u'administering', u'servers',
             u'apache', u'jboss', u'tomcat', u'developing', u'interfaces',
             u'firefox', u'internet', u'explorer', u'operating',
             u'mainframe', u'linux', u'solaris', u'virtual', u'scripting',
             u'programming', u'oriented', u'programming', u'ajax',
             u'script', u'procedures', u'cobol', u'cognos', u'fusion',
             u'focus', u'html', u'java', u'java', u'script', u'jquery',
             u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots',
             u'oracle', u'apex', u'integration', u'competitive', u'package',
             u'bonus', u'corporate', u'equity', u'tuition',
             u'reimbursement', u'referral', u'bonus', u'holidays',
             u'insurance', u'flexible', u'disability', u'insurance'
         ],
         [
             u'technologies', u'disability', u'accommodation', u'recruiter',
             u'techexpousa'
         ], ['bank', 'river', 'shore', 'water'],
         ['river', 'water', 'flow', 'fast', 'tree'],
         ['bank', 'water', 'fall', 'flow'],
         ['bank', 'bank', 'water', 'rain', 'river'],
         ['river', 'water', 'mud', 'tree'],
         ['money', 'transaction', 'bank', 'finance'],
         ['bank', 'borrow', 'money'], ['bank', 'finance'],
         ['finance', 'money', 'sell', 'bank'], ['borrow', 'sell'],
         ['bank', 'loan', 'sell']
     ]
     # initializing using own LDA sufficient statistics so that we get same results each time.
     sstats = np.loadtxt(datapath('DTM/sstats_test.txt'))
     dictionary = Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=2,
                                           time_slice=[10, 10, 11],
                                           initialize='own',
                                           sstats=sstats,
                                           passes=2,
                                           lda_inference_max_iter=10,
                                           em_min_iter=1,
                                           em_max_iter=4)
Exemple #37
0
 def setUp(self):
     """Prepare the fastText fixtures used by the tests.

     Sets ``ft_path`` to ``$FT_HOME/fasttext`` when the ``FT_HOME``
     environment variable is set to a non-empty value (``None`` otherwise),
     loads the ``lee_fasttext`` reference model, and records the path used
     for ``lee_fasttext_new``.
     """
     fasttext_root = os.environ.get('FT_HOME')
     if fasttext_root:
         self.ft_path = os.path.join(fasttext_root, 'fasttext')
     else:
         self.ft_path = None

     lee_model_path = datapath('lee_fasttext')
     self.test_model_file = lee_model_path
     self.test_model = FT_gensim.load_fasttext_format(lee_model_path)
     self.test_new_model_file = datapath('lee_fasttext_new')
        mean_jaccard.append(np.mean(jacc_np))
        mean_bleu.append(np.mean(bleu_np))
        mean_cos.append(np.mean(cos_np))
        mean_fscore.append(np.mean(fscore_np))
    return np.max(np.asarray(mean_bleu)), np.max(
        np.asarray(mean_jaccard)), np.max(np.asarray(mean_cos)), np.max(
            np.asarray(mean_fscore))


# Unpack the six values returned by load_annotations() (defined outside this
# excerpt).
GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations(
)
# Hard-coded data directory; every file below is resolved against it by plain
# string concatenation, so the trailing slash matters.
path = "/home/norberteke/PycharmProjects/Thesis/data/"

dictionary = Dictionary.load(path + 'GH_full_processed_Dictionary.dict')
# NOTE(review): datapath() is handed an already-absolute path here; presumably
# it comes back unchanged -- confirm, or pass the path to MmCorpus directly.
corpus = MmCorpus(datapath(path + 'corpus_processed_GH_full.mm'))

# Read the whole CSV into memory: one list of string fields per row.
# NOTE(review): the csv docs recommend opening the file with newline='' on
# Python 3 -- confirm the target interpreter before changing.
texts = []
with open(path + 'GH_full_processed_corpus.csv', 'r') as f:
    reader = csv.reader(f)
    texts = list(reader)

# Collect the value side of every (key, value) pair yielded by the dictionary.
terms = []
for (key, value) in dictionary.iteritems():
    terms.append(value)

def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos,
                          max_fscore):
    with open(path, 'a') as f:
        writer = csv.writer(f,
Exemple #39
0
    def __init__(
        self,
        data_path='data/soccer/',
        vec_dim=300,
        # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin'):
        fasttext_model='/data/dchaudhu/soccerbot_acl/vocab/wiki.en.bin'):
        """Load the data splits and build the vocabulary and embedding matrix.

        Loads a fastText model, reads the knowledge graph and the
        train/val/test splits through ``self.get_kg`` / ``self.get_data``,
        builds ``self.vocab``, the ``self.stoi`` / ``self.itos`` lookups, and
        fills ``self.vectors`` from the fastText word vectors.

        :param data_path: root directory of the data; ``'KG/'`` is appended to
            it for the ``get_kg`` call.
        :param vec_dim: number of columns of ``self.vectors``.
        :param fasttext_model: path of the fastText model, passed through
            ``datapath`` and then to ``load_facebook_model``.
        """
        self.data_path = data_path
        self.max_similarity = 85
        self.vec_dim = vec_dim
        self.args = get_args()
        # NOTE(review): datapath() receives an absolute path by default;
        # presumably it is returned unchanged -- confirm.
        cap_path = datapath(fasttext_model)
        self.word_emb = load_facebook_model(cap_path)
        # print (self.max_er_vec)
        self.stop = set(stopwords.words('english'))
        self.punc = string.punctuation
        self.ent_d, self.ent_list = self.get_kg(self.data_path + 'KG/')
        self.train_dataset = self.get_data('train')
        self.val_dataset = self.get_data('val')
        self.test_dataset = self.get_data('test')
        # For each training sample: total length of all values in its 'kgER'
        # mapping. The maximum over samples becomes max_out_reln.
        self.max_er_vec = []  # max er vector combination size
        for dat in self.train_dataset:
            self.max_er_vec.append(sum(len(v) for k, v in dat['kgER'].items()))
        self.max_out_reln = np.max(self.max_er_vec)
        # Largest getER_vec() output length over the training samples.
        self.inp_graph_max_size = np.max(
            [len(getER_vec(kg['kgER'])) for kg in self.train_dataset])
        print('input graph size:' + str(self.inp_graph_max_size))
        print(self.max_out_reln)
        # Placeholder tokens 'o0' .. 'o<max_out_reln - 1>'.
        self.objects = ['o' + str(j) for j in range(self.max_out_reln)]
        # Create vocabulary and word2id
        self.vocab = defaultdict(float)
        self.get_vocab(self.train_dataset)
        self.get_vocab(self.test_dataset)
        self.get_vocab(self.val_dataset)
        # Make sure the special tokens and the object placeholders have an
        # entry in the vocabulary (their count is bumped by one).
        self.vocab[self.args.unk_tok] += 1.0
        self.vocab[self.args.sos_tok] += 1.0
        self.vocab[self.args.eou_tok] += 1.0
        self.vocab[self.args.mem_tok] += 1.0
        self.vocab[self.args.eos_tok] += 1.0
        for o in self.objects:
            self.vocab[o] += 1.0
        # self.stoi[self.args.pad_tok] = 0
        # Ids 1..len(vocab) follow the iteration order of self.vocab's keys;
        # id 0 is assigned to the padding token afterwards.
        self.stoi = dict(zip(self.vocab.keys(), range(1, len(self.vocab) + 1)))
        self.stoi[self.args.pad_tok] = 0
        # add additional tokens
        # self.stoi[self.args.unk_tok] = len(self.stoi)
        # self.stoi[self.args.sos_tok] = len(self.stoi)
        # self.stoi[self.args.eos_tok] = len(self.stoi)
        # print(len(self.stoi))
        # self.itos = {v: k for k, v in self.stoi.items()}

        # for j in range(self.max_out_reln):
        #     self.stoi['o'+str(j)] = len(self.stoi)+1
        # del self.stoi

        # Reverse lookup: id -> token.
        self.itos = {v: k for k, v in self.stoi.items()}
        print(len(self.stoi))
        self.n_words = len(self.stoi)

        # One row per id. Only tokens whose id is below the eos token's id
        # (this includes the pad token at id 0) are filled from the fastText
        # vectors; every other row stays all-zero.
        self.vectors = np.zeros((len(self.stoi), vec_dim))
        for w, w2i in self.stoi.items():
            if w2i < self.stoi[self.args.eos_tok]:
                self.vectors[w2i] = self.word_emb.wv[w]
Exemple #40
0
    return sim


'''
Author:衣介书生
Link:https://www.jianshu.com/p/0c33c17770a0
'''


def multi_vec(vector_a, x):
    """Return *vector_a*, interpreted as a NumPy matrix, multiplied by *x*.

    :param vector_a: array-like; a 1-D input becomes a ``1 x n`` matrix.
    :param x: multiplier. The caller in this file passes a scalar weight;
        matrix operands follow ``numpy.matrix`` multiplication rules.
    :return: ``numpy.matrix`` holding the product.

    ``np.asmatrix`` is used instead of ``np.mat``: the latter was only an
    alias for it and was removed from the main namespace in NumPy 2.0.
    """
    return np.asmatrix(vector_a) * x


for model in models:
    embeddings = KeyedVectors.load_word2vec_format(datapath(
        (path / "models/{}.vector".format(model))),
                                                   binary=False)

    cilin = open('../files/cilin_hier_perword', 'rb')
    fileout = open('../results/resem_comp_' + model, 'w', encoding='utf-8')

    scores = 0.0
    cnt = 0
    for line in cilin:
        line = line.decode('utf-8').split()
        word = line[5]
        sem = line[:5]
        length1 = len(sem)
        semvec1 = np.mat(0)
        for i in range(len(sem)):
            tmp = multi_vec(embeddings[sem[i]], 1 / (2**(len(sem) - i))) if (
Exemple #41
0
 def setUp(self):
     """Load ``testcorpus.mm`` via ``datapath`` and fit a two-topic LSI model on it."""
     corpus = MmCorpus(datapath('testcorpus.mm'))
     self.corpus = corpus
     self.model = lsimodel.LsiModel(corpus, num_topics=2)
    def run_training_batch(self, batch, batch_idx):
        """Co-train the GSM topic model on one batch, then run the optimizer loop.

        The batch's ``input_ids`` are decoded back to text, cleaned, converted
        to bag-of-words with a saved ``Dictionary`` and handed to the GSM
        trainer. The resulting topic loss and topic distribution are then
        passed to ``self.optimizer_closure`` for every (split, optimizer) pair.

        :param batch: dict; documented by the original author as containing
            three keys: input_ids, attention_mask, decoder_input_ids. Only
            ``input_ids`` is read directly in this method. Example::

                {'input_ids': tensor([[0, 36, 230, ..., 8, 41, 2]]),
                 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1]]),
                 'decoder_input_ids': tensor([[0, 287, 10, ..., 1318, 2]])}

            ``None`` is accepted and short-circuits with ``signal=0``.
        :param batch_idx: number of batch
        :return: ``AttributeDict``. On the normal path it carries ``signal=0``,
            ``grad_norm_dic``, ``batch_log_metrics`` and
            ``training_step_output_for_epoch_end``. The early exits carry only
            ``signal`` and ``grad_norm_dic``: ``signal=0`` when ``batch`` is
            ``None``, ``signal=-1`` when the model's ``on_batch_start`` hook
            returns ``-1``.
        """
        # track grad norms
        grad_norm_dic = {}

        # This guard must run before anything subscripts ``batch``; placed
        # after the topic-modeling section it could never be reached, because
        # ``batch['input_ids']`` raises TypeError on None first.
        if batch is None:
            return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

        # load tokenizer
        # NOTE(review): tokenizer, config and dictionary are re-loaded on every
        # call; consider caching them on the instance.
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
        # load config for GSM
        config = yaml_load(f"{self.default_root_dir}/data/config/gsm.yaml")
        # load dict
        dictionary = Dictionary.load(datapath('dict-www-cnndm-unigram'))
        # separator tokens to strip from the decoded text
        sep_list = [
            '[SEP_0]', '[SEP_1]', '[SEP_2]', '[SEP_3]', '[SEP_4]', '[SEP_5]',
            '[SEP_6]', '[SEP_7]', '[SEP_8]', '[SEP_9]', '<S_SEP>'
        ]
        # vocab size for topic modeling
        vocab_size = len(dictionary)
        # model
        config['hidden']['features'][0] = vocab_size

        # trainer batch
        config['trainer_batch']['test_sample'] = 1
        config = extend_config_reference(config)
        gsm_trainer = config['GSMtrainer']
        gsm_trainer[
            'base_dir'] = f"{self.default_root_dir}/log/bart-large-cnn-finetune"
        gsm_trainer = GSMTrainer.from_config(gsm_trainer)

        # number of topics
        K = config['gsmtopic']['k']

        # yaml_dump(gsm_trainer,
        #           os.path.join(f"{self.default_root_dir}/log/bart-large-cnn-finetune", "gsm_trainer.yaml"))

        # -----------------------------------------
        # Topic Modeling - GSM
        # -----------------------------------------
        batch_size = batch['input_ids'].size()[0]

        docs = []
        for batch_num in range(batch_size):
            # decode the token ids of this sample back to a string
            batch_sentence = tokenizer.decode(
                batch['input_ids'][batch_num].tolist(),
                skip_special_tokens=True)
            # split on single spaces (lowercasing happens further down)
            batch_sentence_list = batch_sentence.split(" ")
            # remove [SEP]
            batch_sentence_list_nosep = [
                item for item in batch_sentence_list if item not in sep_list
            ]
            text = ' '.join(batch_sentence_list_nosep)
            # glue ' ##' continuation pieces back together, then lowercase
            fine_text = text.replace(' ##', '').lower()
            # drop everything that is neither a word character nor whitespace
            batch_sentence = re.sub(r'[^\w\s]', '', fine_text)
            # batch_sentence: change to the cleaned news for topic modeling
            # change to training data format in topic modeling
            gsm_data_bow = dictionary.doc2bow(batch_sentence.split(" "))
            docs.append(gsm_data_bow)

        # gsm_data: data for topic modeling
        gsm_data = DataLoader(DocDataset(docs, len(dictionary), device='cuda'),
                              batch_size=config['dataset']['batch_size'],
                              drop_last=False,
                              num_workers=0)

        # hand this batch's documents to the GSM trainer as its train iterator
        gsm_trainer.__dict__['train_iterator'] = gsm_data

        gsm_loss, gsm_p = gsm_trainer.co_train(vocab_size, training=True)

        del gsm_data

        # track all metrics for callbacks
        batch_callback_metrics = []

        # track metrics to log
        batch_log_metrics = []

        # Batch start events
        with self.profiler.profile('on_batch_start'):
            # callbacks
            self.on_batch_start()
            # hooks
            if self.is_function_implemented('on_batch_start'):
                response = self.get_model().on_batch_start(batch)
                if response == -1:
                    return AttributeDict(signal=-1,
                                         grad_norm_dic=grad_norm_dic)

        splits = [batch]
        if self.truncated_bptt_steps is not None:
            model_ref = self.get_model()
            with self.profiler.profile('tbptt_split_batch'):
                splits = model_ref.tbptt_split_batch(batch,
                                                     self.truncated_bptt_steps)

        self.hiddens = None
        for split_idx, split_batch in enumerate(splits):
            self.split_idx = split_idx

            for opt_idx, optimizer in self._get_optimizers_iterable():
                # make sure only the gradients of the current optimizer's parameters are calculated
                # in the training step to prevent dangling gradients in multiple-optimizer setup.
                if len(self.optimizers) > 1:
                    for param in self.get_model().parameters():
                        param.requires_grad = False
                    for group in optimizer.param_groups:
                        for param in group['params']:
                            param.requires_grad = True

                # -------------------
                # calculate loss
                # -------------------
                beta = 0.01
                opt_closure_result = self.optimizer_closure(
                    split_batch,
                    batch_idx,
                    opt_idx,
                    optimizer,
                    self.hiddens,
                    gsm_p,  # topic distribution
                    gsm_loss,  # loss for topic modeling
                    K,  # number of topics
                    beta,
                )

                # ------------------------------
                # POST forward bookkeeping
                # ------------------------------
                batch_callback_metrics.append(
                    opt_closure_result.training_step_output.callback_metrics)
                batch_log_metrics.append(
                    opt_closure_result.training_step_output.log_metrics)

                self.add_progress_bar_metrics(
                    opt_closure_result.training_step_output.pbar_on_batch_end)

                # track hiddens
                self.hiddens = opt_closure_result.hiddens

                # check if loss or model weights are nan
                if self.terminate_on_nan:
                    self.detect_nan_tensors(opt_closure_result.loss)

                # track total loss for logging (avoid mem leaks)
                self.batch_loss_value.append(opt_closure_result.loss)

                # ------------------------------
                # BACKWARD PASS
                # ------------------------------
                # gradient update with accumulated gradients
                if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
                    # backward
                    grad_norm_dic = self.run_batch_backward_pass(
                        split_batch, batch_idx, opt_idx, optimizer)

                    # calculate running loss for display
                    self.running_loss.append(self.batch_loss_value.mean())

                    # reset for next set of accumulated grads
                    self.batch_loss_value.reset()

        # Batch end events
        with self.profiler.profile('on_batch_end'):
            # callbacks
            self.on_batch_end()
            # model hooks
            if self.is_function_implemented('on_batch_end'):
                self.get_model().on_batch_end()

        # collapse all metrics into one dict
        batch_log_metrics = {
            k: v
            for d in batch_log_metrics for k, v in d.items()
        }

        # track all metrics for callbacks
        self.callback_metrics.update(
            {k: v
             for d in batch_callback_metrics for k, v in d.items()})

        # NOTE(review): opt_closure_result is only bound inside the loops
        # above; this line assumes at least one (split, optimizer) pair ran.
        result = AttributeDict(
            signal=0,
            grad_norm_dic=grad_norm_dic,
            batch_log_metrics=batch_log_metrics,
            training_step_output_for_epoch_end=opt_closure_result.
            training_step_output_for_epoch_end)
        return result
from gensim.test.utils import datapath
from gensim.models.ldamodel import LdaModel

# Load a previously saved LDA model from disk.
# NOTE(review): datapath() comes from gensim.test.utils and is given an
# absolute Windows path here; presumably the path comes back unchanged --
# confirm, or pass the path to LdaModel.load directly.
model_location = datapath('D:/HazMat/Projects/ML/Models/model_130')
model = LdaModel.load(model_location)

# Print whatever print_topics(10) returns.
print(model.print_topics(10))
Exemple #44
0
def _split_meanings(raw_meaning, dividers):
    """Split a raw meaning string into its individual meanings.

    Every divider is replaced by the internal separator "£", double spaces are
    collapsed (three passes, exactly like the code this helper replaces),
    spaces next to the separator are dropped and the string is split on "£".

    :param raw_meaning: meaning string that may hold several meanings (string)
    :param dividers: substrings that separate the meanings (iterable of strings)
    :return: list of meaning strings
    """
    for divider in dividers:
        raw_meaning = raw_meaning.replace(divider, "£")

    # Deliberately three passes (not "until stable") to keep the historical output.
    for _ in range(3):
        raw_meaning = raw_meaning.replace("  ", " ")

    raw_meaning = raw_meaning.replace("£ ", "£")
    raw_meaning = raw_meaning.replace(" £", "£")
    return raw_meaning.split("£")


def ALeA(json_semanticSelection_One,
         json_semanticSelection_Two,
         pathModel,
         pathOutput,
         scoreAlignPhon="09_Aver_Score_Sem-Phon_Corr",
         verbose=False,
         semanticLevel="Level_01",
         dividers=None,
         selectBest="07_Sim_Score_Phon_Corr_Match",
         selectBestThreshold=0.65,
         parseVow=True):
    """
    Pair the entries of two semantically tagged lexical lists, score every pair
    semantically (WmdSimilarity) and phonetically (interfaceFAAL), rank the pairs
    and write the results to disk.

    Three files are written: ``pathOutput + ".json"`` (all ranked matches),
    ``pathOutput + "_best_<selectBestThreshold>.json"`` (selected matches) and
    ``pathOutput + "_bestSimplified_<selectBestThreshold>.txt"`` (plain-text summary).

    :param json_semanticSelection_One: first semantically tagged lexical list
                -- format: json string - output of the ASeT algorithm
    :param json_semanticSelection_Two: second semantically tagged lexical list
                -- format: json string - output of the ASeT algorithm
    :param pathModel: path to saved semantic model (string)
    :param pathOutput: path to save the results (string - no extension; e.g. /my/folder/name_file_with_my_results)
    :param scoreAlignPhon: select type of score according to which the phonetic alignments are organized (string)
            -- default: "09_Aver_Score_Sem-Phon_Corr"
            -- options: "07_Sim_Score_Phon_Corr_Match", "08_Sim_Score_Phon_Glob_Match", "09_Aver_Score_Sem-Phon_Corr", or "10_Aver_Score_Sem-Phon_Glob"
            -- "07_Sim_Score_Phon_Corr_Match" uses the function "(((SumFeat) / (NrFeat * 7.71)) / (LenAlign * 4.77117)"
            -- "09_Aver_Score_Sem-Phon_Corr" is the average between the semantic score and the "07_Sim_Score_Phon_Corr_Match"
            -- "10_Aver_Score_Sem-Phon_Glob" is the average between the semantic score and the "08_Sim_Score_Phon_Glob_Match"
            -- see FAAL documentation for details ( https://github.com/MKilani/FAAL )
    :param verbose: print the simplified results at the end of the execution (boolean)
            -- default: False
    :param semanticLevel: level of the semantic tags according to which the comparison is performed. The options, for now, are: "Level_01", "Level_02", "Level_03" (see ASeT algorithm for details)
    :param dividers: dividers used to split meanings (array of strings [string, string])
            -- default: [","] (used when the argument is None)
    :param selectBest: parameter according to which the algorithm selects the best matches among those identified by the ALeA on the basis of the other parameters
            -- default: "07_Sim_Score_Phon_Corr_Match"
            -- options: "07_Sim_Score_Phon_Corr_Match", "08_Sim_Score_Phon_Glob_Match", "09_Aver_Score_Sem-Phon_Corr", or "10_Aver_Score_Sem-Phon_Glob"
            -- "07_Sim_Score_Phon_Corr_Match" uses the function "(((SumFeat) / (NrFeat * 7.71)) / (LenAlign * 4.77117)"
            -- "09_Aver_Score_Sem-Phon_Corr" is the average between the semantic score and the "07_Sim_Score_Phon_Corr_Match"
            -- "10_Aver_Score_Sem-Phon_Glob" is the average between the semantic score and the "08_Sim_Score_Phon_Glob_Match"
            -- see FAAL documentation for details ( https://github.com/MKilani/FAAL )
    :param selectBestThreshold: threshold for the parameter selectBest
            -- default: 0.65
    :param parseVow: this allows to decide if the phonetic comparison should take into consideration vowels or not. Ignoring vowels can be useful when dealing with unrelated or relatively distant languages, or with languages in which vowels are rather unstable and semantically secondary (e.g. Semitic languages)
            -- default: True
    :return: tuple (json string with all ranked matches, json string with the
             selected best matches, simplified plain-text summary)
    """
    # A None sentinel replaces the former mutable default argument.
    if dividers is None:
        dividers = [","]

    gateway = JavaGateway()
    addition_app = gateway.entry_point

    semanticSelectionDict_One = json.loads(json_semanticSelection_One)
    semanticSelectionDict_Two = json.loads(json_semanticSelection_Two)

    semanticSelectionDict = {}

    # Index the matches of list two by semantic field for direct lookup.
    SemanticIndex_ListTwo = {}
    for key_Two, entryTwo in semanticSelectionDict_Two.items():
        ID_Token = entryTwo["00_ID_token"]
        for match_ID, match_Two in entryTwo["03_Matches"][semanticLevel].items():
            SemanticIndex_ListTwo.setdefault(
                match_Two["11_Semantic_Field"], []).append({
                    "Key": key_Two,
                    "ID_token": ID_Token,
                    "ID_match": match_ID,
                    "ID_Cluster": match_Two["05_ID_Cluster"]
                })

    # ------------------------------------------------------------------
    # Combine lists: one new entry per (token of list one, cluster ID)
    # ------------------------------------------------------------------
    counterNewPairs = 0

    # The header text is kept verbatim although this stage only pairs entries.
    print("*- Phonetic comparison -*")
    print("-> Start")

    # set up progress bar
    print("Progress:")

    for indexBar, entry in enumerate(semanticSelectionDict_One.values()):
        matches_One = entry["03_Matches"][semanticLevel]

        # The cluster ID of the last listed match is used as the upper bound.
        last_match = list(matches_One)[-1]
        max_cluster_ID = matches_One[last_match]["05_ID_Cluster"]

        for new_ID_cluster in range(max_cluster_ID + 1):
            new_matches = {}

            for match_One in matches_One.values():
                if match_One["05_ID_Cluster"] > new_ID_cluster:
                    continue

                semanticToMatch = match_One["11_Semantic_Field"]

                for matchTwo in SemanticIndex_ListTwo.get(semanticToMatch, []):
                    progbar(indexBar, len(semanticSelectionDict_One) - 1, 20)

                    if matchTwo["ID_Cluster"] > new_ID_cluster:
                        continue

                    entry_Two = semanticSelectionDict_Two[matchTwo["Key"]]
                    # Matches are numbered consecutively from 0.
                    new_matches[len(new_matches)] = {
                        "00_ID_Match": entry_Two["00_ID_token"],
                        "01_Meaning_Match": entry_Two["01_Meaning_token"],
                        "02_Form_Match": entry_Two["02_Form_token"],
                        "03_Best_Match_Sem": [semanticToMatch, semanticToMatch],
                        "05_ID_Cluster": new_ID_cluster,
                        "06_Sim_Score_Sem_Match": 1.0,
                        "11_Semantic_Field": semanticToMatch,
                    }

            # Every top-level key holds exactly one cluster entry.
            semanticSelectionDict[counterNewPairs] = {
                new_ID_cluster: {
                    "00_ID_token": entry["00_ID_token"],
                    "01_Meaning_token": entry["01_Meaning_token"],
                    "02_Form_token": entry["02_Form_token"],
                    "03_Matches": {semanticLevel: new_matches},
                }
            }
            counterNewPairs += 1

    print()

    # ------------------------------------------------------------------
    # Semantic comparison
    # ------------------------------------------------------------------
    print("-> Load Model")

    # load the google word2vec model
    # NOTE(review): datapath() is a gensim test helper; confirm it leaves an
    # absolute pathModel unchanged.
    temp_file = datapath(pathModel)
    model = KeyedVectors.load(temp_file)

    print("-> Model loaded")

    counter = 0
    for clusters in semanticSelectionDict.values():
        for cluster_entry in clusters.values():
            listMeanings = [
                meaning.split(" ") for meaning in _split_meanings(
                    cluster_entry['01_Meaning_token'], dividers)
            ]

            print("-> Compile semantic index")
            print(str(counter + 1) + " of " + str(len(semanticSelectionDict)))
            counter += 1
            index = WmdSimilarity(listMeanings, model, len(listMeanings))
            print("-> Semantic index compiled")

            for match in cluster_entry["03_Matches"][semanticLevel].values():
                bestResult = 0.0
                bestMatch = ["", ""]

                for meaning in _split_meanings(match["01_Meaning_Match"],
                                               dividers):
                    # NOTE(review): the query is the whole meaning as a single
                    # token while the indexed meanings are split into words --
                    # confirm this is intended.
                    resultsQuery = list(index[[meaning]])
                    # Only the first returned (document index, score) pair is used.
                    if resultsQuery and resultsQuery[0][1] > bestResult:
                        bestResult = resultsQuery[0][1]
                        bestMatch = [
                            " ".join(listMeanings[resultsQuery[0][0]]),
                            meaning
                        ]

                match["06_Sim_Score_Sem_Match"] = bestResult
                match["03_Best_Match_Sem"] = bestMatch

    # ------------------------------------------------------------------
    # Phonetic comparison
    # ------------------------------------------------------------------
    print("*- Phonetic comparison -*")
    print("-> Start")

    # set up progress bar
    indexBar = -1
    print("Progress:")

    for clusters in semanticSelectionDict.values():
        for cluster_entry in clusters.values():

            indexBar += 1
            progbar(indexBar, len(semanticSelectionDict) - 1, 20)

            matches = cluster_entry["03_Matches"][semanticLevel]
            if matches == {}:
                continue

            word_A_list = cluster_entry['02_Form_token']

            for match in matches.values():
                word_B_list = match["02_Form_Match"]

                resultsComparison = {}
                IDBestMatch = []

                # Compare phonetically with FAAL - when there is more than one
                # variant, keep the pair with the best "scoreAlignPhon" score.
                for index_WordA, word_A in enumerate(word_A_list):
                    for index_WordB, word_B in enumerate(word_B_list):

                        if not parseVow:
                            noVowWord_A = removeVow(word_A)
                            noVowWord_B = removeVow(word_B)
                            resultsComparisonTemp = interfaceFAAL(
                                noVowWord_A, noVowWord_B, addition_app)
                        else:
                            # The gateway entry point is passed here as well,
                            # consistently with the vowel-less branch.
                            resultsComparisonTemp = interfaceFAAL(
                                word_A, word_B, addition_app)

                        if (resultsComparison == {}
                                or resultsComparisonTemp[scoreAlignPhon] >
                                resultsComparison[scoreAlignPhon]):
                            resultsComparison = resultsComparisonTemp
                            IDBestMatch = [
                                index_WordA, word_A, index_WordB, word_B
                            ]

                match['12_ResultsComp'] = resultsComparison
                match['04_Best_Match_Phon'] = IDBestMatch
                match['07_Sim_Score_Phon_Corr_Match'] = resultsComparison[
                    "bestAlignCorrected"]
                match['08_Sim_Score_Phon_Glob_Match'] = resultsComparison[
                    "bestAlignGlobal"]
                match['09_Aver_Score_Sem-Phon_Corr'] = (
                    match["07_Sim_Score_Phon_Corr_Match"] +
                    match["06_Sim_Score_Sem_Match"]) / 2
                match['10_Aver_Score_Sem-Phon_Glob'] = (
                    match["08_Sim_Score_Phon_Glob_Match"] +
                    match["06_Sim_Score_Sem_Match"]) / 2

    # ------------------------------------------------------------------
    # Order the matches of every entry by descending "scoreAlignPhon"
    # ------------------------------------------------------------------
    print()
    # set up progress bar
    indexBar = -1
    print("Progress:")

    semanticSelectionDict_ordered = {}

    for key_A, clusters in semanticSelectionDict.items():

        indexBar += 1
        progbar(indexBar, len(semanticSelectionDict) - 1, 20)

        ordered_clusters = semanticSelectionDict_ordered.setdefault(key_A, {})
        temporaryEntries = []

        for sem_Cluster, cluster_entry in clusters.items():
            ordered_entry = ordered_clusters.setdefault(sem_Cluster, {})
            for field in ("00_ID_token", "01_Meaning_token", "02_Form_token"):
                ordered_entry[field] = cluster_entry[field]

            matches = cluster_entry["03_Matches"][semanticLevel]
            if matches == {}:
                ordered_entry["03_Matches"] = {semanticLevel: matches}
                continue

            # Insertion sort, highest score first. On equal scores the entry
            # inserted later is placed before the earlier one.
            for n in range(len(matches)):
                if not temporaryEntries:
                    temporaryEntries.append(matches[0])
                    continue

                candidate = matches[n]
                score = candidate[scoreAlignPhon]

                if score >= temporaryEntries[0][scoreAlignPhon]:
                    temporaryEntries.insert(0, candidate)
                elif score < temporaryEntries[-1][scoreAlignPhon]:
                    temporaryEntries.append(candidate)
                else:
                    for z in range(1, len(temporaryEntries)):
                        if (temporaryEntries[z - 1][scoreAlignPhon] > score >=
                                temporaryEntries[z][scoreAlignPhon]):
                            temporaryEntries.insert(z, candidate)
                            break

            # remove doubles: keep the first entry for each "00_ID_Match"
            temporaryEntriesCleaned = []
            for temporaryEntry in temporaryEntries:
                doubleEntry = any(
                    temporaryEntry["00_ID_Match"] == kept["00_ID_Match"]
                    for kept in temporaryEntriesCleaned)
                if not doubleEntry:
                    temporaryEntriesCleaned.append(
                        copy.deepcopy(temporaryEntry))

            ordered_entry["03_Matches"] = {
                semanticLevel: dict(enumerate(temporaryEntriesCleaned))
            }

    json_semanticSelectionDict = json.dumps(semanticSelectionDict_ordered,
                                            sort_keys=True,
                                            indent=3,
                                            ensure_ascii=False)

    print()
    print("-> End")

    # ------------------------------------------------------------------
    # Select the top matches
    # ------------------------------------------------------------------
    print()
    # set up progress bar
    indexBar = -1
    print("Select top matches - Progress:")

    # Reload from JSON: all dictionary keys are strings from here on.
    semanticSelectionDict = json.loads(json_semanticSelectionDict)

    semanticSelectionDict_ordered_best = {}
    simplifiedLines = []

    for key_A, clusters in semanticSelectionDict.items():

        indexBar += 1
        progbar(indexBar, len(semanticSelectionDict) - 1, 20)

        best_clusters = semanticSelectionDict_ordered_best.setdefault(key_A, {})
        temporaryEntries = []

        for sem_Cluster, cluster_entry in clusters.items():
            best_entry = best_clusters.setdefault(sem_Cluster, {})
            for field in ("00_ID_token", "01_Meaning_token", "02_Form_token"):
                best_entry[field] = cluster_entry[field]

            matches = cluster_entry["03_Matches"][semanticLevel]
            if matches == {}:
                best_entry["03_Matches"] = {semanticLevel: matches}
                continue

            for n in range(len(matches)):
                if not temporaryEntries:
                    # The top-ranked match is always kept, whatever its score.
                    temporaryEntries.append(matches[str(0)])
                elif matches[str(n)][selectBest] > selectBestThreshold:
                    temporaryEntries.append(matches[str(n)])

        # As before, this part runs once per key_A, after the cluster loop, and
        # applies to the last cluster visited (each key holds a single cluster).
        cluster_entry = semanticSelectionDict[key_A][sem_Cluster]
        best_matches = {}
        best_clusters[sem_Cluster]["03_Matches"] = {semanticLevel: best_matches}

        for ID, kept_entry in enumerate(temporaryEntries):
            best_match = copy.deepcopy(kept_entry)
            best_matches[str(ID)] = best_match

            simplifiedLines.append(
                "Cluster: " + str(sem_Cluster) + " :: " +
                str(cluster_entry["00_ID_token"]) + " - '" +
                ", ".join(cluster_entry["02_Form_token"]) + "' - " +
                cluster_entry["01_Meaning_token"] + " :: " +
                str(best_match["00_ID_Match"]) + " - '" +
                ", ".join(best_match["02_Form_Match"]) + "' - " +
                best_match["01_Meaning_Match"] + " :: " +
                str(best_match[selectBest]) + "\n")
        simplifiedLines.append("---------\n")

    resultsSimplifiedString = "".join(simplifiedLines)

    if verbose:
        print()
        print()
        print(resultsSimplifiedString)

    json_semanticSelectionDict_best = json.dumps(
        semanticSelectionDict_ordered_best,
        sort_keys=True,
        indent=3,
        ensure_ascii=False)

    # Context managers make sure the files are closed even if a write fails.
    with open(pathOutput + ".json", "w") as Results:
        Results.write(json_semanticSelectionDict)

    with open(pathOutput + "_best_" + str(selectBestThreshold) + ".json",
              "w") as ResultsBest:
        ResultsBest.write(json_semanticSelectionDict_best)

    with open(pathOutput + "_bestSimplified_" + str(selectBestThreshold) +
              ".txt", "w") as ResultsBestSimplified:
        ResultsBestSimplified.write(resultsSimplifiedString)

    return json_semanticSelectionDict, json_semanticSelectionDict_best, resultsSimplifiedString
Exemple #45
0
       esac
   else
       echo "'$1' is not a valid file!"
   fi
}
"""
import pprint
from gensim.test.utils import common_texts, get_tmpfile, datapath
from gensim.models import Word2Vec, KeyedVectors
import gensim.matutils

# Location of the pre-trained GoogleNews vectors. The second assignment
# overrides the first one, so only the "/Users/..." path is actually used.
binPath = "/root/GoogleNews-vectors-negative300.bin"
binPath = "/Users/liruqi/GoogleNews-vectors-negative300.bin"
print(binPath)
# Gensim can load word vectors in the “word2vec C format”, as a KeyedVectors instance:
wv_from_bin = KeyedVectors.load_word2vec_format(datapath(binPath), binary=True)
# Index the KeyedVectors instance directly: its `.wv` attribute is only a
# deprecated alias of the object itself in gensim 3.x and no longer exists in 4.0.
v1 = wv_from_bin['man']
v2 = wv_from_bin['woman']
pprint.pprint(v1)
pprint.pprint(v2)
pprint.pprint(wv_from_bin.similarity('man','woman'))
pprint.pprint(wv_from_bin.distance('man','woman'))
# 1 - n_similarity(...) for the two headlines, each split on single spaces.
pprint.pprint(1 - wv_from_bin.n_similarity(
    "National tragedy Trump begins border wall construction in Unesco reserve".split(" "),
    "Trump administration enters new phase for border wall sets ambitious timetable after securing land".split(" ")
    )
)

"""
# python3 page3.py
Traceback (most recent call last):
Exemple #46
0
 def __iter__(self):
     """Yield utils.simple_preprocess(line) for every line of 'lee_background.cor'."""
     corpus_path = datapath('lee_background.cor')
     with open(corpus_path) as corpus_file:
         for raw_line in corpus_file:
             tokens = utils.simple_preprocess(raw_line)
             yield tokens
Exemple #47
0
# action="append": every occurrence of the option adds one value to a list, so
# --dependency and --language may each be given several times.
# NOTE(review): `p` is presumably an argparse.ArgumentParser created above this
# excerpt -- confirm.
p.add_argument("--dependency", action="append")
p.add_argument("--language", action="append")
args = p.parse_args()
# Copy the parsed options into module-level names used by the rest of the script.
embeddings_folder = args.embeddings_folder
output_folder = args.output_folder
vaa_pairs_folder = args.vaa_pairs_folder
avv_pairs_folder = args.avv_pairs_folder
save_path = args.save_path
dependency = args.dependency
language = args.language

# Hard-coded settings.
# NOTE(review): presumably embedding dimensionality, batch size and number of
# training epochs -- their use is outside this excerpt, confirm.
dim = 300
batch = 32
epoch = 5
for lang in language:
    embeddings = KeyedVectors.load_word2vec_format(datapath(
        (embeddings_folder / "embeddings_{}".format(lang))),
                                                   binary=False)
    file0 = open((embeddings_folder / "embeddings_{}".format(lang)), "rb")
    vocabulary = []
    file0.readline()
    for line in file0:
        line = line.decode('utf-8').split()
        vocabulary.append(line[0])
    file0.close()
    model = nn.DotProductModel(embeddings, vocabulary)
    for dep in dependency:
        vaa_train_f = open(
            (vaa_pairs_folder /
             "{}/v_a1_a2_filtered_pairs_{}_{}.train".format(lang, dep, lang)),
            "rb")
        vaa_test_f = open(
# Build one token list per line of the input, then train, save and reload an
# LDA model on them. `file_handle` and `nlp` are defined outside this excerpt.
text = file_handle.read()
doc = nlp(text.lower())
texts, article = [], []
for w in doc:
    # Keep the lemma of every token that is not a newline, stop word,
    # punctuation or number-like.
    # NOTE(review): the text was lower-cased above, so the `!= 'I'` test
    # presumably never excludes anything -- confirm.
    if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I':
        article.append(w.lemma_)
    # A newline token closes the current article.
    # NOTE(review): tokens collected after the last newline are never appended
    # to `texts` -- confirm the input always ends with a newline.
    if w.text == '\n':
        texts.append(article)
        article = []
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
if len(corpus) == 0:
    # Nothing to train on: only an empty line is printed.
    print()
else:
    ldamodel = LdaModel(corpus=corpus, num_topics=50, id2word=dictionary)
    # NOTE(review): datapath() comes from gensim's test utilities, so "model"
    # presumably ends up inside gensim's test-data directory -- confirm this is
    # the intended location.
    temp_topic_file = datapath("model")
    ldamodel.save(temp_topic_file)
    ldamodel = LdaModel.load(temp_topic_file)
    # print(ldamodel)
    # for i in ldamodel.show_topics():
    #     print(i)

file = "sample.csv"
print("issue_id,topic")
file_handle = open(file, 'r')
file_handle.readline()
for line in file_handle:
    a, text = line.split("~", 1)
    text = text[:-1]
    text = ' '.join(text.split())
    text = text + "\n"
Exemple #49
0
 def test_load_model_supervised(self):
     """Loading 'pang_lee_polarity_fasttext' must raise NotImplementedError."""
     with self.assertRaises(NotImplementedError):
         supervised_model_path = datapath('pang_lee_polarity_fasttext')
         FT_gensim.load_fasttext_format(supervised_model_path)
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
# Train a small Word2Vec model to obtain word vectors.
# (The task allows either polyglot or word2vec embeddings to be used.)
path = get_tmpfile("word2vec.model")

# NOTE(review): the `size` keyword was renamed `vector_size` in gensim 4.0 --
# confirm which gensim version this script targets.
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
myvector = model.wv
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

filename = get_tmpfile("vectors.kv")
# The vectors must be written before they can be loaded back: nothing else in
# this script creates `filename`, so loading without this call fails.
myvector.save(filename)
myvector = KeyedVectors.load(filename, mmap='r')
from gensim.test.utils import datapath
wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'),
                                                 binary=False)

import gensim.downloader as api
# From here on `myvector` refers to the downloaded "glove-twitter-50" vectors,
# replacing the vectors trained above.
myvector = api.load("glove-twitter-50")

print("similar word for Dog :")
print(myvector.similar_by_word('dog', 1))
print("-------------------------")
print("similar word for Whale :")
print(myvector.similar_by_word('whale', 1))
print("-------------------------")
print("similar word for before :")
print(myvector.similar_by_word('before', 1))
print("-------------------------")
print("similar word for however :")
Exemple #51
0
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

# Inspect a trained `lda_model` (built outside the visible lines), round-trip
# it through disk and prepare a few unseen documents.
# NOTE(review): `num_topics` is assigned here but not used in the visible lines.
num_topics = 20
print("\n\n\n Total topics with word probabilities are: \n")
pprint(lda_model.print_topics())
# Apply the model to the whole corpus; `doc_lda` is not used again below.
doc_lda = lda_model[corpus]

# Show the model's output for the first document of the corpus only.
print("\n\n Percentage of a topic in a document \n\n")
print(lda_model[corpus[0]])
print("\n\n")

from gensim.test.utils import datapath

# NOTE(review): datapath() presumably resolves inside gensim's bundled
# test-data directory -- confirm that is the intended save location.
temp_file = datapath("model")
lda_model.save(temp_file)

# Reload the model that was just saved.
lda_model = gensim.models.ldamodel.LdaModel.load(temp_file)

# Three unseen "documents". Each inner list holds whole phrases, not single
# words.
# NOTE(review): doc2bow presumably treats every list element as one token,
# so these phrases would need tokenising first -- verify.
other_texts = [[
    'He do not cook well ', 'why does he annoy me so much',
    'this lockdown is getting on my nerve'
],
               [
                   'Good activities', 'i am really feeling very energetic',
                   'she snores so much'
               ], ['he is so nice and kind', 'system panic', 'nice person']]
print(other_texts)
# Convert the unseen documents with `id2word` (defined outside the visible lines).
other_corpus = [id2word.doc2bow(text) for text in other_texts]
Exemple #52
0
# 1.3 Obtain the word embeddings produced by training
# (`model` is created outside the visible lines.)
word2vector = model.wv  # KeyedVectors
vector = word2vector['computer']  # numpy vector of shape (100, )

# Save the vectors to a temporary file and load them back memory-mapped.
path = get_tmpfile("wordvectors.kv")
word2vector.save(path)
word2vector = KeyedVectors.load(path, mmap='r')

# 1.4 Automatically detect and train phrases (Phrases)
bigram_transformer = Phrases(common_texts)
# `model` is rebound to a Word2Vec trained on the phrase-transformed corpus.
model = Word2Vec(bigram_transformer[common_texts], min_count=1)

# 2 Working with word embeddings
# 2.1 Load ready-made word embeddings
word2vector1 = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'),
                                                 binary=False)  # C text format
word2vector2 = KeyedVectors.load_word2vec_format(
    datapath("euclidean_vectors.bin"), binary=True)  # C bin format

# 2.2 Convert a plain GloVe file into the word2vec format supported by Gensim,
#     i.e. the C text format
# Plain GloVe file format: no header; from the first line on, every line is a
# word followed by its vector, space-separated
# word2vec file format: the first line holds the number of vectors and the
# vector dimensionality; the other lines are the same as in the plain txt
# file, space-separated
glove_file = './data/normal_glove.txt'
word2vec_file = './data/normal_word2vec.txt'
glove2word2vec(glove_file, word2vec_file)
# `word2vector` is rebound to the vectors loaded from the converted file.
word2vector = KeyedVectors.load_word2vec_format(word2vec_file, binary=False)

# Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings.
# train_embeddings=False is passed, presumably to keep the weights frozen -- verify.
embedding_layer = word2vector.get_keras_embedding(train_embeddings=False)
Exemple #53
0
def glove2w2v(glove_path, w2v_path):
    """Convert a GloVe vector file into word2vec format.

    Args:
        glove_path: Name of the GloVe input file; resolved with ``datapath``.
        w2v_path: File name for the converted output; resolved with
            ``get_tmpfile``.

    Returns:
        The full path of the converted word2vec file.
    """
    source_path = datapath(glove_path)
    # get_tmpfile may return a different location on every call, so the
    # resolved path is returned; otherwise the caller could not find the output.
    target_path = get_tmpfile(w2v_path)
    glove2word2vec(source_path, target_path)
    return target_path
import json
import gensim
import markovify
from gensim.test.utils import datapath
from ModelMaker import process_body

# Script entry point: load the pickled dictionary, the chosen LDA model and
# one Markov model per LDA topic, then start processing the input file.
if __name__ == '__main__':
    dictionary = None
    model_choice = 10  # Can be either 5, 10, 15, 20
    markov_models = []

    # NOTE(review): `pickle` is not among the imports visible directly above;
    # confirm it is imported elsewhere in the file.
    with open('Raw Data/dictionary', 'rb') as fp:
        dictionary = pickle.load(fp)

    # The model file name is suffixed with the chosen model size.
    # NOTE(review): an absolute, machine-specific path is passed to datapath();
    # presumably the result is that same absolute path -- verify.
    fname = datapath(
        'C:/Users/User/Desktop/Love Advice Bot/Raw Data/ldaModel' +
        str(model_choice))
    lda_model = gensim.models.LdaModel.load(fname)
    # The first dimension of get_topics() is used as the number of topics.
    num_topics = lda_model.get_topics().shape[0]

    print('****LOADING MARKOV MODELS****\n')
    # One JSON file per topic: markov<model_choice>_<topic index>.json
    for x in range(num_topics):
        with open('Raw Data/MarkovModels/markov' + str(model_choice) + '_' +
                  str(x) + '.json') as fp:
            model_json = json.load(fp)
            # markov_models[x] is the model loaded for topic index x.
            markov_models.append(markovify.Text.from_json(model_json))

    print('****PROCESSING I-O/Input.txt ****\n')

    question = ""
    with open('I-O/Input.txt', encoding="utf8") as fp:
Exemple #55
0
class Recommendation:
    """Shared store of per-user preference state used to recommend articles.

    ``self.users`` maps a user id to a ``User`` object. The whole store is
    pickled to ``__path`` after every mutation and restored from there on
    start-up. Obtain the shared object through ``Recommendation.getInstance()``.
    """

    __instance = None  # shared object handed out by getInstance()
    __name = "Recommendation"
    __path = datapath(__name)  # file the pickled state is read from / written to
    __model = Model.getInstance()
    __alpha = 5.730e-7  # forwarded to User.get_preference_vector
    __min_val = 0.1  # forwarded to User.get_preference_vector
    __top_n = 5  # number of leading topics / words taken per article / topic

    @staticmethod
    def getInstance():
        """Return the shared instance, creating or un-pickling it on first use."""
        if Recommendation.__instance is None:
            Recommendation()
        return Recommendation.__instance

    def __init__(self):
        """Restore the pickled state, or rebuild it from the database.

        When the pickle loads successfully, the *loaded* object becomes the
        shared instance and ``self`` is left without ``users``; always go
        through ``getInstance()`` rather than using the constructed object.
        """
        try:
            with open(Recommendation.__path, 'rb') as file:
                # NOTE(review): pickle.load can execute arbitrary code; the
                # file at __path must only ever be written by this class.
                Recommendation.__instance = pickle.load(file)
        except Exception as e:
            # Deliberate best effort: any failure to restore the cache
            # (missing file, stale pickle, ...) falls back to a rebuild.
            print(e)
            conn = connect()
            self.users = {}
            if conn is not None:
                try:
                    cursor = conn.cursor()
                    try:
                        cursor.execute("SELECT id FROM users")
                        for row in cursor.fetchall():
                            self.users[row[0]] = User(
                                row[0], Recommendation.__model.dimensions)
                    finally:
                        cursor.close()
                finally:
                    conn.close()
            self._save()
            Recommendation.__instance = self

    def _save(self):
        """Pickle the current state to ``__path``; the file is closed even on error."""
        with open(Recommendation.__path, 'wb') as file:
            pickle.dump(self, file, pickle.HIGHEST_PROTOCOL)

    def add_user(self, id):
        """Register a fresh ``User`` under ``id`` and persist the store."""
        self.users[id] = User(id, Recommendation.__model.dimensions)
        self._save()

    def show_users(self):
        """Print every user id together with that user's ``spikes``."""
        for index, user in self.users.items():
            print(index, user.spikes)

    def recommend_articles(self, id):
        """Return the model's recommendation for user ``id``.

        The user's preference vector is computed for the current time
        (whole seconds since the epoch) and passed to the model.
        """
        curr_time = int(round(time.time()))
        return Recommendation.__model.getReccommendation(
            self.users[id].get_preference_vector(curr_time,
                                                 Recommendation.__alpha,
                                                 Recommendation.__min_val))

    def read_articles(self, ids, uid):
        """Record that user ``uid`` has read the articles ``ids``.

        For each article the content is fetched, turned into a vector by the
        model, and the first elements of its (up to) five highest-scoring
        entries are collected; the combined set is passed to the user's
        ``update_spike`` and the store is persisted.

        Returns:
            True when a database connection was available, False otherwise.
        """
        conn = connect()
        if conn is None:
            return False
        indices_set = set()
        # Release the cursor and connection even if a query fails, matching
        # what __init__ does with its connection.
        try:
            cursor = conn.cursor()
            try:
                for article_id in ids:
                    # NOTE(review): the query is built by string
                    # concatenation. If ``ids`` can carry untrusted input,
                    # switch to the driver's parameter placeholders (the
                    # paramstyle is not visible from here).
                    cursor.execute("SELECT content from articles where id=" +
                                   str(article_id))
                    rows = cursor.fetchall()
                    l = Recommendation.__model.getVector(rows[0][0])
                    l.sort(key=lambda x: x[1], reverse=True)
                    # Slice instead of indexing 0..4 so that a vector with
                    # fewer than five entries does not raise IndexError.
                    for entry in l[:Recommendation.__top_n]:
                        indices_set.add(entry[0])
            finally:
                cursor.close()
        finally:
            conn.close()
        self.users[uid].update_spike(indices_set)
        self._save()
        return True

    def initialize_vec(self, tags, id):
        """Seed user ``id`` with the topics whose top words contain one of ``tags``.

        The matching topic indices are passed to the user's ``update_spike``
        and the store is persisted.
        """
        # Each entry is read as entry[1][j][0] to obtain a word; presumably
        # the shape is (topic_id, [(word, weight), ...]) -- verify.
        topics = Recommendation.__model.model.show_topics(
            num_topics=50,
            num_words=Recommendation.__top_n,
            log=False,
            formatted=False)
        top_words = {}
        for i in range(Recommendation.__model.dimensions):
            top_words[i] = {
                entry[0] for entry in topics[i][1][:Recommendation.__top_n]
            }
        ans = set()
        for tag in tags:
            ans.update(k for k, words in top_words.items() if tag in words)
        self.users[id].update_spike(ans)
        self._save()
Exemple #56
0
 def test_open_file_existent_file_object(self):
     """utils.open_file must accept an already-open file object and let the
     caller iterate over all of its lines."""
     number_of_lines_in_file = 30
     # The test owns the handle it passes in, so it closes it through the
     # outer ``with``; a second close would be harmless if open_file also closed it.
     with open(datapath('testcorpus.mm')) as file_obj:
         with utils.open_file(file_obj) as infile:
             self.assertEqual(sum(1 for _ in infile), number_of_lines_in_file)
# H*!

from flask import Flask, request
from flask_cors import CORS, cross_origin
from flask_restful import Resource, Api
from json import dumps
from flask_jsonpify import jsonify

# Create the Flask application and wrap it in a flask_restful Api.
app = Flask(__name__)
api = Api(app)

import time
# t0/t1 bracket the fastText load so its duration can be reported below.
t0 = time.time()

from gensim.test.utils import datapath
# NOTE(review): an absolute, machine-specific path is passed to datapath();
# presumably the result is that same absolute path -- verify.
cap_path = datapath("/home/haoran/cc.en.300.bin")
from gensim.models.wrappers import FastText
# `wv` is the module-level model that the helper functions below query.
wv = FastText.load_fasttext_format(cap_path)

t1 = time.time()
# Apply flask_cors to the whole app.
CORS(app)

print('app started')
print('time to load:', t1 - t0)


def most_similar(word):
    """Return what the module-level fastText model ``wv`` reports as most
    similar to ``word``."""
    neighbours = wv.most_similar(word)
    return neighbours


def similarity(word1, word2):
Exemple #58
0
 def setUp(self):
     """Load the fixture corpus and build the HdpModel under test with
     NumPy's global random generator seeded to 0."""
     self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
     self.class_ = hdpmodel.HdpModel
     # np.random.seed() returns None, so the model is given random_state=None;
     # the call matters only for its side effect of seeding the global generator.
     np.random.seed(0)
     # The model is built from the module-level `corpus`, not from self.corpus.
     self.model = self.class_(corpus, id2word=dictionary, random_state=None)
Exemple #59
0
    bleu_np = np.asarray(BLEU_scores)
    jacc_np = np.asarray(jacc_sim)
    cos_np = np.asarray(cos_sim)
    fscore_np = np.asarray(fscore)

    mean_jaccard.append( np.mean(jacc_np) )
    mean_bleu.append( np.mean(bleu_np) )
    mean_cos.append( np.mean(cos_np) )
    mean_fscore.append( np.mean(fscore_np) )
  return np.max( np.asarray(mean_bleu) ), np.max( np.asarray(mean_jaccard) ), np.max( np.asarray(mean_cos) ), np.max( np.asarray(mean_fscore) )

# Load the annotation sets; load_annotations() is defined outside the visible lines.
GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations()
# NOTE(review): absolute, machine-specific data directory.
path = '/home/norberteke/PycharmProjects/Thesis/data/'

dictionary = Dictionary.load(path + 'SO_full_processed_Dictionary.dict')
# NOTE(review): an absolute path is passed to datapath(); presumably the
# result is that same absolute path -- verify.
corpus = MmCorpus(datapath(path + 'corpus_processed_SO_full.mm'))

# `texts` becomes a list of CSV rows, each row a list of strings.
texts = []
with open(path + 'new_SO_full_processed_corpus.csv', 'r') as f:
    reader = csv.reader(f)
    texts = list(reader)


# Collect the value of every (key, value) pair of the dictionary
# (presumably the token strings -- verify).
terms = []
for (key, value) in dictionary.iteritems():
  terms.append(value)

def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos, max_fscore):
  """Append one CSV row with a model's settings and its best scores.

  Args:
    path: CSV file to append to; created if it does not exist.
    lda_model: Object exposing ``num_topics`` and ``eta``.
    max_bleu, max_jaccard, max_cos, max_fscore: Scores to record.

  The row is: num_topics, eta, max_bleu, max_jaccard, max_cos, max_fscore,
  each converted with ``str``.
  """
  row = [str(lda_model.num_topics), str(lda_model.eta), str(max_bleu),
         str(max_jaccard), str(max_cos), str(max_fscore)]
  # newline='' leaves line endings to the csv module; without it, platforms
  # that translate '\n' on write would emit an extra '\r' per row.
  with open(path, 'a', newline='') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(row)
Exemple #60
0
 def test_save_load_no_scoring(self):
     """Load a FrozenPhrases object that was saved without a scoring parameter.

     Guards backwards compatibility with older versions of FrozenPhrases.
     """
     legacy_pickle = datapath("phraser-no-scoring.pkl")
     restored = FrozenPhrases.load(legacy_pickle)
     # Scoring is not exercised further; only check it is the expected scorer.
     self.assertEqual(restored.scoring, original_scorer)