Ejemplo n.º 1
0
def get_jensen_shannon(components, ntopics):
    """Summarise the pairwise Jensen-Shannon distances between topic rows.

    Every unordered pair of distinct rows among the first ``ntopics`` rows of
    ``components`` is passed once to ``jensen_shannon``.

    Args:
        components: Object supporting ``components[i, :]`` row indexing
            (presumably a 2-D topic-by-term array -- confirm with the caller).
        ntopics: Number of leading rows of ``components`` to compare.

    Returns:
        A ``(minimum, mean)`` tuple computed over all pairwise distances.
    """
    # Row `hi` is paired with each earlier row `lo` (strict lower triangle),
    # so every pair is evaluated exactly once and never against itself.
    pairwise = [
        jensen_shannon(components[hi, :], components[lo, :])
        for hi in range(ntopics)
        for lo in range(hi)
    ]
    return np.min(pairwise), np.mean(pairwise)
Ejemplo n.º 2
0
    def testNewAuthorTopics(self):
        """Check ``get_new_author_topics`` and that it leaves the model unchanged.

        Builds a model through ``self.class_``, adds an author named ``"test"``
        for the first two documents, and then verifies that:

        * ``get_new_author_topics`` yields pairs of ``(int, float)``;
        * the result is close to ``model["test"]`` (similarity >= 0.9, where
          similarity is ``1 / (1 + jensen_shannon(...))``);
        * a call that raises ``TypeError`` leaves the sizes of the model's
          bookkeeping containers as they were before the first call.
        """
        # NOTE(review): np.random.seed(0) returns None, so the model actually
        # receives random_state=None; the call only seeds NumPy's global RNG
        # as a side effect. Left untouched because the similarity threshold
        # below may depend on that seeding.
        model = self.class_(corpus,
                            author2doc=author2doc,
                            id2word=dictionary,
                            num_topics=2,
                            passes=100,
                            random_state=np.random.seed(0))
        author2doc_newauthor = {"test": [0, 1]}
        model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor)

        # temp save model state vars before get_new_author_topics is called
        state_gamma_len = len(model.state.gamma)
        author2doc_len = len(model.author2doc)
        author2id_len = len(model.author2id)
        id2author_len = len(model.id2author)
        doc2author_len = len(model.doc2author)

        new_author_topics = model.get_new_author_topics(corpus=corpus[0:2])

        # sanity check: dedicated assertions report the offending value and
        # type on failure instead of a bare "False is not true"
        for idx, val in new_author_topics:
            self.assertIsInstance(idx, int)
            self.assertIsInstance(val, float)

        # make sure topics are similar enough
        similarity = 1 / (1 + jensen_shannon(model["test"], new_author_topics))
        self.assertGreaterEqual(similarity, 0.9)

        # produce an error to test if rollback occurs
        with self.assertRaises(TypeError):
            model.get_new_author_topics(corpus=corpus[0])

        # assure rollback was successful and the model state is as before
        self.assertEqual(state_gamma_len, len(model.state.gamma))
        self.assertEqual(author2doc_len, len(model.author2doc))
        self.assertEqual(author2id_len, len(model.author2id))
        self.assertEqual(id2author_len, len(model.id2author))
        self.assertEqual(doc2author_len, len(model.doc2author))
Ejemplo n.º 3
0
    def testNewAuthorTopics(self):
        """Check ``get_new_author_topics`` and that it leaves the model unchanged.

        Builds a model through ``self.class_``, adds an author named ``"test"``
        for the first two documents, and then verifies that:

        * ``get_new_author_topics`` yields pairs of ``(int, float)``;
        * the result is close to ``model["test"]`` (similarity >= 0.9, where
          similarity is ``1 / (1 + jensen_shannon(...))``);
        * a call that raises ``TypeError`` leaves the sizes of the model's
          bookkeeping containers as they were before the first call.
        """
        # NOTE(review): np.random.seed(0) returns None, so the model actually
        # receives random_state=None; the call only seeds NumPy's global RNG
        # as a side effect. Left untouched because the similarity threshold
        # below may depend on that seeding.
        model = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
            passes=100, random_state=np.random.seed(0)
        )
        author2doc_newauthor = {"test": [0, 1]}
        model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor)

        # temp save model state vars before get_new_author_topics is called
        state_gamma_len = len(model.state.gamma)
        author2doc_len = len(model.author2doc)
        author2id_len = len(model.author2id)
        id2author_len = len(model.id2author)
        doc2author_len = len(model.doc2author)

        new_author_topics = model.get_new_author_topics(corpus=corpus[0:2])

        # sanity check: dedicated assertions report the offending value and
        # type on failure instead of a bare "False is not true"
        for idx, val in new_author_topics:
            self.assertIsInstance(idx, int)
            self.assertIsInstance(val, float)

        # make sure topics are similar enough
        similarity = 1 / (1 + jensen_shannon(model["test"], new_author_topics))
        self.assertGreaterEqual(similarity, 0.9)

        # produce an error to test if rollback occurs
        with self.assertRaises(TypeError):
            model.get_new_author_topics(corpus=corpus[0])

        # assure rollback was successful and the model state is as before
        self.assertEqual(state_gamma_len, len(model.state.gamma))
        self.assertEqual(author2doc_len, len(model.author2doc))
        self.assertEqual(author2id_len, len(model.author2id))
        self.assertEqual(id2author_len, len(model.id2author))
        self.assertEqual(doc2author_len, len(model.doc2author))
Ejemplo n.º 4
0
# Script fragment: for every title from index 34 onwards, rank the other
# entries of `lda_bow` by Jensen-Shannon distance and append one
# (title, recommendation-list) row to movie_recommendation.csv.
# `titles`, `jensen_shannon`, `pickle`, `pd` and `time` come from outside
# this fragment.

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load movie_ldabow.txt if it comes from a trusted source.
with open("movie_ldabow.txt", "rb") as fp:
    lda_bow = pickle.load(fp)

# Accumulates one (title, rec) row per processed movie.
df_res = pd.DataFrame({"title": [],
                       "rec": []})

# Accumulates (title, dist) rows for the candidates compared so far.
df_title = pd.DataFrame({"title": [],
                         "dist": []})

for i in range(34, len(titles)):
    # NOTE(review): `.empty` is a read-only property, so this statement has
    # no effect. df_title is never cleared and keeps the rows gathered for
    # earlier values of i -- presumably a per-movie reset was intended;
    # confirm before relying on the output.
    df_title.empty
    df_curr = pd.DataFrame({"title": [titles[i]]})
    # Not read anywhere in the visible part of the script.
    start = time.time()
    for j in range(len(lda_bow)):
        if j != i:
            # Third argument 130 is presumably the feature/topic count
            # expected by jensen_shannon -- verify against its signature.
            dst = jensen_shannon(lda_bow[i], lda_bow[j], 130)
            df_individual = pd.DataFrame({"title": [titles[j]],
                                          "dist": [dst]})
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # this line (and the df_res.append below) needs pd.concat there.
            df_title = df_title.append(df_individual, ignore_index=True)

    # Ascending sort puts the smallest distances first.
    df_title = df_title.sort_values(by=['dist'])
    # Wrap the first 50 sorted titles in a one-cell frame, then unwrap the
    # cell into a plain list and rebuild the frame around that list.
    df_rec = pd.DataFrame({"rec": [df_title['title'][0:50]]})
    rec = df_rec["rec"][0].tolist()
    df_rec = pd.DataFrame({"rec": [rec]})
    # Both frames hold a single row, so the index join yields one row with
    # the columns `title` and `rec`.
    df_ind_res = pd.merge(df_curr, df_rec, left_index=True, right_index=True, how='inner')
    df_res = df_res.append(df_ind_res, ignore_index=True)
    # NOTE(review): i starts at 34, so the `i == 0` branch (the only one
    # that writes a header row) never runs in this loop; every row is
    # appended to whatever movie_recommendation.csv already contains.
    if i == 0:
        df_ind_res.to_csv('movie_recommendation.csv', index=False)
    else:
        df_ind_res.to_csv('movie_recommendation.csv', mode='a', index=False, header=False)
    if i == 99:  # remove this or change this as requirement | Here i is number of movies in CSV
Ejemplo n.º 5
0
def js_dist(X):
    """Compute pairwise ``jensen_shannon`` values between the rows of ``X``.

    Args:
        X: Collection of observations handed straight to ``pdist``
            (presumably an m-by-n array with one distribution per row --
            confirm with the caller).

    Returns:
        Whatever ``pdist`` returns when given ``X`` and a callable metric.
    """
    def _row_distance(first, second):
        # Thin adapter so pdist sees a plain two-argument metric.
        return jensen_shannon(first, second)

    return pdist(X, _row_distance)
Ejemplo n.º 6
0
                  "-->'1' to get a random movie and it's recommendation\n" +
                  "-->'2' to see some of the titles in the dataset\n" +
                  "-->'3' or 'STOP' to end sequence\n" +
                  "-->Title copied from dataset\n" + "Input: ")

    df_res = pd.DataFrame({"title": [], "rec": []})

    df_title = pd.DataFrame({"title": [], "dist": []})

    if check == '1':
        num = random.randint(0, i - 25)
        print("Getting recommendations. Please wait\n")
        df_curr = pd.DataFrame({"title": [titles[num]]})
        for j in range(len(lda_bow)):
            if j != num:
                dst = jensen_shannon(lda_bow[num], lda_bow[j], 130)
                df_individual = pd.DataFrame({
                    "title": [titles[j]],
                    "dist": [dst]
                })
                df_title = df_title.append(df_individual, ignore_index=True)

        df_title = df_title.sort_values(by=['dist'])
        df_rec = pd.DataFrame({"rec": [df_title['title'][0:50]]})
        rec = df_rec["rec"][0].tolist()
        title = titles[num]
        rec = rec[0:10]
        print("Movie: %s\nTop 10 Recommendations: %s" % (title, rec))

    elif check == '2':
        start = random.randint(0, i - 20)