def receive_feedbacks(self, session, feedbacks):
    """
    Push one batch of user feedback through the propagator, then run the
    updater for the session.

    `feedbacks` is a dict; each of its entries is optional:
    {
      "docs": [[doc_id, feedback_value], ...],
      "kws": [[keyword_id, feedback_value], ...],
      "dockws": [[keyword_id, doc_id, feedback_value], ...]
    }
    """
    print("propagation started...")

    # feedback given on whole documents
    for doc_id, fb in feedbacks.get("docs", []):
        target_doc = Document.get(doc_id)
        self.ppgt.fb_from_doc(target_doc, fb, session)

    # feedback given on keywords
    for kw_id, fb in feedbacks.get("kws", []):
        target_kw = Keyword.get(kw_id)
        self.ppgt.fb_from_kw(target_kw, fb, session)

    # feedback given on a keyword as it appears inside a document
    for kw_id, doc_id, fb in feedbacks.get("dockws", []):
        target_doc = Document.get(doc_id)
        target_kw = Keyword.get(kw_id)
        self.ppgt.fb_from_dockw(target_kw, target_doc, fb, session)

    # every entry has been handed to the propagator; now let the updater
    # process the session
    self.upd.update(session)

    print("propagation finished")
def test_rec_fb_from_dockw(self):
    """
    getter/setting for receiving feedback from in-document keyword

    Records in-document keyword feedback on document 1, checks that it is
    readable through fb_from_kw, checks that invalid (keyword, document)
    combinations raise AssertionError, and checks the weighted sum.
    """
    doc = Document.get(1)

    doc.rec_fb_from_dockw(Keyword.get("redis"), doc, 1, self.session)
    doc.rec_fb_from_dockw(Keyword.get("database"), doc, .5, self.session)

    self.assertEqual(doc.fb_from_kw(self.session),
                     {Keyword.get("redis"): 1,
                      Keyword.get("database"): .5})

    # The negative cases use the same (keyword, document, feedback, session)
    # argument order as the successful calls above.  With the arguments
    # swapped, a Document would be passed where the keyword is expected, so
    # the scenario described in the comment would not be the one exercised.

    # not the right document: the feedback names document 2, receiver is 1
    self.assertRaises(AssertionError, doc.rec_fb_from_dockw,
                      Keyword.get("redis"), Document.get(2), 1, self.session)

    # python is not a keyword for document#1, error should be raised
    self.assertRaises(AssertionError, doc.rec_fb_from_dockw,
                      Keyword.get("python"), doc, 1, self.session)

    # test the weighted sum; the third weight receives no feedback and so
    # only contributes to the denominator
    weights = [
        0.62981539329519109,
        0.45460437826405437,
        0.62981539329519109,
    ]
    self.assertEqual((weights[0] * 1 + weights[1] * .5) / sum(weights),
                     doc.fb_weighted_sum(self.session))
def test_similarity(self):
    """
    Pairwise similarity: reference values, symmetry, and rejection of an
    unknown metric name, for documents and then for keywords.
    """
    # --- documents ---
    doc_a = Document.get(1)
    doc_b = Document.get(2)
    doc_c = Document.get(3)

    self.assertAlmostEqual(0.6300877890447911, doc_a.similarity_to(doc_b))
    self.assertAlmostEqual(doc_b.similarity_to(doc_a),
                           doc_a.similarity_to(doc_b))
    self.assertAlmostEqual(0.31713642199844894, doc_a.similarity_to(doc_c))

    self.assertRaises(NotImplementedError,
                      doc_a.similarity_to, doc_c, "not implemented metric")

    # --- keywords ---
    kw_a = Keyword.get("redis")
    kw_b = Keyword.get("database")
    kw_c = Keyword.get("python")

    self.assertAlmostEqual(0.6698544675330306, kw_a.similarity_to(kw_b))
    self.assertAlmostEqual(kw_b.similarity_to(kw_a),
                           kw_a.similarity_to(kw_b))
    self.assertAlmostEqual(0.2613424459663648, kw_a.similarity_to(kw_c))

    self.assertRaises(NotImplementedError,
                      kw_a.similarity_to, kw_c, "not implemented metric")
def setUp(self):
    """
    Build the recommender, seed a session with feedback, and store on
    self.fmim a FeatureMatrixAndIndexMapping built from the filtered
    keywords and documents.
    """
    kw_filters = [self.my_kw_filter]
    doc_filters = [self.kw_count_filter, self.has_database_filter]

    # the default configuration; note that no keyword filter is given to
    # the recommender itself
    self.r = LinRelRecommender(
        2, 2,
        1.0, 0.1, 1.0, 0.1,
        kw_filters=None,
        doc_filters=[self.kw_count_filter, self.has_database_filter],
        **fmim.__dict__)

    self.session = get_session()

    # seed feedback: two keywords, three documents
    self.session.update_kw_feedback(Keyword.get("redis"), .7)
    self.session.update_kw_feedback(Keyword.get("database"), .6)

    self.session.update_doc_feedback(Document.get(1), .7)
    self.session.update_doc_feedback(Document.get(2), .7)
    self.session.update_doc_feedback(Document.get(8), .7)

    # apply the filters declared at the top of this method
    kept_kws = self.r._filter_objs(kw_filters, kws=Keyword.all_kws)
    kept_docs = self.r._filter_objs(doc_filters, docs=Document.all_docs)

    # keyword-by-document sub-matrix with its index maps
    kw_side = self.r._submatrix_and_indexing(
        kept_kws, kept_docs, fmim.kw2doc_m, fmim.kw_ind, fmim.doc_ind)
    kw2doc_submat, kw_ind_map, kw_ind_map_r = kw_side

    # document-by-keyword sub-matrix with its index maps
    doc_side = self.r._submatrix_and_indexing(
        kept_docs, kept_kws, fmim.doc2kw_m, fmim.doc_ind, fmim.kw_ind)
    doc2kw_submat, doc_ind_map, doc_ind_map_r = doc_side

    self.fmim = FeatureMatrixAndIndexMapping(
        kw_ind_map, doc_ind_map,
        kw2doc_submat, doc2kw_submat,
        kw_ind_map_r, doc_ind_map_r)
def test_doc_fb_threshold_filter_with_prefiltering(self):
    """
    With with_fb=True and a 0.1 threshold, the filter is expected to return
    only document 1 (feedback .2); document 2 is set just below the
    threshold.
    """
    # overwrite the feedback of documents 1 and 2
    session = self.session
    session.update_doc_feedback(Document.get(1), .2)
    session.update_doc_feedback(Document.get(2), .0999999)

    filtered = doc_fb_threshold_filter(0.1, session, with_fb=True)

    self.assertEqual(Document.get_many([1]), filtered)
def test_doc_fb_threshold_filter(self):
    """
    With an explicit candidate list (all documents), with_fb=False and a
    0.1 threshold, the filter is expected to return only document 1.
    """
    # overwrite the feedback of documents 1 and 2
    session = self.session
    session.update_doc_feedback(Document.get(1), .2)
    session.update_doc_feedback(Document.get(2), .0999999)

    filtered = doc_fb_threshold_filter(0.1, session,
                                       docs=Document.all_docs,
                                       with_fb=False)

    self.assertEqual(Document.get_many([1]), filtered)
def test_affected_docs(self):
    """
    Documents registered through add_affected_docs are reported back by
    session.affected_docs, both for the initial batch and after a later
    addition.
    """
    docs = [Document.get(1), Document.get(2)]
    self.session.add_affected_docs(*docs)

    # Compare as sets, like the second assertion below: nothing in this
    # test establishes the order in which affected_docs is returned
    # (NOTE(review): it appears to be backed by a set -- confirm), so an
    # ordered list comparison could fail spuriously.
    self.assertEqual(set(docs), set(self.session.affected_docs))

    # adding one more document extends the collection
    doc3 = Document.get(3)
    docs.append(doc3)
    self.session.add_affected_docs(doc3)

    self.assertEqual(set(docs), set(self.session.affected_docs))
def test_fb_weighted_sum_dockw_only(self):
    """
    Weighted sum of a keyword when feedback only comes from the
    dockw/doc side: expected to be the plain mean of the two values.
    """
    redis_kw = Keyword.get("redis")

    redis_kw.rec_fb_from_dockw(redis_kw, Document.get(1), 1, self.session)
    redis_kw.rec_fb_from_doc(Document.get(2), .5, self.session)

    expected = (1 + .5) / 2
    self.assertEqual(expected, redis_kw.fb_weighted_sum(self.session))
def setUp(self):
    """
    Create an unfiltered LinRelRecommender and a session pre-loaded with
    keyword and document feedback.
    """
    # no keyword filters, no document filters
    self.r = LinRelRecommender(2, 2,
                               1., .1, 1., .1,
                               None, None,
                               **fmim.__dict__)

    self.session = get_session()

    # giving the feedbacks: keywords first ...
    self.session.update_kw_feedback(Keyword.get("redis"), .7)
    self.session.update_kw_feedback(Keyword.get("database"), .6)

    # ... then documents
    self.session.update_doc_feedback(Document.get(1), .7)
    self.session.update_doc_feedback(Document.get(2), .7)
    self.session.update_doc_feedback(Document.get(8), .7)
def test_document_centroid(self):
    """
    Centroid of a document list: equal to the document's own vector for a
    one-element list, and to the mean of the two vectors for a pair.
    """
    # one-element list
    only_doc = Document.get(1)
    singleton = DocumentList([only_doc])
    self.assertArrayAlmostEqual(matrix2array(singleton.centroid),
                                only_doc.vec.toarray()[0])

    # two-element list
    first = Document.get(1)
    second = Document.get(2)
    pair = Document.get_many([1, 2])

    actual_centroid = matrix2array(pair.centroid)
    mean_vector = (first.vec.toarray()[0] + second.vec.toarray()[0]) / 2
    self.assertArrayAlmostEqual(actual_centroid, mean_vector)
def test_rec_fb_from_dockw(self):
    """
    Keyword side of in-document keyword feedback: values recorded through
    rec_fb_from_dockw are readable via fb_from_doc, and feedback naming a
    different keyword raises AssertionError.
    """
    redis_kw = Keyword.get("redis")

    redis_kw.rec_fb_from_dockw(redis_kw, Document.get(2), .5, self.session)
    redis_kw.rec_fb_from_dockw(redis_kw, Document.get(1), 1, self.session)

    stored = redis_kw.fb_from_doc(self.session)
    self.assertEqual(stored, {Document.get(1): 1, Document.get(2): .5})

    # the feedback is about keyword "the", not about the receiving keyword
    self.assertRaises(AssertionError,
                      redis_kw.rec_fb_from_dockw,
                      Keyword.get("the"), Document.get(1), 1, self.session)
def test_rec_fb_from_doc(self):
    """
    Keyword receiving feedback from documents: recorded values are
    readable via fb_from_doc, and feedback from document 3 raises
    AssertionError.
    """
    redis_kw = Keyword.get("redis")

    redis_kw.rec_fb_from_doc(Document.get(1), 1, self.session)
    redis_kw.rec_fb_from_doc(Document.get(2), .5, self.session)

    stored = redis_kw.fb_from_doc(self.session)
    self.assertEqual(stored, {Document.get(1): 1, Document.get(2): .5})

    # document 3 does not contain redis, error should be raised
    self.assertRaises(AssertionError,
                      redis_kw.rec_fb_from_doc,
                      Document.get(3), 1, self.session)
def test_fb_weighted_sum_mixed_source(self):
    """
    Weighted sum of a keyword when feedback arrives from all three
    sources (dockw, doc and the keyword itself).
    """
    redis_kw = Keyword.get("redis")

    redis_kw.rec_fb_from_dockw(redis_kw, Document.get(1), 1, self.session)
    redis_kw.rec_fb_from_doc(Document.get(2), .5, self.session)
    redis_kw.rec_fb_from_kw(redis_kw, .5, self.session)

    expected = .3 * (1 / 2. + 1 / 4.) + .7 * .5
    self.assertEqual(expected, redis_kw.fb_weighted_sum(self.session))
def test_document_centroid(self):
    """
    Check DocumentList.centroid against hand-computed vectors for a list
    of one document and for a list of two documents.
    """
    # list holding a single document: centroid is that document's vector
    lone = Document.get(1)
    lone_list = DocumentList([lone])
    self.assertArrayAlmostEqual(matrix2array(lone_list.centroid),
                                lone.vec.toarray()[0])

    # list holding documents 1 and 2: centroid is the average vector
    doc_one = Document.get(1)
    doc_two = Document.get(2)
    both = Document.get_many([1, 2])

    centroid = matrix2array(both.centroid)
    average = (doc_one.vec.toarray()[0] + doc_two.vec.toarray()[0]) / 2
    self.assertArrayAlmostEqual(centroid, average)
def test_loop_done(self):
    """
    After loop_done, a keyword that had feedback from all three sources
    reports a weighted sum of 0.
    """
    redis_kw = Keyword.get("redis")

    redis_kw.rec_fb_from_dockw(redis_kw, Document.get(1), 1, self.session)
    redis_kw.rec_fb_from_doc(Document.get(2), .5, self.session)
    redis_kw.rec_fb_from_kw(redis_kw, .5, self.session)

    # terminate the loop; the recorded feedback is expected to be cleaned
    redis_kw.loop_done(self.session)

    remaining = redis_kw.fb_weighted_sum(self.session)
    self.assertEqual(remaining, 0)
def test_update_doc_fb(self):
    """
    A document feedback written through the session shows up both in
    session.doc_feedbacks and through the document's own fb().
    """
    target = Document.get(1)
    self.session.update_doc_feedback(target, 1)

    expected_store = {target: 1}
    self.assertEqual(self.session.doc_feedbacks, expected_store)
    self.assertEqual(target.fb(self.session), 1)
def test_rec_fb_from_doc(self):
    """
    Record document feedback on keyword "redis", read it back, and check
    that feedback from document 3 is rejected with AssertionError.
    """
    kw = Keyword.get("redis")
    session = self.session

    for doc_id, value in ((1, 1), (2, .5)):
        kw.rec_fb_from_doc(Document.get(doc_id), value, session)

    expected = {
        Document.get(1): 1,
        Document.get(2): .5,
    }
    self.assertEqual(kw.fb_from_doc(session), expected)

    # document 3 does not contain redis, error should be raised
    self.assertRaises(AssertionError, kw.rec_fb_from_doc,
                      Document.get(3), 1, session)
def test_rec_from_doc(self):
    """
    Document receiving feedback about itself: the latest value replaces
    the previous one, feedback about another document raises
    AssertionError, and the weighted sum is checked at the end.
    """
    doc = Document.get(1)
    session = self.session

    # first value
    doc.rec_fb_from_doc(doc, 1, session)
    self.assertEqual(1, doc.fb_from_doc(session))

    # second value overrides the first
    doc.rec_fb_from_doc(doc, .5, session)
    self.assertEqual(.5, doc.fb_from_doc(session))

    # feedback that names document 2 is not for this document
    self.assertRaises(AssertionError, doc.rec_fb_from_doc,
                      Document.get(2), 1, session)

    # test the weighted sum
    expected_sum = .5 * .7
    self.assertEqual(expected_sum, doc.fb_weighted_sum(session))
def test_rec_fb_from_dockw(self):
    """
    Record in-document keyword feedback on keyword "redis" for documents
    2 and 1, read it back via fb_from_doc, and check that feedback naming
    another keyword is rejected with AssertionError.
    """
    kw = Keyword.get("redis")
    session = self.session

    for doc_id, value in ((2, .5), (1, 1)):
        kw.rec_fb_from_dockw(kw, Document.get(doc_id), value, session)

    expected = {
        Document.get(1): 1,
        Document.get(2): .5,
    }
    self.assertEqual(kw.fb_from_doc(session), expected)

    # "the" is not the receiving keyword
    self.assertRaises(AssertionError, kw.rec_fb_from_dockw,
                      Keyword.get("the"), Document.get(1), 1, session)
def test_fb_from_dockw(self):
    """
    Propagate a single in-document keyword feedback (redis in document 1)
    and check the resulting document and keyword feedback values.
    """
    redis_kw = Keyword.get("redis")
    first_doc = Document.get(1)

    ppgt.fb_from_dockw(redis_kw, first_doc, .5, self.session)
    upd.update(self.session)

    self.assertAlmostEqual(0.183701573217, first_doc.fb(self.session))
    self.assertAlmostEqual(1/4., redis_kw.fb(self.session))
def test_kw_fb_filter(self):
    """
    The "kw_fb" filter is expected to keep keyword "redis" once its
    weighted feedback has been stored in the session.
    """
    redis_kw = Keyword.get("redis")
    redis_kw.rec_fb_from_doc(Document.get(1), 1, self.session)

    self.session.add_doc_recom_list(Document.get_many([1, 2, 6]))

    weighted = redis_kw.fb_weighted_sum(self.session)
    self.session.update_kw_feedback(redis_kw, weighted)

    kw_fb_filter = FilterRepository.filters["kw_fb"]
    kept = kw_fb_filter([redis_kw])

    self.assertEqual(Keyword.get_many(["redis"]), kept)
def test_doc_fb_filter(self):
    """
    The "doc_fb" filter is expected to return an empty document list for
    document 1 after its weighted feedback has been stored in the session.
    """
    first_doc = Document.get(1)
    first_doc.rec_fb_from_kw(Keyword.get("redis"), 1, self.session)

    weighted = first_doc.fb_weighted_sum(self.session)
    self.session.update_doc_feedback(first_doc, weighted)

    # debugging output; prints the label, a space, then the value
    print("doc.fb(self.session)= %s" % (first_doc.fb(self.session),))

    doc_fb_filter = FilterRepository.filters["doc_fb"]
    actual = doc_fb_filter([first_doc])
    expected = Document.get_many([])

    # debugging output
    print(first_doc.fb(self.session))

    self.assertEqual(expected, actual)
def test_rec_fb_from_dockw(self):
    """
    getter/setting for receiving feedback from in-document keyword

    Stores two in-document keyword feedbacks on document 1, reads them
    back through fb_from_kw, verifies that mismatching keyword/document
    pairs raise AssertionError, and verifies the weighted sum.
    """
    doc = Document.get(1)
    redis = Keyword.get("redis")
    database = Keyword.get("database")

    doc.rec_fb_from_dockw(redis, doc, 1, self.session)
    doc.rec_fb_from_dockw(database, doc, .5, self.session)

    self.assertEqual(doc.fb_from_kw(self.session),
                     {Keyword.get("redis"): 1, Keyword.get("database"): .5})

    # rec_fb_from_dockw is called as (keyword, document, feedback, session)
    # above; the failure cases keep that order so each one exercises the
    # condition named in its comment rather than a swapped-argument call.

    # not the right document
    self.assertRaises(AssertionError, doc.rec_fb_from_dockw,
                      Keyword.get("redis"), Document.get(2), 1, self.session)

    # python is not a keyword for document#1, error should be raised
    self.assertRaises(AssertionError, doc.rec_fb_from_dockw,
                      Keyword.get("python"), doc, 1, self.session)

    # test the weighted sum; the last weight has no feedback attached and
    # therefore only appears in the normalising sum
    weights = [0.62981539329519109, 0.45460437826405437, 0.62981539329519109]
    self.assertEqual((weights[0] * 1 + weights[1] * .5) / sum(weights),
                     doc.fb_weighted_sum(self.session))
def test_model2modellist_similarity(self):
    """
    Similarity between a single model and a list of models of the same
    kind, checked against reference values.
    """
    # keyword against keyword list
    single_kw = Keyword.get("redis")
    kw_group = Keyword.get_many(["database", "mysql"])
    self.assertAlmostEqual(0.3754029265429976,
                           single_kw.similarity_to(kw_group))

    # document against document list
    single_doc = Document.get(6)
    doc_group = Document.get_many([1, 2])
    self.assertAlmostEqual(0.7382455893131392,
                           single_doc.similarity_to(doc_group))
def test_fb_from_doc(self):
    """
    Propagate a single document feedback (document 1, value 0.5) and
    check the feedback of the document and of keywords "a", "redis" and
    "database".
    """
    first_doc = Document.get(1)

    ppgt.fb_from_doc(first_doc, 0.5, self.session)
    upd.update(self.session)

    session = self.session
    self.assertAlmostEqual(.5 * .7, first_doc.fb(session))
    self.assertAlmostEqual(1/2., Keyword.get("a").fb(session))
    self.assertAlmostEqual(1/4., Keyword.get("redis").fb(session))
    self.assertAlmostEqual(1/4., Keyword.get("database").fb(session))
def setUp(self):
    """
    Prepare self.r (recommender), self.session (with seeded feedback) and
    self.fmim (matrices and index maps restricted to the filtered
    keywords/documents).
    """
    # filters used below to build the restricted fmim
    kw_filters = [self.my_kw_filter]
    doc_filters = [self.kw_count_filter, self.has_database_filter]

    # the default configuration
    recommender_options = dict(
        kw_filters=None,
        doc_filters=[self.kw_count_filter, self.has_database_filter])
    recommender_options.update(fmim.__dict__)
    self.r = LinRelRecommender(2, 2, 1.0, 0.1, 1.0, 0.1,
                               **recommender_options) \
        if not (set(fmim.__dict__) & set(["kw_filters", "doc_filters"])) \
        else LinRelRecommender(
            2, 2, 1.0, 0.1, 1.0, 0.1,
            kw_filters=None,
            doc_filters=[self.kw_count_filter, self.has_database_filter],
            **fmim.__dict__)

    self.session = get_session()

    # keyword feedback
    for kw_name, value in (("redis", .7), ("database", .6)):
        self.session.update_kw_feedback(Keyword.get(kw_name), value)

    # document feedback
    for doc_id in (1, 2, 8):
        self.session.update_doc_feedback(Document.get(doc_id), .7)

    filtered_kws = self.r._filter_objs(kw_filters, kws=Keyword.all_kws)
    filtered_docs = self.r._filter_objs(doc_filters, docs=Document.all_docs)

    (kw2doc_submat,
     kw_ind_map,
     kw_ind_map_r) = self.r._submatrix_and_indexing(
        filtered_kws, filtered_docs,
        fmim.kw2doc_m, fmim.kw_ind, fmim.doc_ind)

    (doc2kw_submat,
     doc_ind_map,
     doc_ind_map_r) = self.r._submatrix_and_indexing(
        filtered_docs, filtered_kws,
        fmim.doc2kw_m, fmim.doc_ind, fmim.kw_ind)

    self.fmim = FeatureMatrixAndIndexMapping(kw_ind_map, doc_ind_map,
                                             kw2doc_submat, doc2kw_submat,
                                             kw_ind_map_r, doc_ind_map_r)
def test_query_that_produces_match(self):
    """
    A query matching the corpus: document 6 is expected first, four
    documents in total, and the returned keywords cover every keyword of
    the returned documents.
    """
    docs, kws = self.r.recommend("python, redis")

    self.assertEqual(Document.get(6), docs[0])
    self.assertEqual(4, len(docs))

    # kws should be a superset of the keywords attached to the documents
    assoc_kws = set()
    for doc in docs:
        assoc_kws.update(doc.keywords)

    self.assertTrue(assoc_kws.issubset(set(kws)))
def test_all_together(self):
    """
    Propagation with all three types of feedback (document, in-document
    keyword, keyword) applied before a single update.
    """
    session = self.session
    first_doc = Document.get(1)
    redis_kw = Keyword.get("redis")

    recom_docs = [Document.get(_id) for _id in [1, 2, 3]]
    session.add_doc_recom_list(recom_docs)

    ppgt.fb_from_doc(first_doc, 0.5, session)
    ppgt.fb_from_dockw(redis_kw, first_doc, .5, session)
    ppgt.fb_from_kw(redis_kw, 0.5, session)

    upd.update(session)

    # keywords
    self.assertAlmostEqual(
        0.56689342264886755 * .5 / (0.56689342264886755 + 0.49704058656839417),
        Keyword.get("a").fb(session))
    self.assertAlmostEqual(1 / 4. * .3 + .5 * .7,
                           Keyword.get("redis").fb(session))
    self.assertAlmostEqual(1 / 4., Keyword.get("database").fb(session))

    # recommended documents
    self.assertAlmostEqual(0.183701573217 * .3 + .7 * .5,
                           recom_docs[0].fb(session))
    self.assertAlmostEqual(0.191506501383, recom_docs[1].fb(session))
    self.assertAlmostEqual(0, recom_docs[2].fb(session))
def test_fb_from_kw(self):
    """
    Propagate a single keyword feedback (redis, 0.5) with documents 1-3
    on the recommendation list, then check keyword and document feedback.
    """
    session = self.session
    redis_kw = Keyword.get("redis")

    recom_docs = [Document.get(_id) for _id in [1, 2, 3]]
    session.add_doc_recom_list(recom_docs)

    ppgt.fb_from_kw(redis_kw, 0.5, session)
    upd.update(session)

    self.assertAlmostEqual(.5 * .7, redis_kw.fb(session))

    first, second, third = recom_docs
    self.assertAlmostEqual(0.183701573217, first.fb(session))
    self.assertAlmostEqual(0.191506501383, second.fb(session))
    self.assertAlmostEqual(0, third.fb(session))
def test_document(self):
    """
    Check id, title and keywords of document 1, and that its keyword
    weights are held in a plain dict.
    """
    doc = Document.get(1)

    self.assertEqual(doc.id, 1)
    self.assertEqual(doc.title, "redis: key-value-storage database (ONE)")

    expected_kws = Keyword.get_many(["redis", "database", "a"])
    self.assertEqual(set(doc.keywords), set(expected_kws))

    # that is as far as we can test; the weights are not checked numerically
    weight_type = type(doc._kw_weight)
    self.assertTrue(weight_type is DictType)
def test_loop_done(self):
    """
    After loop_done, a document that had feedback from all three sources
    reports a weighted sum of 0.
    """
    session = self.session
    first_doc = Document.get(1)

    first_doc.rec_fb_from_dockw(Keyword.get("redis"), first_doc, 1, session)
    first_doc.rec_fb_from_kw(Keyword.get("database"), .5, session)
    first_doc.rec_fb_from_doc(first_doc, .5, session)

    # terminate the loop; the recorded feedback is expected to be cleaned
    first_doc.loop_done(session)

    remaining = first_doc.fb_weighted_sum(session)
    self.assertEqual(remaining, 0)
def test_all_together(self):
    """
    Weighted sum of a document that received feedback from all three
    sources: in-document keyword, keyword, and the document itself.
    """
    session = self.session
    doc = Document.get(1)

    doc.rec_fb_from_dockw(Keyword.get("redis"), doc, 1, session)
    doc.rec_fb_from_kw(Keyword.get("database"), .5, session)
    doc.rec_fb_from_doc(doc, .5, session)

    redis = Keyword.get("redis")
    db = Keyword.get("database")

    # reference keyword weights for document 1
    weights = {redis: 0.62981539329519109,
               db: 0.45460437826405437,
               Keyword.get("a"): 0.62981539329519109}

    expected = (.5 * .7 +
                .3 * (weights[redis] * 1 + weights[db] * .5) / sum(weights.values()))
    self.assertAlmostEqual(expected, doc.fb_weighted_sum(session))
def recommend_documents(self, fmim, session, top_n, mu, c, sampler = None):
    """
    Rank the documents indexed by `fmim` and return the first `top_n`.

    The current feedback of every indexed document is collected and
    passed to generic_rank together with the document-to-keyword matrix
    and index maps.  Each returned document has its "score" entry set to
    the score from generic_rank's first result and its "recommended"
    entry set to True.

    `sampler` is accepted but not used in this method.

    Return: a list of documents
    """
    candidates = Document.get_many(fmim.doc_ind.keys())

    # document id -> current feedback in this session
    fbs = {}
    for candidate in candidates:
        fbs[candidate.id] = candidate.fb(session)

    # only the first of the three results is used here
    ranked, _explt_scores, _explr_scores = self.generic_rank(
        fmim.doc2kw_m, fbs, fmim.doc_ind, fmim.doc_ind_r, mu, c)

    recommended = []
    for doc_id, score in ranked.items()[:top_n]:
        picked = Document.get(doc_id)
        picked["score"] = score
        picked['recommended'] = True
        recommended.append(picked)

    return recommended
def recommend_documents(self, query, top_n):
    """
    Score the documents against a comma separated query and return the
    best matches.

    Param:
    query: string, the query string, phrases separated by comma, for example:
           machine learning, natural language processing
    top_n: integer, the number of documents to be returned

    Return:
    DocumentList, the recommended documents (at most top_n, highest
                  score first, zero-score documents excluded); each one
                  has its "score" and "recommended" entries set
    KeywordList, the query keywords(that exist in the corpus)
    """
    query_keywords = [kw_str.strip() for kw_str in query.strip().split(",")]

    # prepare the query word binary column vector
    word_vec = self._word_vec(query_keywords)

    # only the query words known to the keyword index can be resolved
    existing_keywords = Keyword.get_many(
        [word for word in query_keywords if word in self.kw_ind])

    # get the scores for documents
    # (presumably one score per row of doc2kw_m -- verify matrix2array)
    scores = matrix2array((self.doc2kw_m * word_vec).T)

    # Attach the row index to every score BEFORE dropping the zero scores.
    # Enumerating an already filtered sequence would renumber the
    # survivors from 0, and self.doc_ind_r would then resolve them to the
    # wrong documents whenever a zero score precedes a non-zero one.
    non_zero_scores = [(ind, score)
                       for ind, score in enumerate(scores)
                       if score]

    sorted_scores = sorted(non_zero_scores,
                           key=lambda pair: pair[1],
                           reverse=True)

    # get the top_n documents
    docs = DocumentList([])
    for ind, score in sorted_scores[:top_n]:
        doc_id = self.doc_ind_r[ind]
        doc = Document.get(doc_id)
        doc['score'] = score
        doc["recommended"] = True
        docs.append(doc)

    return docs, existing_keywords
def test_all_together(self):
    """
    Document 1 receives feedback from an in-document keyword, from a
    keyword and from itself; the combined weighted sum is compared with
    the hand-computed value.
    """
    doc = Document.get(1)

    # the three feedback sources, in the order they are recorded
    doc.rec_fb_from_dockw(Keyword.get("redis"), doc, 1, self.session)
    doc.rec_fb_from_kw(Keyword.get("database"), .5, self.session)
    doc.rec_fb_from_doc(doc, .5, self.session)

    redis = Keyword.get("redis")
    db = Keyword.get("database")
    weights = {
        redis: 0.62981539329519109,
        db: 0.45460437826405437,
        Keyword.get("a"): 0.62981539329519109
    }

    total_weight = sum(weights.values())
    self.assertAlmostEqual(
        .5 * .7 + .3 * (weights[redis] * 1 + weights[db] * .5) / total_weight,
        doc.fb_weighted_sum(self.session))
def test_type_mismatch(self):
    """
    similarity_to between a keyword-side object and a document-side
    object must raise AssertionError, whichever side is the receiver.
    """
    kw = Keyword.get("redis")
    kwlist = Keyword.get_many(["database", "mysql"])

    doc = Document.get(6)
    doclist = Document.get_many([1, 2])

    # (receiver, argument) pairs that mix the two kinds of model
    mismatched_pairs = [
        (kw, doc),
        (doc, kw),
        (kw, doclist),
        (doclist, kw),
        (doc, kwlist),
        (kwlist, doc),
    ]

    for receiver, argument in mismatched_pairs:
        self.assertRaises(AssertionError, receiver.similarity_to, argument)
from util import config_doc_kw_model config_doc_kw_model() from scinet3.model import Document, Keyword Document.load_all_from_db() def doc_compute(d): return d.get(Keyword.get("redis"), 0) * .5 / sum(d.values()) doc = Document.get(1) print doc._kw_weight {"a": 0.62981539329519109, "redis": 0.62981539329519109, "database": 0.45460437826405437} print doc_compute(doc._kw_weight) doc = Document.get(2) print doc._kw_weight {"the": 0.58478244910295341, "redis": 0.6577450118852588, "database": 0.47476413782131072} print doc_compute(doc._kw_weight) doc = Document.get(3) print doc._kw_weight {"tornado": 0.57555052264377027, "web": 0.50353869621889158, "a": 0.50353869621889158, "python": 0.40204372735417787} print doc_compute(doc._kw_weight) def kw_compute(d):
def doc_feedbacks(self):
    """
    document feedback

    Reads the hash stored under this session's "doc_feedbacks" key and
    returns it as a dict mapping Document objects to float feedback
    values.
    """
    redis_key = "session:%s:%s" % (self.session_id, "doc_feedbacks")
    stored = self.redis.hgetall(redis_key)

    feedbacks = {}
    for raw_id, raw_fb in stored.items():
        # stored ids are converted to int, stored values to float
        doc = Document.get(int(raw_id))
        feedbacks[doc] = float(raw_fb)
    return feedbacks
def affected_docs(self):
    """
    Return the Document objects for the ids stored under
    "affected_docs"; an empty list when nothing is stored.
    """
    doc_ids = self.get("affected_docs", set())

    docs = []
    for doc_id in doc_ids:
        docs.append(Document.get(doc_id))
    return docs