import os

import metakit  # Metakit bindings; provide metakit.storage()


# VocabList, Library, log, FEED_DIR, read_database and read_data are
# defined elsewhere in this module.
def test_db():
    gl_vlist = VocabList()
    log('searching directory: %s' % FEED_DIR)
    for fname in os.listdir(FEED_DIR):
        if fname.endswith('.mk4'):
            log('found database: %s' % fname)
            # open the Metakit database read-only
            db = metakit.storage(os.path.join(FEED_DIR, fname), 0)
            data = read_database(db)
            if len(data) > 0:
                # there is feed content in this database
                log('create library')
                lib = Library()
                for feed in data:
                    lib.add_document(read_data(feed))
                vlist = lib.gen_vocablist()
                vlist.clean(5)
                gl_vlist.merge(vlist)
            db = None  # drop the reference so the database is closed
    print(gl_vlist)
def print80(text):
    # print text wrapped to at most 80 characters per line
    tmp = ''
    for c in text:
        tmp += c
        if len(tmp) >= 80:
            print(tmp)
            tmp = ''
    print(tmp)


def printlist(items):
    for item in items:
        print(item)


if __name__ == '__main__':
    from nbot.document import Document, Library

    # crawl every page linked from the seed page and store it as "disliked"
    q = ['file:///home/stes/dislike.html']
    lib = Library()
    url = q.pop(0)
    page = fetch_content(url)
    hrefs = get_hyperlinks(page)
    q.extend(hrefs)
    while q:
        print('currently %d elements in the queue' % len(q))
        url = q.pop(0)
        print('getting %s' % url)
        page = fetch_content(url)
        doc = Document(page)
        lib.add_document(doc)
    lib.save('res/dislike')
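# fetch_content() and get_hyperlinks() are defined elsewhere in nbot. As a
# rough sketch only (an assumption about their behavior, not nbot's actual
# implementation), a link extractor could look like this with Python 3's
# stdlib html.parser:
from html.parser import HTMLParser


class _HrefCollector(HTMLParser):
    """Collects the href attribute of every anchor tag it sees."""

    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.append(value)


def get_hyperlinks_sketch(page):
    collector = _HrefCollector()
    collector.feed(page)
    return collector.hrefs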
def train_recsys():
    from random import shuffle

    from nbot.document import Document, Library, VocabList, load_document

    doc0 = load_document('res/sample/blubb.html')
    doc1 = load_document('res/sample/page.html')
    doc2 = load_document('res/sample/dislikepage.html')

    lib_like = Library()
    lib_like.load('res/like', False)
    lib_dislike = Library()
    lib_dislike.load('res/dislike', False)

    # hold out five random documents from each library for cross-validation
    like_cv = []
    keys = lib_like.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        like_cv.append(lib_like.rmv_document(key))
    dislike_cv = []
    keys = lib_dislike.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        dislike_cv.append(lib_dislike.rmv_document(key))

    # build the vocabulary mask from the remaining training documents
    vlist_like = lib_like.gen_vocablist()
    vlist_dislike = lib_dislike.gen_vocablist()
    vlist_like.clean(10)
    vlist_dislike.clean(10)
    like_mask = vlist_like.gen_mask()
    dislike_mask = vlist_dislike.gen_mask()
    mask = []
    mask.extend(like_mask)
    mask.extend(dislike_mask)

    # rate liked documents 1.0 and disliked documents 0.0, then train
    rsys = RecommenderSystem(mask, len(mask))
    for key in lib_like.get_keys():
        doc = lib_like.get_document(key)
        rsys.set_rate(doc.content(), 1.)
    for key in lib_dislike.get_keys():
        doc = lib_dislike.get_document(key)
        rsys.set_rate(doc.content(), 0.)
    rsys.train(10000000, 0.1)  # iterations, learning rate

    # spot-check five random training documents from each class
    likes = lib_like.get_keys()
    shuffle(likes)
    for key in likes[:5]:
        doc = lib_like.get_document(key)
        print(rsys.rate(doc.content()))
    dislikes = lib_dislike.get_keys()
    shuffle(dislikes)
    for key in dislikes[:5]:
        doc = lib_dislike.get_document(key)
        print(rsys.rate(doc.content()))
    print('---------------------------------------')
    print(rsys.rate(doc0.content()))
    print(rsys.rate(doc1.content()))
    print(rsys.rate(doc2.content()))
    print('---------------------------------------')
    print('CV data')
    print('(1) LIKE')
    for doc in like_cv:
        print(rsys.rate(doc.content()))
    print('(2) DISLIKE')
    for doc in dislike_cv:
        print(rsys.rate(doc.content()))
    # This seems to work; however, more training/CV data will be necessary!
    print('---------------------------------------')
    return rsys
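# Hedged sketch, not part of nbot: a helper that condenses the CV printouts
# above into one accuracy number. It assumes rsys.rate() returns a score in
# [0, 1] and uses 0.5 as a hypothetical decision threshold.
def cv_accuracy(rsys, like_cv, dislike_cv, threshold=0.5):
    hits = 0
    for doc in like_cv:
        # a liked document counts as a hit when its score clears the threshold
        if rsys.rate(doc.content()) >= threshold:
            hits += 1
    for doc in dislike_cv:
        # a disliked document counts as a hit when its score stays below it
        if rsys.rate(doc.content()) < threshold:
            hits += 1
    return float(hits) / (len(like_cv) + len(dislike_cv))
# Usage (inside train_recsys, before the return):
#   print(cv_accuracy(rsys, like_cv, dislike_cv))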
    def train(self, iterations, learnrate):
        # (signature assumed; reconstructed from the rsys.train(10000000, 0.1)
        # call in train_recsys above)
        X, Y = self.__gen_matrix()
        self.__lreg.train(iterations, learnrate, X, Y)

    def __gen_matrix(self):
        # pack the training vectors and their ratings into two arrays;
        # array() is assumed to be numpy.array, imported at the top of the file
        return [array(self.__training_set), array(self.__ratings)]


if __name__ == '__main__':
    # some tests
    from random import shuffle

    from nbot.document import Document, Library, VocabList, load_document

    doc0 = load_document('res/sample/blubb.html')
    doc1 = load_document('res/sample/page.html')
    doc2 = load_document('res/sample/dislikepage.html')

    lib_like = Library()
    lib_like.load('res/like', False)
    lib_dislike = Library()
    lib_dislike.load('res/dislike', False)

    # hold out five random documents from each library for cross-validation
    like_cv = []
    keys = lib_like.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        like_cv.append(lib_like.rmv_document(key))
    dislike_cv = []
    keys = lib_dislike.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        dislike_cv.append(lib_dislike.rmv_document(key))
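    # Illustration only (made-up numbers, not nbot data): the [X, Y] pair from
    # __gen_matrix() must keep one rating per feature row, since the logistic
    # regression pairs row i of X with Y[i].
    from numpy import array
    demo_X = array([[1, 0, 2], [0, 1, 0]])  # word counts over a 3-word mask
    demo_Y = array([1., 0.])                # one rating per document
    assert demo_X.shape[0] == demo_Y.shape[0]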