Ejemplo n.º 1
0
Archivo: rss.py Proyecto: stes/nbot
def test_db():
    """Merge the vocabulary of every ``.mk4`` database in FEED_DIR and print it.

    Each matching file is opened with metakit and read via read_database.
    When it yields any feeds, they are turned into documents, collected in a
    Library, and that library's vocabulary list (after ``clean(5)``) is
    merged into one global VocabList, which is printed at the end.
    """
    merged = VocabList()
    log('searching directory: %s' % FEED_DIR)
    for entry in os.listdir(FEED_DIR):
        # Only metakit database files are of interest.
        if not entry.endswith('.mk4'):
            continue
        log('found database: %s' % entry)
        # open database
        storage = metakit.storage(os.path.join(FEED_DIR, entry), 0)
        feeds = read_database(storage)
        if len(feeds) > 0:
            # feed content in database
            log('create library')
            library = Library()
            for feed in feeds:
                library.add_document(read_data(feed))
            vocab = library.gen_vocablist()
            vocab.clean(5)
            merged.merge(vocab)
        # Drop the reference; the original relies on this to close the database.
        storage = None
    print(merged)
Ejemplo n.º 2
0
Archivo: tools.py Proyecto: stes/nbot
def print80(text):
    """Print ``text`` broken into lines of at most 80 characters.

    The text is cut purely by character count; word boundaries and any
    newlines already present in ``text`` are not taken into account.

    Differences from the previous implementation:

    * lines are 80 characters long (the old ``len(tmp) > 80`` check only
      flushed after the 81st character);
    * no extra empty line is printed when the length of ``text`` is an exact
      multiple of the line width.

    An empty ``text`` still prints a single empty line.

    :param text: the string to print.
    """
    width = 80
    if not text:
        print('')
        return
    # Slicing replaces the per-character string concatenation of the old loop.
    for start in range(0, len(text), width):
        print(text[start:start + width])

def printlist(list):
    """Print every element of ``list`` on a line of its own, in iteration order.

    The parameter keeps its original name (even though it shadows the
    builtin) so that keyword callers continue to work.
    """
    elements = iter(list)
    for element in elements:
        print(element)

if __name__ == '__main__':
    # Build the 'res/dislike' library: read the links from a local seed page,
    # fetch every linked page and store each one as a document.
    from nbot.document import Document, Library

    pending = ['file:///home/stes/dislike.html']
    library = Library()

    # The seed page only supplies hyperlinks; it is not added to the library.
    seed_page = fetch_content(pending.pop(0))
    pending.extend(get_hyperlinks(seed_page))

    # Work through the queue front to back until it is empty.
    while pending:
        print('currently %d elements in the queue' % len(pending))
        target = pending.pop(0)
        print('getting %s' % target)
        library.add_document(Document(fetch_content(target)))

    library.save('res/dislike')
Ejemplo n.º 3
0
Archivo: main.py Proyecto: stes/nbot
def train_recsys():
    """Train a RecommenderSystem on the stored like/dislike libraries.

    Five randomly picked documents are taken out of each library via
    ``rmv_document`` before training and are rated afterwards as
    cross-validation data. The remaining like documents are registered with
    rating 1.0 and the remaining dislike documents with rating 0.0. All
    resulting ratings are only printed; nothing is checked automatically.

    :return: the trained RecommenderSystem instance.
    """
    from nbot.document import Document, Library, VocabList, load_document

    def hold_out(library):
        # Take five randomly picked documents out of the library.
        doc_keys = library.get_keys()
        shuffle(doc_keys)
        return [library.rmv_document(k) for k in doc_keys[:5]]

    def label_all(system, library, rating):
        # Register every document of the library with the given rating.
        for k in library.get_keys():
            system.set_rate(library.get_document(k).content(), rating)

    def show_sample(system, library):
        # Print the rating of five randomly picked documents of the library.
        doc_keys = library.get_keys()
        shuffle(doc_keys)
        for k in doc_keys[:5]:
            print(system.rate(library.get_document(k).content()))

    def show_ratings(system, documents):
        # Print the rating of each of the given documents.
        for document in documents:
            print(system.rate(document.content()))

    separator = '---------------------------------------'

    samples = [
        load_document('res/sample/blubb.html'),
        load_document('res/sample/page.html'),
        load_document('res/sample/dislikepage.html'),
    ]

    lib_like = Library()
    lib_like.load('res/like', False)
    lib_dislike = Library()
    lib_dislike.load('res/dislike', False)

    # Hold-out data is taken before the vocabulary is generated.
    like_cv = hold_out(lib_like)
    dislike_cv = hold_out(lib_dislike)

    vlist_like = lib_like.gen_vocablist()
    vlist_dislike = lib_dislike.gen_vocablist()
    vlist_like.clean(10)
    vlist_dislike.clean(10)

    # The combined mask is the like mask followed by the dislike mask.
    like_mask = vlist_like.gen_mask()
    dislike_mask = vlist_dislike.gen_mask()
    mask = list(like_mask) + list(dislike_mask)

    rsys = RecommenderSystem(mask, len(mask))
    label_all(rsys, lib_like, 1.)
    label_all(rsys, lib_dislike, 0.)

    rsys.train(10000000, 0.1)

    # Ratings for a random sample of the training documents.
    show_sample(rsys, lib_like)
    show_sample(rsys, lib_dislike)

    print(separator)
    show_ratings(rsys, samples)
    print(separator)
    print('CV data')
    print('(1) LIKE')
    show_ratings(rsys, like_cv)

    print('(2) DISLIKE')
    show_ratings(rsys, dislike_cv)

    # This seems to work, however, more training/cv data will be necessary!

    print(separator)
    return rsys
Ejemplo n.º 4
0
Archivo: recsys.py Proyecto: stes/nbot
        [X, Y] = self.__gen_matrix()
        self.__lreg.train(iterations, learnrate, X, Y)
    
    def __gen_matrix(self):
        """Return ``[X, Y]``: the training set and the ratings, each wrapped by ``array``."""
        features = array(self.__training_set)
        targets = array(self.__ratings)
        return [features, targets]
    
if __name__ == '__main__':
    # some tests
    # Manual test script: loads sample documents and the like/dislike
    # libraries, then sets aside five random documents from each library.
    
    # NOTE(review): load_document and shuffle are used below but are not among
    # the names imported here; presumably they are imported at module level -
    # confirm.
    from nbot.document import Document, Library, VocabList
    
    # Sample pages loaded from the res/sample directory.
    doc0 = load_document('res/sample/blubb.html')
    doc1 = load_document('res/sample/page.html')
    doc2 = load_document('res/sample/dislikepage.html')
    
    # Load the stored like and dislike libraries.
    lib_like = Library()
    lib_like.load('res/like', False)
    lib_dislike = Library()
    lib_dislike.load('res/dislike', False)
    
    # Take five randomly picked documents out of the like library
    # (the value returned by rmv_document is collected).
    like_cv = []
    keys = lib_like.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        like_cv.append(lib_like.rmv_document(key))
    
    # Same procedure for the dislike library.
    dislike_cv = []
    keys = lib_dislike.get_keys()
    shuffle(keys)
    for key in keys[:5]:
        dislike_cv.append(lib_dislike.rmv_document(key))