def html_stories_info():
    """Build the HTML fragment that summarises the stories in the database.

    Three ``sql.request`` queries feed the three sections of the fragment:

    * fetched stories with a non-zero symbol count, highest symbol count
      first, at most 30 rows;
    * stories whose ``fetch_date`` is null;
    * fetched stories whose symbol count is zero.

    Every section is headed by the number of rows returned and lists one
    entry per row, joined with ``<br/>``.  Story URLs go through
    ``saxutils.escape`` before being placed in the markup.

    Returns:
        The ``<div class="stories_info">`` fragment as a string.
    """
    top_rows = sql.request(
        "select url_md5,url,hit_count,symbol_count from story where not isnull(fetch_date) and not symbol_count=0 order by symbol_count desc limit 30"
    )
    unfetched_rows = sql.request("select url_md5,url from story where isnull(fetch_date) order by id desc")
    symbolless_rows = sql.request(
        "select url from story where not isnull(fetch_date) and symbol_count=0 order by id desc"
    )

    # Linked entry with hit and symbol counters.
    top_lines = []
    for row in top_rows:
        top_lines.append(
            """<a href="/story/%s">%s</a> (%d hits, %d symbols)"""
            % (row[0], saxutils.escape(row[1]), row[2], row[3])
        )

    # Linked entry without counters: nothing has been fetched yet.
    unfetched_lines = []
    for row in unfetched_rows:
        unfetched_lines.append("""<a href="/story/%s">%s</a>""" % (row[0], saxutils.escape(row[1])))

    # Plain text entry: these stories get no detail link.
    symbolless_lines = []
    for row in symbolless_rows:
        symbolless_lines.append("""%s""" % saxutils.escape(row[0]))

    sections = (
        len(top_rows),
        "<br/>".join(top_lines),
        len(unfetched_rows),
        "<br/>".join(unfetched_lines),
        len(symbolless_rows),
        "<br/>".join(symbolless_lines),
    )
    return (
        """<div class="stories_info"><h1>%d last fetched stories:</h1><p>%s</p><h1>%d never fetched stories:</h1><p>%s</p><h1>%d no symbol stories:</h1><p>%s</p></div>"""
        % sections
    )
def html_story_info(story_md5):
    """Build the HTML detail fragment for one story.

    Args:
        story_md5: value matched against ``story.url_md5``.

    Returns:
        A ``<div class="story_info">`` fragment with the story link, its
        counters, fetch date and symbols, followed by the feeds linked to
        the story through ``feed_story``.  When no story matches, a fixed
        error fragment is returned instead.
    """
    # The link text is truncated *before* escaping (see below), so the
    # template uses a plain %s where it used to truncate with %.100s.
    template = """<div class="story_info"><h1><a href="%s">%s</a></h1><p>fetched %d times, contains %d symbols, last fetching %s</p><p>symbols: %s</p><h1>found in feeds:</h1><p>%s</p></div>"""
    error_template = """<div class="story_info"><h1>can't found story info!!!</h1></div>"""
    feed_template = """<a href="/feed/%s">%s</a> (%d hits)"""

    rows = sql.request(
        "select id,url,url,hit_count,symbol_count,fetch_date,symbols from story where url_md5=%s limit 30", story_md5
    )
    if not rows:
        return error_template

    story_id = rows[0][0]
    url, url_label, hit_count, symbol_count, fetch_date, symbols = rows[0][1:]
    feeds = sql.request(
        "select feed.url_md5,feed.url,feed.hit_count from feed,feed_story where feed.id=feed_story.feed_id and feed_story.story_id=%s",
        int(story_id),
    )

    # Feed URLs are escaped like every other URL placed in the markup.
    feed_links = [
        feed_template % (feed_md5, saxutils.escape(feed_url), feed_hits)
        for feed_md5, feed_url, feed_hits in feeds
    ]
    return template % (
        # saxutils.escape leaves '"' alone by default; inside href="..." it
        # has to be encoded as well or the URL can end the attribute.
        saxutils.escape(url, {'"': "&quot;"}),
        # Truncating the escaped text could cut an entity such as "&amp;"
        # in half, so the raw URL is shortened first.
        saxutils.escape(url_label[:100]),
        hit_count,
        symbol_count,
        fetch_date,
        # symbols may be NULL in the database; escape() needs a string.
        saxutils.escape(symbols or ""),
        "<br/>".join(feed_links),
    )
def fetch():
    """Refresh the due feeds and store the stories they contain.

    Nothing is fetched when 250 or more stories still have a null
    ``rated_date``.  Otherwise every feed selected by the query below is
    read through ``get_stories``; each story is inserted (or has its title
    updated), linked to the feed in ``feed_story``, and the feed's
    ``fetch_date`` is set to now.  A one-line INFO message is printed in
    both cases.
    """
    unrated_total = sql.request("select count(id) from story where rated_date is null")[0][0]
    if unrated_total >= 250:
        # Enough unrated stories are waiting already.
        print("INFO: no more urls needed")
        return

    store_story = "insert into story (url,url_md5,hit_count,symbols,symbol_count,fetch_date,title) values (%s,md5(%s),0,%s,%s,now(),%s)\
                  on duplicate key update title=%s"
    # The no-op "update story_id=story_id" makes an existing link a silent
    # success instead of a duplicate-key error.
    link_story = "insert into feed_story (story_id,feed_id)\
                  select story.id,feed.id\
                  from story,feed\
                  where story.url_md5=md5(%s)\
                  and feed.url_md5=md5(%s)\
                  on duplicate key update story_id=story_id"
    mark_fetched = "update feed set fetch_date=now() where url_md5=md5(%s)"

    feeds = sql.request("select url,added_by from feed where isnull(fetch_date) and not url= '' or addtime(fetch_date,'01:00:00') < now()")
    for feed, added_by in feeds:
        for url, title, symbols in get_stories(feed, added_by):
            symbol_total = len(symbols.split())
            sql.query(store_story, (url, url, symbols, symbol_total, title, title))
            sql.query(link_story, (url, feed))
        sql.query(mark_fetched, feed)
    print("INFO: updated %d feeds (%s)" % (len(feeds), time.ctime()))
def html_feeds_info():
    """Build the HTML fragment listing fetched and never-fetched feeds.

    Fetched feeds (non-null ``fetch_date``) are listed by ascending hit
    count with their hit counter; feeds with a null ``fetch_date`` are
    listed newest id first.  Each section is headed by its row count.

    Returns:
        The ``<div class="feeds_info">`` fragment as a string.
    """
    # Function-scope import so the block does not depend on a module-level
    # import being present.
    from xml.sax import saxutils

    template="""<div class="feeds_info"><h1>%d fetched feeds:</h1><p>%s</p><h1>%d never fetched feeds:</h1><p>%s</p></div>"""
    fetched_feed_template="""<a href="/feed/%s">%s</a> (%d hits)"""
    never_fetched_feed_template="""<a href="/feed/%s">%s</a>"""

    fetched_feed=sql.request("select url_md5,url,hit_count from feed where not isnull(fetch_date) order by hit_count asc")
    never_fetched_feed=sql.request("select url_md5,url from feed where isnull(fetch_date) order by id desc")

    # Feed URLs are submitted by users, so they are escaped before being
    # written into the page, as the story listings already do.
    fetched_links=[fetched_feed_template % (feed[0],saxutils.escape(feed[1]),feed[2]) for feed in fetched_feed]
    never_fetched_links=[never_fetched_feed_template % (feed[0],saxutils.escape(feed[1])) for feed in never_fetched_feed]
    return template % (len(fetched_feed),"<br/>".join(fetched_links),
                       len(never_fetched_feed),"<br/>".join(never_fetched_links))
def html_liked_symbols(param,session):
    """Build the "liked symbols" form and the list of learned symbols.

    When ``param`` contains ``update_liked_symbols`` the user's stored
    ``liked_symbols`` is first overwritten with ``param["liked_symbols"]``.

    Args:
        param: request parameters; read with ``in`` and ``[]``.
        session: mapping providing the current user's ``"login"``.

    Returns:
        The ``<div class="liked_symbols">`` fragment: an edit form
        pre-filled with the stored symbols, then up to 10 symbols from
        ``bayes_data`` with the highest ``good_count-bad_count``.
    """
    if "update_liked_symbols" in param:
        sql.request("update kolmognus_user set liked_symbols=%s where login=%s",(param["liked_symbols"],session["login"]))
    template="""<div class="liked_symbols"><h1>symbols you said you like:</h1><form action=""><p><input class="button_input" type="submit" value="submit" name="update_liked_symbols"/><input class="text_input" size="60" maxlength="200" type="text" name="liked_symbols" value="%s"/></p></form><h1>symbols kolmognus thinks you like:</h1><p>%s</p></div>"""
    # A NULL column comes back as None; escape() needs a string.
    user_liked_symbols=sql.request("select liked_symbols from kolmognus_user where login=%s",session["login"])[0][0] or ""
    kolmognus_liked_symbol_template="""<span class="symbol" style="font-size: %dpt">%s(%d)</span>"""
    # "\\_" is the same runtime text as the former "\_", written without
    # relying on an unrecognised escape sequence.
    kolmognus_liked_symbols=sql.request("select symbol, good_count-bad_count\
        from bayes_data, kolmognus_user where length(bayes_data.symbol) > 3 and bayes_data.symbol not like 'special\\_%%' and bayes_data.user_id=kolmognus_user.id and kolmognus_user.login=%s order by good_count-bad_count desc limit 10",session["login"])
    # The text is written inside value="...": saxutils.escape leaves '"'
    # alone by default, so a typed quote would end the attribute early.
    liked_value=saxutils.escape(user_liked_symbols,{'"': "&quot;"})
    learned_spans=[kolmognus_liked_symbol_template % (compute_size(symbol[1]),saxutils.escape(symbol[0]),symbol[1]) for symbol in kolmognus_liked_symbols]
    return template % (liked_value," ".join(learned_spans))
def html_feed_submitter(param,session):
    """Handle a feed submission, then return the submission form.

    When ``param`` contains ``update_submit_feed`` and a non-empty
    ``submit_feed``, the URL is opened once to check that it can be
    reached and is then inserted into ``feed`` with the submitting login.
    A URL that cannot be opened is ignored without any message.

    Args:
        param: request parameters; read with ``in`` and ``[]``.
        session: mapping providing the current user's ``"login"``.

    Returns:
        The ``<div class="feed_submitter">`` form fragment, always the same.
    """
    if "update_submit_feed" in param and param["submit_feed"]!="":
        import urllib2
        feed_url=param["submit_feed"]
        try:
            # NOTE(review): this opens an arbitrary user-supplied URL from
            # the server (any scheme urllib2 accepts, internal hosts too).
            # Consider restricting scheme/host before opening.
            response=urllib2.urlopen(feed_url)
            # Only reachability matters; release the connection right away.
            response.close()
            # "on duplicate url=url" is not valid MySQL; the clause is
            # "on duplicate key update", here with a no-op assignment so a
            # resubmitted feed is silently accepted.
            sql.request("insert into feed (url,url_md5,hit_count,added_by) values (%s,md5(%s),0,%s) on duplicate key update url=url",(feed_url,feed_url,session["login"]))
        except (urllib2.URLError,ValueError):
            # Deliberate best effort: unreachable or malformed URLs are dropped.
            pass
    template="""<div class="feed_submitter"><h1>submit a feed:</h1><form method="post" action=""><p><input class="button_input" type="submit" value="submit" name="update_submit_feed"/><input class="text_input" size="60" maxlength="500" type="text" name="submit_feed" value=""/></p></form></div>"""
    return template
 def __init__(self,user_id,alluser_info=None):
     """Load one user's symbol counts and precompute a ratio per symbol.

     Args:
         user_id: id used to select rows from ``bayes_data`` and the
             ``liked_symbols`` column of ``kolmognus_user``.
         alluser_info: mapping stored as ``self.alluser_info``; defaults
             to a new empty dict.

     For every symbol the stored ratio is
     ``(s * prior + good) / (s + bad + good)`` with ``s`` taken from
     ``BayesianClassifier.s`` and ``prior`` from ``self.get_prior_value``.
     Symbols listed in the user's ``liked_symbols`` get 5 added to their
     good count first.
     """
     import sql
     # A mutable default ({}) would be one dict shared by every instance
     # created without this argument, so a fresh dict is built per call.
     if alluser_info is None:
         alluser_info={}
     self.alluser_info=alluser_info
     bayes_data = sql.request("select symbol,good_count,bad_count\
         from bayes_data where user_id=%s" , user_id)
     self.symb_ratio={}
     liked_symbols=set(sql.request("select liked_symbols from kolmognus_user where id=%s",user_id)[0][0].split())
     strength=BayesianClassifier.s
     for symbol,good,bad in bayes_data: #compute f(w) for each word
         if symbol in liked_symbols:
             good+=5 #empirical bonus added to symbols the user manually tagged as good
         self.symb_ratio[symbol]=(strength*self.get_prior_value(symbol) + good) / (strength + bad + good)
def html_rated_stories(session):
    """Build the HTML fragment listing the user's most recently rated stories.

    Selects up to 20 rows of ``recommended_story`` for ``session['login']``
    whose ``user_rating`` is not ``'?'``, newest ``userrating_date`` first,
    and renders each as a link to the story followed by its rating.

    Args:
        session: mapping providing the current user's ``'login'``.

    Returns:
        The ``<div class="rated_stories">`` fragment as a string.
    """
    rows=sql.request("select story.url_md5, story.url, recommended_story.user_rating from story, recommended_story, kolmognus_user\
        where recommended_story.user_id=kolmognus_user.id and recommended_story.story_id=story.id\
        and kolmognus_user.login=%s and not recommended_story.user_rating='?' order by recommended_story.userrating_date desc limit 20",session['login'])

    entries=[]
    for row in rows:
        link="""<a href="/story/%s">%s</a> <span class="rating">%s</span>""" % (row[0],saxutils.escape(row[1]),row[2])
        entries.append(link)

    listing="<br/>".join(entries)
    return """<div class="rated_stories"><h1>rated stories:</h1><p>%s</p></div>""" % listing
def html_feed_info(feed_md5):
    """Build the HTML detail fragment for one feed.

    Args:
        feed_md5: value matched against ``feed.url_md5``.

    Returns:
        A ``<div class="feed_info">`` fragment with the feed link, its hit
        counter and fetch date, the stories linked to the feed that have
        symbols (with the first 50 characters of those symbols) and, as
        plain text, the linked stories whose symbols are null.  When no
        feed matches, a fixed error fragment is returned instead.
    """
    # Truncation now happens before escaping (see below), so the templates
    # use plain %s where they used to truncate with %.50s.
    template="""<div class="feed_info"><h1><a href="%s">%s</a></h1><p>fetched %d times, last fetching %s</p><h1>stories:</h1><p>%s</p><h1>empty stories:</h1><p>%s</p></div>"""
    error_template="""<div class="feed_info"><h1>can't found feed info!!!</h1></div>"""
    story_template="""<a href="/story/%s">%s</a> (%d hits) [%s]"""
    empty_story_template="""%s"""

    feed=sql.request("select id,url,url,hit_count,fetch_date from feed where url_md5=%s" , feed_md5)
    if not feed:
        return error_template

    feed_id=feed[0][0]
    feed_url,feed_label,hit_count,fetch_date=feed[0][1:]
    stories=sql.request("select story.url_md5,story.url,story.hit_count,story.symbols\
            from story,feed_story where story.id=feed_story.story_id and feed_story.feed_id=%s and not isnull(story.symbols)" , feed_id)
    empty_stories=sql.request("select story.url\
            from story,feed_story where story.id=feed_story.story_id and feed_story.feed_id=%s and isnull(story.symbols)" , feed_id)

    # Shorten the raw text first: cutting the escaped text could split an
    # entity such as "&amp;" in the middle.
    story_links=[story_template % (story[0],saxutils.escape(story[1]),story[2],saxutils.escape(story[3][:50])) for story in stories]
    empty_lines=[empty_story_template % saxutils.escape(story[0]) for story in empty_stories]
    return template % (
        # The feed URL is user-submitted: escape it, including '"', which
        # saxutils.escape leaves alone by default, since it sits in href="...".
        saxutils.escape(feed_url,{'"': "&quot;"}),
        saxutils.escape(feed_label[:50]),
        hit_count,
        fetch_date,
        "<br/>".join(story_links),
        "<br/>".join(empty_lines),
    )
def get_valid_cached_symbols_for_story(url):
    """Return the stored symbols of a story fetched within the last 30 minutes.

    Args:
        url: story URL; the lookup is done on its md5.

    Returns:
        The ``symbols`` column of the first matching row, or ``u""`` when
        the query returns no row.
    """
    cached=sql.request("select symbols from story where\
                        url_md5=md5(%s)\
                        and addtime(fetch_date,'00:30:00') >= now()" , url)
    return cached[0][0] if cached else u""
def get_alluser_info():
    """Compute a smoothed good-ratio per symbol over all users.

    For each symbol of ``bayes_data`` the good and bad counts are summed
    and turned into ``good / (good + bad)``, then pulled towards 0.5:
    ``ratio * (1 - uncertainty) + uncertainty * 0.5`` with
    ``uncertainty = 0.3``.

    Returns:
        dict mapping symbol to its smoothed ratio.
    """
    import sql
    alluser_info={}
    uncertainty=0.3
    for symbol,good,bad in sql.request("select symbol,sum(good_count),sum(bad_count) from bayes_data group by symbol"):
        total=float(bad + good)
        if total:
            ratio=float(good) / total
        else:
            # No rating recorded for this symbol: dividing would raise
            # ZeroDivisionError, so use the neutral ratio instead.
            ratio=0.5
        alluser_info[symbol]=ratio*(1-uncertainty)+uncertainty*0.5
    return alluser_info
def html_recommended_stories(session):
    """Build the HTML fragment with the user's top unrated recommendations.

    Selects up to 10 stories for ``session['login']`` whose ``user_rating``
    is ``'?'`` and whose ``fetch_date`` is less than 24 hours old, highest
    ``computed_rating`` first.  Each story is rendered as a small form with
    good/bad image buttons, a link labelled with the title (or the URL when
    the title is empty), the rating coloured by ``compute_rating_color``,
    and a link to the story details.

    Args:
        session: mapping providing the current user's ``'login'``.

    Returns:
        The ``<div class="recommended_stories">`` fragment as a string.
    """
    template="""<div class="recommended_stories"><h1>recommended stories:</h1>%s</div>"""
    recommended_story_template="""<form method="post" action=""><p><input type="image" class="good" value="good" name="rating" src="/image/good.png" alt="good"/><input type="image" class="bad" value="bad" name="rating" src="/image/bad.png" alt="bad"/> <a href="%s">%s</a> <span style="color: %s;">%.2f</span> <a class="details" href="/story/%s">show details</a><input type="hidden" name="story_id" value="%d"/></p></form>"""
    recommended_stories=sql.request("select story.url_md5, story.url, recommended_story.computed_rating, story.id,if(story.title='',story.url,story.title) from story, recommended_story, kolmognus_user\
        where recommended_story.user_id=kolmognus_user.id and recommended_story.story_id=story.id\
        and kolmognus_user.login=%s and recommended_story.user_rating='?'\
        and addtime(story.fetch_date,'24:00:00') > now()\
        order by recommended_story.computed_rating desc\
        limit 10",session['login'])

    forms=[]
    for url_md5,url,rating,story_id,label in recommended_stories:
        forms.append(recommended_story_template % (
            # The URL sits inside href="...": saxutils.escape leaves '"'
            # alone by default, so it is encoded explicitly here.
            saxutils.escape(url,{'"': "&quot;"}),
            saxutils.escape(label),
            compute_rating_color(rating),
            rating,
            url_md5,
            story_id,
        ))
    return template % "".join(forms)
def get_valid_cached_stories_for_feed(feed):
    """Return the story URLs of a feed fetched within the last 30 minutes.

    Args:
        feed: feed URL; the lookup is done on its md5.

    Returns:
        A list with the URL of every story linked to the feed, or an empty
        list when the query returns no row.
    """
    matches=sql.request("select story.url from feed_story,feed,story\
                      where story.id=feed_story.story_id\
                        and feed_story.feed_id=feed.id\
                        and feed.url_md5=md5(%s)\
                        and addtime(feed.fetch_date,'00:30:00') >= now()" , feed)
    if not matches:
        return []
    urls=[]
    for match in matches:
        urls.append(match[0])
    return urls
Beispiel #14
0
#!/usr/bin/env python

import sql
import classifier

if __name__ == '__main__':
    # Rate every recent story for every user, then stamp the stories as rated.
    import time

    rerate_delay="0 00:00:01"
    # Stories fetched less than 12 hours ago that were never rated, or were
    # rated longer ago than rerate_delay.
    stories=sql.request("select id,url_md5,url,symbols from story where addtime(fetch_date,'12:00:00') > now() and (not isnull(fetch_date)) and (isnull(rated_date) or addtime(rated_date,%s) < now())" , rerate_delay)
    users=sql.request("select id,login from kolmognus_user")
    alluser_info=classifier.get_alluser_info()

    store_rating="insert into recommended_story (user_id,story_id,computed_rating)\
                values(%s,%s,%s)\
                on duplicate key update computed_rating=%s"

    classifiers={}
    for user_id,_login in users:
        # One classifier per user id, built the first time the id is seen.
        classif=classifiers.get(user_id)
        if classif is None:
            classif=classifier.BayesianClassifier(user_id,alluser_info)
            classifiers[user_id]=classif
        for story_id,_md5,_url,symbols in stories:
            rating=classif.rate(symbols.split())
            sql.query(store_rating,(user_id,story_id,rating,rating))

    # Stamp the stories only after all users have been handled.
    for story_id,_md5,_url,_symbols in stories:
        sql.query("update story set rated_date=now() where id=%s;",story_id)
    sql.db.close()
    print("INFO: (%s) rated %d stories for %d users" % (time.asctime(),len(stories),len(users)))
Beispiel #15
0
def _p2p_reply_for(data, addr, db_conn):
    """Return the reply text for one space-split command sent by ``addr``."""
    command = data[0]

    if command == 'ADD':
        if len(data) != 4:
            return 'ERROR ADD MESSAGE FORMAT ERROR'
        filename, filehash, filesize = data[1:]
        try:
            filesize = int(filesize)
        except ValueError:
            # A non-numeric size from a client must not take the server down.
            return 'ERROR ADD MESSAGE FORMAT ERROR'
        if not sql.add_file(filename, filehash, filesize, addr, db_conn):
            return 'ERROR FILE ADDING FAILED'
        return 'OK'

    if command == 'DELETE':
        if len(data) != 3:
            return 'ERROR DELETE MESSAGE FORMAT ERROR'
        filename, filehash = data[1:]
        if not sql.delete_file(filename, filehash, addr, db_conn):
            return 'ERROR FILE DELETING FAILED'
        return 'OK'

    if command == 'LIST':
        if len(data) != 1:
            return 'ERROR LIST MESSAGE FORMAT ERROR'
        file_list = sql.list_file(db_conn)
        if not isinstance(file_list, list):
            return 'ERROR LIST COMMAND'
        return '\n'.join(file_list)

    if command == 'REQUEST':
        if len(data) != 2:
            return 'ERROR REQUEST MESSAGE FORMAT ERROR'
        ret = sql.request(data[1], db_conn)
        if ret == -1:
            return 'ERROR FILE NOT EXIST'
        peeraddr, filesize = ret
        return '%s %s' % (peeraddr, filesize)

    if command == 'HELP':
        return 'COMMEND: ADD DELETE LIST REQUEST'

    return 'ERROR UNDEFINED COMMEND, USE "HELP" TO LIST AVIBLE COMMEND'


def p2p_server(opened_socket, db_conn):
    """Serve tracker commands, one message per accepted connection.

    Each loop iteration accepts a connection, reads up to 1024 bytes,
    sends one reply and closes the connection.  ``QUIT`` is echoed back and
    ends the loop.  Until a ``CONNECT`` message has been received every
    other message is answered with an error; the flag recording this is a
    single local variable, so it is shared by all clients rather than kept
    per client.  Afterwards ``ADD``, ``DELETE``, ``LIST``, ``REQUEST`` and
    ``HELP`` are handled; the peer's IP address (without port) is passed to
    the ``sql`` helpers.

    Args:
        opened_socket: socket on which ``accept()`` is called.
        db_conn: handle passed through to the ``sql`` helper functions.
    """
    is_connected = False
    serving = True
    while serving:
        conn, addr = opened_socket.accept()
        # Every accepted connection is closed once its reply has been sent;
        # before, only the connection that carried QUIT was ever closed.
        try:
            print('Connected by %s' % (addr,))
            addr = addr[0]
            data = conn.recv(1024).strip()
            print(data)

            if data == 'QUIT':
                print('QUIT')
                conn.sendall('QUIT')
                serving = False
            elif not is_connected:
                if data == 'CONNECT':
                    print('%s send the connect information' % addr)
                    conn.sendall('ACCEPT\n')
                    is_connected = True
                else:
                    print('not connected')
                    conn.sendall('ERROR USER NOT CONNECTED\n')
            elif not data:
                # Checked before splitting: splitting an empty message still
                # yields one (empty) field, so a check after the split could
                # never fire.
                conn.sendall('ERROR EMPTY DATA\n')
            else:
                conn.sendall('%s\n' % _p2p_reply_for(data.split(' '), addr, db_conn))
        finally:
            conn.close()
def get_best_feeds(feed_number=10):
    """Return the rows of ``bayes_data`` with the highest good-minus-bad score.

    NOTE(review): despite the name, the query selects symbols, not feeds.

    Args:
        feed_number: value bound to the query's ``limit``.

    Returns:
        Whatever ``sql.request`` returns for the query: rows of
        ``(symbol, good_count - bad_count)``, highest score first.
    """
    query="select symbol,good_count - bad_count from bayes_data order by good_count-bad_count desc limit %s"
    best_rows=sql.request(query,feed_number)
    return best_rows
def html_feeds():
    """Build the HTML fragment linking to 10 feeds.

    NOTE(review): the heading says "recent feeds" but the query orders by
    ``fetch_date`` ascending, i.e. earliest first - confirm the intended
    direction.

    Returns:
        The ``<div class="feeds">`` fragment as a string.
    """
    # Function-scope import so the block does not depend on a module-level
    # import being present.
    from xml.sax import saxutils

    template="""<div class="feeds"><h1>recent feeds:</h1><p>%s</p></div>"""
    feed_template="""<a href="/feed/%s">%s</a>"""
    feeds=sql.request("select url_md5,url from feed order by fetch_date asc limit 10")
    # Feed URLs are user-submitted; escape them before writing them into the page.
    links=[feed_template % (url_md5,saxutils.escape(url)) for url_md5,url in feeds]
    return template % "<br/>".join(links)
def html_stories():
    """Build the HTML fragment listing the 10 most liked recent stories.

    Stories fetched less than 12 hours ago are ranked by how many
    ``recommended_story`` rows rate them ``'G'``.  Each one is rendered as
    a link to its detail page, labelled with the title.

    Returns:
        The ``<div class="stories">`` fragment as a string.
    """
    template="""<div class="stories"><h1>popular stories:</h1><p>%s</p></div>"""
    story_template="""<a href="/story/%s">%s</a> """
    stories=sql.request("select story.url_md5,story.url,story.hit_count,story.symbol_count,story.symbols,story.title from story,recommended_story where recommended_story.story_id=story.id and addtime(story.fetch_date,'12:00:00') > now() and recommended_story.user_rating='G' group by story.url order by count(*) desc limit 10")
    # An empty title would give a link with no visible text and a NULL one
    # would make escape() fail, so the URL is used as the label in both cases.
    links=[story_template % (story[0],saxutils.escape(story[5] or story[1])) for story in stories]
    return template % "<br/>".join(links)
def increment(dic,user_id,symbol):
    """Add one to the counter ``dic[user_id][symbol]``, creating it if needed.

    Args:
        dic: dict of dicts, modified in place.
        user_id: outer key; an empty inner dict is created when missing.
        symbol: inner key; a missing counter starts at 0.
    """
    # setdefault/get replace dict.has_key (gone in Python 3) and the
    # try/except KeyError used to initialise a missing counter.
    counters=dic.setdefault(user_id,{})
    counters[symbol]=counters.get(symbol,0)+1

if __name__=='__main__':
    goods={}
    bads={}
    all_symbols=set()
    all_users=set()
    all_stories=set()
    learned=[]
    header_printed=False
    for user_id,story_id,user_rating,symbols in sql.request("select user_id,story_id,user_rating,symbols from recommended_story,story \
            where id=story_id and user_rating != '?' and isnull(learned)"):
        if not header_printed:
            print "INFO: (%s) :"%time.asctime(),
            header_printed=True
        print ".",
        all_users.add(user_id)
        learned.append((user_id,story_id))
        for s in symbols.split():
            all_symbols.add(s)
            if user_rating=='G':
                increment(goods,user_id,s)
            else:
                increment(bads,user_id,s)
    for user in all_users:
        for s in all_symbols:
            g_inc,b_inc=(goods.get(user,{}).get(s,0),bads.get(user,{}).get(s,0))