def html_stories_info():
    """Render the stories overview fragment.

    Runs three queries against the ``story`` table and lists:
    up to 30 fetched stories that have symbols (highest symbol_count first),
    every story whose fetch_date is NULL, and every fetched story whose
    symbol_count is 0.

    Returns the ``<div class="stories_info">`` block as a string.
    """
    fetched_rows = sql.request(
        "select url_md5,url,hit_count,symbol_count from story where not isnull(fetch_date) and not symbol_count=0 order by symbol_count desc limit 30"
    )
    pending_rows = sql.request("select url_md5,url from story where isnull(fetch_date) order by id desc")
    symbolless_rows = sql.request(
        "select url from story where not isnull(fetch_date) and symbol_count=0 order by id desc"
    )

    # Only the URL is escaped; url_md5 and the counters are interpolated as-is.
    fetched_links = []
    for row in fetched_rows:
        fetched_links.append(
            '<a href="/story/%s">%s</a> (%d hits, %d symbols)'
            % (row[0], saxutils.escape(row[1]), row[2], row[3])
        )

    pending_links = []
    for row in pending_rows:
        pending_links.append('<a href="/story/%s">%s</a>' % (row[0], saxutils.escape(row[1])))

    symbolless_items = []
    for row in symbolless_rows:
        symbolless_items.append("%s" % saxutils.escape(row[0]))

    page = (
        '<div class="stories_info">'
        '<h1>%d last fetched stories:</h1><p>%s</p>'
        '<h1>%d never fetched stories:</h1><p>%s</p>'
        '<h1>%d no symbol stories:</h1><p>%s</p>'
        '</div>'
    )
    separator = "<br/>"
    return page % (
        len(fetched_rows),
        separator.join(fetched_links),
        len(pending_rows),
        separator.join(pending_links),
        len(symbolless_rows),
        separator.join(symbolless_items),
    )
def html_story_info(story_md5):
    """Render the detail fragment for one story.

    story_md5: value compared against ``story.url_md5``.

    Returns the ``<div class="story_info">`` block, or a fixed error block
    when no story matches.
    """
    # The link text is truncated BEFORE escaping (see below), so the template
    # uses a plain %s instead of the former %.100s.
    template = (
        '<div class="story_info"><h1><a href="%s">%s</a></h1>'
        '<p>fetched %d times, contains %d symbols, last fetching %s</p>'
        '<p>symbols: %s</p><h1>found in feeds:</h1><p>%s</p></div>'
    )
    error_template = """<div class="story_info"><h1>can't found story info!!!</h1></div>"""
    feed_template = '<a href="/feed/%s">%s</a> (%d hits)'

    rows = sql.request(
        "select id,url,url,hit_count,symbol_count,fetch_date,symbols from story where url_md5=%s limit 30", story_md5
    )
    if not rows:
        return error_template

    # The query selects url twice; the duplicate column is ignored here.
    story_id, url, _, hit_count, symbol_count, fetch_date, symbols = rows[0]

    feeds = sql.request(
        "select feed.url_md5,feed.url,feed.hit_count from feed,feed_story where feed.id=feed_story.feed_id and feed_story.story_id=%s",
        int(story_id),
    )

    # saxutils.escape() leaves double quotes alone, which is not enough inside
    # an href="..." attribute, so quotes are mapped explicitly.
    href = saxutils.escape(url, {'"': "&quot;"})
    # Truncating the already-escaped text could cut an entity such as "&amp;"
    # in half; cut the raw URL first, then escape.
    label = saxutils.escape(url[:100])

    # Feed URLs are escaped like story URLs instead of being emitted raw.
    feed_links = [
        feed_template % (feed_md5, saxutils.escape(feed_url), feed_hits)
        for feed_md5, feed_url, feed_hits in feeds
    ]

    return template % (
        href,
        label,
        # Stories listed as "never fetched" link to this page; their counters
        # and symbols may be NULL (assumption based on the isnull() queries
        # used elsewhere), which would break %d / escape().
        hit_count or 0,
        symbol_count or 0,
        fetch_date,
        saxutils.escape(symbols or ""),
        "<br/>".join(feed_links),
    )
def fetch():
    """Refresh stale feeds and store the stories they yield.

    Does nothing (apart from logging) when 250 or more stories have a NULL
    rated_date. Otherwise, for every feed selected by the query below, each
    (url, title, symbols) triple produced by get_stories() is upserted into
    ``story`` and linked to the feed in ``feed_story``; the feed's fetch_date
    is then set to now().
    """
    # Is the incoming queue already full?
    unrated = sql.request("select count(id) from story where rated_date is null")[0][0]
    if unrated >= 250:
        print("INFO: no more urls needed")
        return

    # NOTE(review): AND binds tighter than OR in SQL, so this reads
    # "(never fetched and url non-empty) or (fetched more than 1h ago)".
    feeds = sql.request(
        "select url,added_by from feed where isnull(fetch_date) and not url= '' or addtime(fetch_date,'01:00:00') < now()"
    )

    for feed, added_by in feeds:
        for url, title, symbols in get_stories(feed, added_by):
            sql.query(
                "insert into story (url,url_md5,hit_count,symbols,symbol_count,fetch_date,title)"
                " values (%s,md5(%s),0,%s,%s,now(),%s)"
                " on duplicate key update title=%s",
                (url, url, symbols, len(symbols.split()), title, title),
            )
            # "on duplicate key update story_id=story_id" is a no-op update
            # used to ignore links that already exist.
            sql.query(
                "insert into feed_story (story_id,feed_id)"
                " select story.id,feed.id"
                " from story,feed"
                " where story.url_md5=md5(%s)"
                " and feed.url_md5=md5(%s)"
                " on duplicate key update story_id=story_id",
                (url, feed),
            )
        # NOTE(review): executed once per feed, after its stories; confirm this
        # matches the original nesting (the source indentation was lost).
        sql.query("update feed set fetch_date=now() where url_md5=md5(%s)", feed)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("INFO: updated %d feeds (%s)" % (len(feeds), time.ctime()))
def html_feeds_info():
    """Render the feeds overview fragment.

    Lists every feed with a non-NULL fetch_date (ordered by hit_count
    ascending) and every feed with a NULL fetch_date (newest id first).

    Returns the ``<div class="feeds_info">`` block as a string.
    """
    # Imported locally so the block does not rely on a module-level import.
    from xml.sax import saxutils

    template = (
        '<div class="feeds_info">'
        '<h1>%d fetched feeds:</h1><p>%s</p>'
        '<h1>%d never fetched feeds:</h1><p>%s</p>'
        '</div>'
    )
    fetched_feed_template = '<a href="/feed/%s">%s</a> (%d hits)'
    never_fetched_feed_template = '<a href="/feed/%s">%s</a>'

    fetched_feed = sql.request(
        "select url_md5,url,hit_count from feed where not isnull(fetch_date) order by hit_count asc"
    )
    never_fetched_feed = sql.request("select url_md5,url from feed where isnull(fetch_date) order by id desc")

    # Feed URLs are stored from submitted input and were previously
    # interpolated raw; escape them before putting them in markup.
    fetched_links = [
        fetched_feed_template % (url_md5, saxutils.escape(url), hit_count)
        for url_md5, url, hit_count in fetched_feed
    ]
    never_fetched_links = [
        never_fetched_feed_template % (url_md5, saxutils.escape(url))
        for url_md5, url in never_fetched_feed
    ]

    return template % (
        len(fetched_feed),
        "<br/>".join(fetched_links),
        len(never_fetched_feed),
        "<br/>".join(never_fetched_links),
    )
def html_liked_symbols(param,session):
    """Render (and optionally update) the user's liked-symbols form.

    param: request parameters; when it has the key "update_liked_symbols",
        param["liked_symbols"] is stored for the user first.
    session: mapping providing the current user's "login".

    Returns the ``<div class="liked_symbols">`` block: an edit form
    pre-filled with the stored symbols, followed by the user's ten best
    scoring bayes_data symbols (good_count - bad_count, longer than 3
    characters, not starting with "special_").
    """
    login = session["login"]

    # has_key() is kept deliberately: the concrete type of param is not
    # visible here, so support for the "in" operator cannot be assumed.
    if param.has_key("update_liked_symbols"):
        sql.request(
            "update kolmognus_user set liked_symbols=%s where login=%s",
            (param["liked_symbols"], login),
        )

    template = (
        '<div class="liked_symbols"><h1>symbols you said you like:</h1>'
        '<form action=""><p>'
        '<input class="button_input" type="submit" value="submit" name="update_liked_symbols"/>'
        '<input class="text_input" size="60" maxlength="200" type="text" name="liked_symbols" value="%s"/>'
        '</p></form>'
        '<h1>symbols kolmognus thinks you like:</h1><p>%s</p></div>'
    )
    kolmognus_liked_symbol_template = '<span class="symbol" style="font-size: %dpt">%s(%d)</span>'

    user_liked_symbols = sql.request(
        "select liked_symbols from kolmognus_user where login=%s", login
    )[0][0]
    kolmognus_liked_symbols = sql.request(
        "select symbol, good_count-bad_count"
        " from bayes_data, kolmognus_user where length(bayes_data.symbol) > 3"
        " and bayes_data.symbol not like 'special\\_%%'"
        " and bayes_data.user_id=kolmognus_user.id and kolmognus_user.login=%s"
        " order by good_count-bad_count desc limit 10",
        login,
    )

    # The stored text lands inside value="...": saxutils.escape() does not
    # touch double quotes, so they are mapped explicitly. A NULL column is
    # rendered as an empty field instead of crashing escape().
    field_value = saxutils.escape(user_liked_symbols or "", {'"': "&quot;"})

    spans = [
        kolmognus_liked_symbol_template % (compute_size(score), saxutils.escape(symbol), score)
        for symbol, score in kolmognus_liked_symbols
    ]
    return template % (field_value, " ".join(spans))
def html_feed_submitter(param,session):
    """Render the feed submission form, storing a submitted feed first.

    param: request parameters; a submission is processed when it has the key
        "update_submit_feed" and param["submit_feed"] is not empty.
    session: mapping providing the submitting user's "login".

    A submitted URL is only stored if urllib2 can open it; URLs that are
    malformed or unreachable are silently dropped. Always returns the same
    static form markup.
    """
    template = (
        '<div class="feed_submitter"><h1>submit a feed:</h1>'
        '<form method="post" action=""><p>'
        '<input class="button_input" type="submit" value="submit" name="update_submit_feed"/>'
        '<input class="text_input" size="60" maxlength="500" type="text" name="submit_feed" value=""/>'
        '</p></form></div>'
    )

    # has_key() is kept deliberately: the concrete type of param is not
    # visible here, so support for the "in" operator cannot be assumed.
    if param.has_key("update_submit_feed") and param["submit_feed"] != "":
        import urllib2

        feed_url = param["submit_feed"]
        try:
            # NOTE(review): this makes the server fetch an arbitrary
            # user-supplied URL (SSRF risk); consider restricting schemes/hosts.
            # The response is closed right away; it used to be leaked.
            urllib2.urlopen(feed_url).close()
        except (urllib2.URLError, ValueError):
            # Best effort, as before: an unusable URL is simply not stored.
            pass
        else:
            # The statement used to read "on duplicate url=url", which is not
            # valid MySQL; the no-op upsert is spelled
            # "on duplicate key update".
            sql.request(
                "insert into feed (url,url_md5,hit_count,added_by) values (%s,md5(%s),0,%s)"
                " on duplicate key update url=url",
                (feed_url, feed_url, session["login"]),
            )

    return template
def __init__(self,user_id,alluser_info=None):
    """Load one user's symbol statistics and precompute a ratio per symbol.

    user_id: matched against bayes_data.user_id and kolmognus_user.id.
    alluser_info: optional mapping stored on self.alluser_info (presumably
        consulted by get_prior_value(), which is not visible here). A fresh
        dict is created when omitted; the former ``{}`` default was a single
        object shared by every instance built without this argument.

    Fills self.symb_ratio with, for each symbol of the user:
        (s * prior(symbol) + good) / (s + bad + good)
    where s is BayesianClassifier.s and good gets +5 for symbols the user
    listed in liked_symbols.
    """
    import sql

    if alluser_info is None:
        alluser_info = {}
    # Must be set before the loop: get_prior_value() is called below.
    self.alluser_info = alluser_info

    bayes_data = sql.request(
        "select symbol,good_count,bad_count from bayes_data where user_id=%s", user_id
    )
    self.symb_ratio = {}
    liked_symbols = set(
        sql.request("select liked_symbols from kolmognus_user where id=%s", user_id)[0][0].split()
    )

    strength = BayesianClassifier.s
    for symbol, good, bad in bayes_data:
        if symbol in liked_symbols:
            # Empirical bonus for symbols the user manually tagged as good.
            good += 5
        # f(w) for this symbol.
        self.symb_ratio[symbol] = (strength * self.get_prior_value(symbol) + good) / (strength + bad + good)
def html_rated_stories(session):
    """Render the list of stories the logged-in user has rated.

    session: mapping providing the user's "login".

    Shows at most 20 recommended_story rows of that user whose user_rating is
    not '?', most recent userrating_date first, each as a link to the story
    page followed by the rating.

    Returns the ``<div class="rated_stories">`` block as a string.
    """
    query = (
        "select story.url_md5, story.url, recommended_story.user_rating from story, recommended_story, kolmognus_user"
        " where recommended_story.user_id=kolmognus_user.id and recommended_story.story_id=story.id"
        " and kolmognus_user.login=%s and not recommended_story.user_rating='?'"
        " order by recommended_story.userrating_date desc limit 20"
    )
    rows = sql.request(query, session['login'])

    # Only the URL is escaped; md5 and rating are interpolated as-is.
    entries = []
    for row in rows:
        entries.append(
            '<a href="/story/%s">%s</a> <span class="rating">%s</span>'
            % (row[0], saxutils.escape(row[1]), row[2])
        )

    wrapper = '<div class="rated_stories"><h1>rated stories:</h1><p>%s</p></div>'
    return wrapper % "<br/>".join(entries)
def html_feed_info(feed_md5):
    """Render the detail fragment for one feed.

    feed_md5: value compared against ``feed.url_md5``.

    Returns the ``<div class="feed_info">`` block listing the feed's stories
    (those with non-NULL symbols, and separately those with NULL symbols),
    or a fixed error block when no feed matches.
    """
    # Text is truncated BEFORE escaping (see below), so the templates use a
    # plain %s where they previously used %.50s.
    template = (
        '<div class="feed_info"><h1><a href="%s">%s</a></h1>'
        '<p>fetched %d times, last fetching %s</p>'
        '<h1>stories:</h1><p>%s</p>'
        '<h1>empty stories:</h1><p>%s</p></div>'
    )
    error_template = """<div class="feed_info"><h1>can't found feed info!!!</h1></div>"""
    story_template = '<a href="/story/%s">%s</a> (%d hits) [%s]'
    empty_story_template = "%s"

    rows = sql.request("select id,url,url,hit_count,fetch_date from feed where url_md5=%s", feed_md5)
    if not rows:
        return error_template

    # The query selects url twice; the duplicate column is ignored here.
    feed_id, feed_url, _, hit_count, fetch_date = rows[0]

    stories = sql.request(
        "select story.url_md5,story.url,story.hit_count,story.symbols"
        " from story,feed_story where story.id=feed_story.story_id"
        " and feed_story.feed_id=%s and not isnull(story.symbols)",
        feed_id,
    )
    empty_stories = sql.request(
        "select story.url"
        " from story,feed_story where story.id=feed_story.story_id"
        " and feed_story.feed_id=%s and isnull(story.symbols)",
        feed_id,
    )

    # The feed URL used to be emitted raw. saxutils.escape() leaves double
    # quotes alone, so they are mapped explicitly for the href attribute.
    href = saxutils.escape(feed_url, {'"': "&quot;"})
    # Cut the raw text first, then escape: truncating escaped text could
    # split an entity such as "&amp;".
    label = saxutils.escape(feed_url[:50])

    story_links = [
        story_template % (url_md5, saxutils.escape(url), hits, saxutils.escape(symbols[:50]))
        for url_md5, url, hits, symbols in stories
    ]
    empty_items = [empty_story_template % saxutils.escape(row[0]) for row in empty_stories]

    return template % (
        href,
        label,
        hit_count,
        fetch_date,
        "<br/>".join(story_links),
        "<br/>".join(empty_items),
    )
def get_valid_cached_symbols_for_story(url):
    """Return the stored symbols of a story fetched within the last 30 minutes.

    url: story URL; matched through md5(url) against story.url_md5.

    Returns the ``symbols`` column of the first matching row, or an empty
    unicode string when the query yields nothing.
    """
    query = (
        "select symbols from story where"
        " url_md5=md5(%s)"
        " and addtime(fetch_date,'00:30:00') >= now()"
    )
    matches = sql.request(query, url)
    if not matches:
        return u""
    first_row = matches[0]
    return first_row[0]
def get_alluser_info(uncertainty=0.3):
    """Build a per-symbol score from the counts of all users combined.

    uncertainty: weight (default 0.3, the previously hard-coded value) given
        to the neutral value 0.5 when blending it with the observed ratio.

    For each symbol of bayes_data the summed good and bad counts give
        ratio = good / (good + bad)
        score = ratio * (1 - uncertainty) + uncertainty * 0.5

    Returns a dict mapping symbol -> score.
    """
    import sql

    alluser_info = {}
    rows = sql.request("select symbol,sum(good_count),sum(bad_count) from bayes_data group by symbol")
    for symbol, good, bad in rows:
        total = float(bad + good)
        if total:
            ratio = float(good) / total
        else:
            # No observation at all for this symbol: treat it as neutral
            # rather than raising ZeroDivisionError.
            ratio = 0.5
        alluser_info[symbol] = ratio * (1 - uncertainty) + uncertainty * 0.5
    return alluser_info
def html_recommended_stories(session):
    """Render the logged-in user's recommended stories with rating buttons.

    session: mapping providing the user's "login".

    Shows at most 10 stories of that user that are still unrated
    (user_rating '?') and were fetched less than 24 hours ago, best
    computed_rating first. Each entry is its own POST form with good/bad
    image buttons, a link to the story, the rating and a details link.

    Returns the ``<div class="recommended_stories">`` block as a string.
    """
    template = '<div class="recommended_stories"><h1>recommended stories:</h1>%s</div>'
    recommended_story_template = (
        '<form method="post" action=""><p>'
        '<input type="image" class="good" value="good" name="rating" src="/image/good.png" alt="good"/>'
        '<input type="image" class="bad" value="bad" name="rating" src="/image/bad.png" alt="bad"/>'
        ' <a href="%s">%s</a>'
        ' <span style="color: %s;">%.2f</span>'
        ' <a class="details" href="/story/%s">show details</a>'
        '<input type="hidden" name="story_id" value="%d"/>'
        '</p></form>'
    )

    recommended_stories = sql.request(
        "select story.url_md5, story.url, recommended_story.computed_rating, story.id,"
        "if(story.title='',story.url,story.title) from story, recommended_story, kolmognus_user"
        " where recommended_story.user_id=kolmognus_user.id and recommended_story.story_id=story.id"
        " and kolmognus_user.login=%s and recommended_story.user_rating='?'"
        " and addtime(story.fetch_date,'24:00:00') > now()"
        " order by recommended_story.computed_rating desc"
        " limit 10",
        session['login'],
    )

    forms = []
    for url_md5, url, rating, story_id, title in recommended_stories:
        forms.append(
            recommended_story_template
            % (
                # saxutils.escape() leaves double quotes alone; inside
                # href="..." they must be escaped too.
                saxutils.escape(url, {'"': "&quot;"}),
                # The SQL if() only covers an empty title; a NULL title
                # comes back as None, so fall back to the URL here as well.
                saxutils.escape(title or url),
                compute_rating_color(rating),
                rating,
                url_md5,
                story_id,
            )
        )
    return template % "".join(forms)
def get_valid_cached_stories_for_feed(feed):
    """Return the story URLs linked to a feed fetched within the last 30 minutes.

    feed: feed URL; matched through md5(feed) against feed.url_md5.

    Returns a list with the first column of every matching row, or an empty
    list when the query yields nothing.
    """
    query = (
        "select story.url from feed_story,feed,story"
        " where story.id=feed_story.story_id"
        " and feed_story.feed_id=feed.id"
        " and feed.url_md5=md5(%s)"
        " and addtime(feed.fetch_date,'00:30:00') >= now()"
    )
    cached = sql.request(query, feed)
    if not cached:
        return []
    urls = []
    for record in cached:
        urls.append(record[0])
    return urls
#!/usr/bin/env python
# Rating script: computes, for every user, a rating for each recently fetched
# story that is due for (re)rating, stores it in recommended_story and then
# stamps the stories as rated.
import time

import sql
import classifier

if __name__ == '__main__':
    RERATE_DELAY = "0 00:00:01"

    # Stories fetched less than 12 hours ago that were never rated, or whose
    # last rating is older than RERATE_DELAY (old news is not re-rated).
    due_stories = sql.request(
        "select id,url_md5,url,symbols from story where addtime(fetch_date,'12:00:00') > now() and (not isnull(fetch_date)) and (isnull(rated_date) or addtime(rated_date,%s) < now())",
        RERATE_DELAY,
    )
    accounts = sql.request("select id,login from kolmognus_user")

    raters = {}
    shared_info = classifier.get_alluser_info()

    # Rate every due story for every user.
    for account in accounts:
        uid = account[0]
        if uid not in raters:
            raters[uid] = classifier.BayesianClassifier(uid, shared_info)
        rater = raters[uid]
        for story_row in due_stories:
            story_id, symbols = story_row[0], story_row[3]
            score = rater.rate(symbols.split())
            sql.query(
                "insert into recommended_story (user_id,story_id,computed_rating)"
                " values(%s,%s,%s)"
                " on duplicate key update computed_rating=%s",
                (uid, story_id, score, score),
            )

    # Mark the stories as rated.
    for story_row in due_stories:
        sql.query("update story set rated_date=now() where id=%s;", story_row[0])

    sql.db.close()

    summary = "INFO: (%s) rated %d stories for %d users" % (time.asctime(), len(due_stories), len(accounts))
    print(summary)
def _p2p_dispatch(fields, addr, db_conn):
    """Run one parsed command and return its reply text (no trailing newline).

    fields: non-empty list of whitespace-separated tokens; fields[0] is the
        command name.
    addr: peer host, passed through to the sql helpers.
    db_conn: database handle, passed through to the sql helpers.
    """
    command = fields[0]
    argc = len(fields)

    if command == 'ADD':
        if argc != 4:
            return 'ERROR ADD MESSAGE FORMAT ERROR'
        filename, filehash, filesize = fields[1:]
        try:
            size = int(filesize)
        except ValueError:
            # A non-numeric size is a malformed message, not a server crash.
            return 'ERROR ADD MESSAGE FORMAT ERROR'
        if not sql.add_file(filename, filehash, size, addr, db_conn):
            return 'ERROR FILE ADDING FAILED'
        return 'OK'

    if command == 'DELETE':
        if argc != 3:
            return 'ERROR DELETE MESSAGE FORMAT ERROR'
        filename, filehash = fields[1:]
        if not sql.delete_file(filename, filehash, addr, db_conn):
            return 'ERROR FILE DELETING FAILED'
        return 'OK'

    if command == 'LIST':
        if argc != 1:
            return 'ERROR LIST MESSAGE FORMAT ERROR'
        file_list = sql.list_file(db_conn)
        if not isinstance(file_list, list):
            return 'ERROR LIST COMMAND'
        return '\n'.join(file_list)

    if command == 'REQUEST':
        if argc != 2:
            return 'ERROR REQUEST MESSAGE FORMAT ERROR'
        ret = sql.request(fields[1], db_conn)
        if ret == -1:
            return 'ERROR FILE NOT EXIST'
        peeraddr, filesize = ret
        return '%s %s' % (peeraddr, filesize)

    if command == 'HELP':
        return 'COMMEND: ADD DELETE LIST REQUEST'

    return 'ERROR UNDEFINED COMMEND, USE "HELP" TO LIST AVIBLE COMMEND'


def p2p_server(opened_socket, db_conn):
    """Serve the file-index protocol, one message per accepted connection.

    opened_socket: object whose accept() returns (connection, address).
    db_conn: database handle handed to the sql helpers.

    Loop: accept a connection, read up to 1024 bytes, answer, close it.
    'QUIT' is echoed back and ends the loop. Until some client has sent
    'CONNECT' every other message is rejected; afterwards ADD, DELETE, LIST,
    REQUEST and HELP are served. Reply strings are part of the wire protocol
    and are kept unchanged, spelling included.

    NOTE(review): is_connected is one flag for the whole server, not per
    client; once any peer has sent CONNECT, all peers are accepted.
    """
    is_connected = False
    while True:
        conn, addr = opened_socket.accept()
        # try/finally: every accepted connection is closed, including on
        # break/continue. Previously they were never closed inside the loop.
        try:
            print('Connected by %s' % (addr,))
            addr = addr[0]
            data = conn.recv(1024).strip()
            print(data)

            if data == 'QUIT':
                print('QUIT')
                conn.sendall('QUIT')
                break

            if not is_connected:
                if data == 'CONNECT':
                    print('%s send the connect information' % addr)
                    conn.sendall('ACCEPT\n')
                    is_connected = True
                else:
                    print('not connected')
                    conn.sendall('ERROR USER NOT CONNECTED\n')
                # The handshake message is fully handled here; it must not
                # fall through to the command parser.
                continue

            # split() without argument ignores repeated blanks and yields []
            # for an empty message (split(' ') never returned an empty list,
            # so the empty-data reply was unreachable).
            fields = data.split()
            if not fields:
                conn.sendall('ERROR EMPTY DATA\n')
                continue

            conn.sendall('%s\n' % _p2p_dispatch(fields, addr, db_conn))
        finally:
            conn.close()
def get_best_feeds(feed_number=10):
    """Return the best-scoring rows of bayes_data.

    feed_number: value bound to the query's ``limit`` (default 10).

    Each row is (symbol, good_count - bad_count), highest difference first.
    NOTE(review): despite the function name, the query reads symbols from
    bayes_data, not feeds.
    """
    query = "select symbol,good_count - bad_count from bayes_data order by good_count-bad_count desc limit %s"
    best_rows = sql.request(query, feed_number)
    return best_rows
def html_feeds():
    """Render a short list of feeds as links to their detail pages.

    Shows the first 10 feeds ordered by fetch_date ascending.
    NOTE(review): the heading says "recent feeds" but ascending order returns
    the oldest fetch dates first — confirm whether ``desc`` was intended.

    Returns the ``<div class="feeds">`` block as a string.
    """
    # Imported locally so the block does not rely on a module-level import.
    from xml.sax import saxutils

    template = '<div class="feeds"><h1>recent feeds:</h1><p>%s</p></div>'
    feed_template = '<a href="/feed/%s">%s</a>'

    feeds = sql.request("select url_md5,url from feed order by fetch_date asc limit 10")

    # Feed URLs were previously interpolated raw; escape them before putting
    # them in markup.
    links = [feed_template % (url_md5, saxutils.escape(url)) for url_md5, url in feeds]
    return template % "<br/>".join(links)
def html_stories():
    """Render the most often positively rated recent stories.

    Selects stories fetched less than 12 hours ago that have
    recommended_story rows with user_rating 'G', grouped by URL and ordered
    by the number of such rows (at most 10).

    Returns the ``<div class="stories">`` block as a string.
    """
    template = '<div class="stories"><h1>popular stories:</h1><p>%s</p></div>'
    # The trailing space after </a> is part of the original markup.
    story_template = '<a href="/story/%s">%s</a> '

    stories = sql.request(
        "select story.url_md5,story.url,story.hit_count,story.symbol_count,story.symbols,story.title from story,recommended_story where recommended_story.story_id=story.id and addtime(story.fetch_date,'12:00:00') > now() and recommended_story.user_rating='G' group by story.url order by count(*) desc limit 10"
    )

    links = []
    for story in stories:
        url_md5, url, title = story[0], story[1], story[5]
        # An empty title would render an invisible link and a NULL title
        # would crash escape(); fall back to the URL, as the recommended
        # stories query does with if(story.title='',story.url,story.title).
        links.append(story_template % (url_md5, saxutils.escape(title or url)))
    return template % "<br/>".join(links)
def increment(dic,user_id,symbol):
    """Add one to the counter dic[user_id][symbol], creating it if needed.

    dic: dict mapping user_id -> {symbol: count}; modified in place.
    user_id: outer key; an empty inner dict is created when missing.
    symbol: inner key; a missing counter starts at 1.
    """
    per_user = dic.setdefault(user_id, {})
    per_user[symbol] = per_user.get(symbol, 0) + 1


if __name__=='__main__':
    # Learning pass: count, per user, how often each symbol appears in stories
    # the user rated good ('G') versus anything else, over rated stories that
    # have not been learned yet.
    goods = {}
    bads = {}
    all_symbols = set()
    all_users = set()
    all_stories = set()
    learned = []
    header_printed = False
    for user_id, story_id, user_rating, symbols in sql.request(
            "select user_id,story_id,user_rating,symbols from recommended_story,story "
            "where id=story_id and user_rating != '?' and isnull(learned)"):
        # "print(...)," is still the Python 2 print statement with a trailing
        # comma (no newline), exactly as before; the parentheses only keep the
        # line parseable by Python 3 tools. Porting to Python 3 needs end=" ".
        if not header_printed:
            print("INFO: (%s) :" % time.asctime()),
            header_printed = True
        print("."),
        all_users.add(user_id)
        learned.append((user_id, story_id))
        for s in symbols.split():
            all_symbols.add(s)
            if user_rating == 'G':
                increment(goods, user_id, s)
            else:
                increment(bads, user_id, s)
    for user in all_users:
        for s in all_symbols:
            # NOTE(review): g_inc/b_inc are not used in the visible code; the
            # script presumably continues past this point — confirm.
            g_inc, b_inc = (goods.get(user, {}).get(s, 0), bads.get(user, {}).get(s, 0))