def search_news(keyword=None, num=5):
    """Search Baidu news for ``keyword`` and return the parsed results.

    : keyword: iterable of search terms; the terms are joined with "+" to
               build the query string
    : num: accepted for interface compatibility; this function does not
           use it (NOTE(review): intended meaning unclear -- confirm
           before wiring it in)

    Returns a list of dicts, one per ``<li>`` element that could be parsed,
    each of which contains:
    : title: news title
    : url: news url
    : source: news source like "sina"
    : images: image urls in the content
    : content: news content
    : time: news publish time

    An empty list is returned when no keyword is given.  Items that fail to
    parse are logged and skipped.
    """
    if not keyword:
        return []
    logger.debug("baidu keywords= %s", keyword)

    # `br` is the browser object defined outside this function; it is
    # (re)configured on every call before the request is issued.
    br.set_cookiejar(cookielib.LWPCookieJar())
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    words = '+'.join(keyword)
    search_words = urllib.quote(str2utf8(words)[0])
    url = "%sword=%s" % (BAIDU_BASE_URL["news"], search_words)
    logger.debug("url= %s", url)

    soup = BeautifulSoup(br.open(url).read())
    news_list = []
    for item in soup.findAll("li"):
        try:
            news = {}
            news["title"] = item.a.text.strip()
            news["url"] = item.a.get("href")
            news["images"] = [img.get("src") for img in item.findAll("img")]

            # The <span> text is expected to look like "<source> <day> <time>".
            # NOTE(review): the first replace() looks like a no-op as written
            # (possibly meant "&nbsp;") -- confirm against the original file.
            decode_content = str2utf8(item.span.text)[0].decode("utf-8")\
                .replace(" ", " ").replace(u"\xa0", " ").strip()
            # Logged rather than printed: printing unicode here could raise
            # inside this try block and silently drop an otherwise valid item.
            logger.debug("temp= %s", decode_content)

            # Split from the right so a source name containing spaces stays
            # in one piece.
            parts = decode_content.rsplit(' ', 2)
            if len(parts) != 3:
                continue
            news["source"], day, clock = parts
            news["time"] = day + " " + clock
            news["content"] = item.find("div", {"class": "c-summary"}).text\
                .replace(u"- 百度快照", "").strip()
            news_list.append(news)
        except AttributeError as e:
            # A missing <a>, <span> or summary <div> surfaces as an
            # AttributeError on None; such items are skipped.
            logger.warning(e)
        except Exception:
            # format_exc() takes an optional `limit`, not the exception.
            logger.error(traceback.format_exc())
    return news_list
def get_msg(keyword=None, name=None):
    """Grab messages for ``keyword`` and return the ones that pass filtering.

    Per the original note on this function, the intended filter is: items
    not yet present in the news/weibo content database are returned; items
    already present but published within MAX_LONG_TIME are returned as well;
    everything else is filtered out.  The filtering itself is delegated to
    ``can_send_news``.

    : keyword: key word list for searching news
    : name: weibo account (the weibo lookup is currently disabled, so this
            only takes part in the "nothing to do" check below)

    Returns a dict whose "news" key holds the result of ``can_send_news``.
    Returns an empty dict when neither ``keyword`` nor ``name`` is given, or
    when any error occurs (the traceback is logged).
    """
    try:
        if not keyword and not name:
            return {}
        send_contents = {}
        news_list = search_news(keyword)
        send_contents["news"] = can_send_news(keyword, news_list)
        logger.debug("can send %d news after filtered",
                     len(send_contents["news"]))
        # Weibo support is disabled; kept for reference.
        # if name:
        #     weibo = search_weibo(name)
        #     ret = can_send_weibo(name, weibo)
        #     if ret:
        #         send_contents["weibo"] = ret
        return send_contents
    except Exception:
        # Best-effort boundary: log the full traceback and report "nothing
        # to send".  format_exc() takes an optional `limit`, not the
        # exception object.
        logger.error(traceback.format_exc())
        return {}