# -*- coding: utf-8 -*-
import cookielib
import hashlib
import logging
import traceback
import urllib
from datetime import datetime

import mechanize
from BeautifulSoup import BeautifulSoup  # bs4's BeautifulSoup also works here

# str2utf8, BAIDU_BASE_URL, WEIBO_BASE_URL, words_collection, sessionCM and
# AccountSetting are assumed to be defined elsewhere in this project.
logger = logging.getLogger(__name__)
br = mechanize.Browser()  # shared module-level browser


def search_news(keyword=None, num=5):
    """
    return result of search news, a list of dicts, each of which contains :
    : title  : news title
    : url    : news url
    : source : news source like "sina"
    : images : image urls in the content
    : content: news content
    : time   : news publish time
    """
    if not keyword:
        return
    logger.debug("baidu keywords=%s", keyword)

    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent',
                      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                      'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # keyword is a list of words; Baidu expects them joined with '+'
    words = '+'.join(keyword)
    search_words = urllib.quote(str2utf8(words)[0])
    url = "%sword=%s" % (BAIDU_BASE_URL["news"], search_words)
    logger.debug("url=%s", url)

    result = br.open(url).read()
    soup = BeautifulSoup(result)
    news_list = []
    for c in soup.findAll("li"):
        try:
            news = {}
            news["title"] = c.a.text.strip()
            news["url"] = c.a.get("href")
            news["images"] = [img.get("src") for img in c.findAll("img")]
            decode_content = str2utf8(c.span.text)[0].decode("utf-8")\
                .replace("&nbsp;", " ").replace(u"\xa0", " ").strip()
            # the span text ends with "<source> <day> <time>"
            temp = decode_content.rsplit(' ', 2)
            if len(temp) != 3:
                continue
            news["source"], day, pub_time = temp
            news["time"] = day + " " + pub_time
            news["content"] = c.find("div", {"class": "c-summary"}).text\
                .replace(u"- 百度快照", "").strip()
            news_list.append(news)
        except AttributeError, e:
            logger.warning(e)
            continue
        except Exception:
            logger.error(traceback.format_exc())
            continue
    # cap the result at `num` entries and return it
    return news_list[:num]
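
# A minimal usage sketch (assumes the module-level helpers above are
# configured and that Baidu's markup still matches the selectors used):
#
#     for item in search_news([u"奥运会"], num=3) or []:
#         print item["title"], item["source"], item["time"]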
def search_weibo(keyword):
    """
    return result of search weibo, a dict which contains :
    : url    : weibo url
    : content: weibo content
    : time   : weibo publish time
    """
    if not keyword:
        return
    new_weibo = {}
    encode_words = urllib.quote(str2utf8(keyword)[0])
    url = "%s%s" % (WEIBO_BASE_URL, encode_words)
    result = br.open(url).read()
    soup = BeautifulSoup(result)
    new_weibo["url"] = soup.find("p", {"class": "person_addr"}).a.string
    new_weibo["content"] = "\n\n".join(
        [p.text for p in soup.findAll("div", {"class": "person_newwb"})])
    # normalize a time like u"(8月23日 12:34)" to "<year>-8-23 12:34:00"
    year = "%s-" % datetime.now().year
    raw_time = soup.find("div", {"class": "person_newwb"})\
        .p.findAll("a")[-1].text
    new_weibo["time"] = year + raw_time.strip("()")\
        .replace(u"月", "-").replace(u"日", "").strip(" ") + ":00"
    return new_weibo
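
# A minimal usage sketch (assumes WEIBO_BASE_URL is a weibo search URL whose
# result page carries the person_addr / person_newwb classes parsed above):
#
#     weibo = search_weibo(u"北京")
#     if weibo:
#         print weibo["time"]
#         print weibo["content"]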
@classmethod
def add_words(cls, email, add_word_list, interval=300):
    """ add new search words for one email """
    add_word_list = [str2utf8(word)[0] for word in add_word_list]
    words_key = cal_key(','.join(add_word_list))
    ret = words_collection.find_one({"key": words_key})
    if not ret:
        words_collection.insert({"word": add_word_list,
                                 "email_list": [email],
                                 "key": words_key,
                                 "interval": interval})
    elif email not in set(ret.get("email_list", [])):
        # $addToSet adds the email atomically and skips duplicates
        words_collection.update({"key": words_key},
                                {"$addToSet": {"email_list": email}})
    return True
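
# Shape of the document the Mongo-backed add_words writes (a sketch; the
# field values here are illustrative only):
#
#     {
#         "word": ["word1", "word2"],          # utf-8 encoded search words
#         "email_list": ["user@example.com"],  # subscribers to this word set
#         "key": cal_key("word1,word2"),       # md5 key, see cal_key below
#         "interval": 300,                     # polling interval in seconds
#     }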
@classmethod
def add_words(cls, email, add_word_list):
    """ add new search words for one email """
    with sessionCM() as session:
        add_word_list = [str2utf8(word)[0] for word in add_word_list]
        ret = session.query(AccountSetting).filter_by(email=email)\
            .filter(AccountSetting.keyword.in_(add_word_list)).all()
        exist_words = [str2utf8(a.keyword)[0] for a in ret]
        # only add the keywords this email does not have yet
        need_add_words = list(set(add_word_list) - set(exist_words))
        for word in need_add_words:
            new_setting = cls(email=email, keyword=word)
            session.add(new_setting)
        session.commit()
        return True
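
# A minimal usage sketch, assuming this add_words is a classmethod on the
# AccountSetting model (the cls(email=..., keyword=...) call suggests so):
#
#     AccountSetting.add_words("user@example.com", [u"奥运会", u"世界杯"])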
def cal_key(content):
    """ md5 key used to identify a joined word list """
    return hashlib.md5(str2utf8(content)[0]).hexdigest()
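
# cal_key produces the dedup key used by the Mongo-backed add_words above:
#
#     cal_key(','.join(["word1", "word2"]))  # md5 hexdigest of "word1,word2"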