Exemple #1
0
def search_news(keyword=None, num=5):
    """
    Search Baidu news for ``keyword`` and return the parsed results.

    :param keyword: search words. They are joined with "+" to build the
        query, so this is presumably a list of words -- TODO confirm against
        callers (a plain string would be joined character by character).
    :param num: NOTE(review): accepted but not used anywhere in this
        function.
    :return: ``None`` when ``keyword`` is empty, otherwise a list with one
        dict per news item that could be parsed, each containing:
        : title: news title
        : url : news url
        : source: news source like "sina"
        : images : image urls in the content
        : content: news content
        : time:  news publish time
    """
    if not keyword:
        return
    logger.debug("baidu keywords=%s", keyword)

    # ``br`` is defined outside this function (presumably a shared
    # mechanize browser); it is reconfigured on every call.
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    words = '+'.join(keyword)
    search_words = urllib.quote(str2utf8(words)[0])
    url = "%sword=%s" % (BAIDU_BASE_URL["news"], search_words)
    logger.debug("url=%s", url)
    result = br.open(url).read()

    soup = BeautifulSoup(result)
    news_list = []
    for c in soup.findAll("li"):
        try:
            news = {}
            news["title"] = c.a.text.strip()
            news["url"] = c.a.get("href")
            news["images"] = [img.get("src") for img in c.findAll("img")]
            # NOTE(review): the first replace() looks like a no-op as
            # written; an HTML entity may have been lost -- confirm.
            decode_content = str2utf8(c.span.text)[0].decode("utf-8")\
                .replace(" ", " ").replace(u"\xa0", " ").strip()

            # The span text is split from the right into exactly three
            # parts: source, day and time of day.
            temp = decode_content.rsplit(' ', 2)
            logger.debug("temp=%s", decode_content)
            if len(temp) != 3:
                continue

            news["source"], day, clock = temp
            news["time"] = day + " " + clock
            news["content"] = c.find("div", {"class": "c-summary"}).text\
                .replace(u"- 百度快照", "").strip()
            news_list.append(news)
        except AttributeError as e:
            # Raised when an expected tag (a, span, summary div) is missing
            # from this <li>; the item is skipped.
            logger.warning(e)
            continue
        except Exception:
            # format_exc() takes an optional ``limit``, not the exception;
            # it already reports the exception currently being handled.
            logger.error(traceback.format_exc())
            continue

    return news_list
Exemple #2
0
def search_weibo(keyword):
    """
    Fetch the weibo search page for ``keyword`` and scrape one result.

    Returns ``None`` when ``keyword`` is empty, otherwise a dict with:
    : url : text of the link inside the "person_addr" paragraph
    : content: text of every "person_newwb" block, joined by blank lines
    : time: current year plus the timestamp text taken from the last link
            of the first "person_newwb" block, with ":00" appended
    """
    if not keyword:
        return

    quoted = urllib.quote(str2utf8(keyword)[0])
    page = br.open("%s%s" % (WEIBO_BASE_URL, quoted)).read()
    soup = BeautifulSoup(page)

    address = soup.find("p", {"class": "person_addr"})
    url_text = address.a.string

    blocks = soup.findAll("div", {"class": "person_newwb"})
    content = "\n\n".join([block.text for block in blocks])

    # The page shows the timestamp without a year, wrapped in parentheses
    # and using the 月/日 markers; rewrite it as "M-D hh:mm".
    stamp_link = soup.find("div", {"class": "person_newwb"}).p.findAll("a")[-1]
    stamp = stamp_link.text.strip("(").strip(")")
    stamp = stamp.replace(u"月", "-").replace(u"日", "").strip(" ")

    return {
        "url": url_text,
        "content": content,
        "time": "%s-%s:00" % (datetime.now().year, stamp),
    }
Exemple #3
0
    def add_words(cls, email, add_word_list, interval=300):
        """
        Subscribe ``email`` to the group of search words ``add_word_list``.

        The words are converted with ``str2utf8`` and the group is identified
        by ``cal_key`` of the comma-joined words; ``words_collection`` holds
        one document per key.

        :param email: address to add to the group's "email_list".
        :param add_word_list: iterable of search words forming one group.
        :param interval: stored as "interval" only when the group document
            is created; an existing document keeps its value.
        :return: always ``True``.
        """
        add_word_list = [str2utf8(word)[0] for word in add_word_list]
        words_key = cal_key(','.join(add_word_list))

        ret = words_collection.find_one({"key": words_key})
        if not ret:
            words_collection.insert({"word": add_word_list, "email_list": [email], "key": words_key, "interval": interval})
            return True

        # ``or []`` also covers documents whose email_list is missing or
        # was stored as null.
        email_list = ret.get("email_list") or []
        if email not in email_list:
            # list.append() returns None, so the new list is built
            # explicitly; storing the result of append() would write null.
            words_collection.update(
                {"key": words_key},
                {"$set": {"email_list": list(email_list) + [email]}})

        return True
    def add_words(cls, email, add_word_list):
        """
        Store each word of ``add_word_list`` that ``email`` does not already
        have as a new ``cls(email=..., keyword=...)`` row.

        Always returns ``True``.
        """
        with sessionCM() as session:
            wanted = [str2utf8(word)[0] for word in add_word_list]

            # Rows this email already has for any of the requested words.
            query = session.query(AccountSetting).filter_by(email=email)
            rows = query.filter(AccountSetting.keyword.in_(wanted)).all()
            known = set([str2utf8(row.keyword)[0] for row in rows])

            # Every missing word is added and committed on its own.
            for word in set(wanted) - known:
                session.add(cls(email=email, keyword=word))
                session.commit()

        return True
Exemple #5
0
def cal_key(content):
    """Return the hex MD5 digest of ``content`` after ``str2utf8`` conversion."""
    encoded = str2utf8(content)[0]
    digest = hashlib.md5(encoded)
    return digest.hexdigest()