Exemple #1
0
 def get_summary(self, max_num=100):
     """
     @attention: Build the summary from this object's content.

     Delegates to common.utils.get_summary_from_html_by_sub, handing it
     self.content and forwarding max_num unchanged.
     """
     from common.utils import get_summary_from_html_by_sub as summarize
     source = self.content
     return summarize(source, max_num=max_num)
Exemple #2
0
def auto_publish_article(key):
    """
    @note: Query Sogou's WeChat article search for ``key`` and save the
           articles that pass the filters below under ./txt/.

    Walks result pages 0-19. Every link matched by the selector
    '.wx-rb3 .txt-box a' is downloaded and passed through
    get_summary_from_html_by_sub. An article is kept only when
      * its text, with all whitespace removed, is longer than 300 and
        shorter than 3000 characters, and
      * is_in_baidu() is falsy for each of the three 10-character samples
        taken at offsets 50, 150 and 250 of that stripped text.
    Kept articles are written to ./txt/<title>.txt and their URLs to
    ./txt/hrefs.txt; the number of kept articles is printed at the end.

    @param key: search term, concatenated as-is into the query string.
    """
    from common.utils import get_summary_from_html_by_sub

    # Compiled once instead of once per article; raw string so that \s is a
    # regex class rather than an (invalid) string escape.
    re_blank = re.compile(r'\s+')
    # Start offsets of the 10-character samples handed to is_in_baidu().
    sample_offsets = (50, 150, 250)

    count = 0
    href_lines = []

    for index in range(20):
        url = 'http://weixin.sogou.com/weixin?query=' + key + '&type=2&ie=utf8&page=' + str(
            index) + '&p=40040100&dp=1&w=01019900&dr=1'
        listing_html = requests.get(url).text
        for article in pq(listing_html)('.wx-rb3 .txt-box a'):
            # Parenthesised so the statement prints the same under the
            # Python 2 print statement and the Python 3 print function.
            print(count)
            href = article.get("href")
            name = article.text_content()
            text = get_summary_from_html_by_sub(requests.get(href).text,
                                                max_num=990000,
                                                filter_nbsp=True)

            key_text = re_blank.sub('', text)
            if not 300 < len(key_text) < 3000:
                continue
            # any() stops at the first hit, so the lookups happen in the same
            # order and number as three separate checks would.
            if any(is_in_baidu(key_text[start:start + 10])
                   for start in sample_offsets):
                continue

            count += 1
            href_lines.append(u"%s\n" % href)
            # A "/" in the title would otherwise be read as a directory
            # separator and make open() fail.
            safe_name = name.replace(u"/", u"_")
            # with-block closes (and flushes) the handle deterministically.
            with open((u"./txt/%s.txt" % safe_name).encode("utf8"),
                      "w") as article_file:
                article_file.write(text.encode("utf8"))

        # Rewritten after every result page with all URLs collected so far.
        with open((u"./txt/hrefs.txt").encode("utf8"), "w") as hrefs_file:
            hrefs_file.write(u"".join(href_lines).encode("utf8"))

    print(u"total articles:%s" % count)
Exemple #3
0
def auto_publish_article(key):
    """
    @note: Collect WeChat articles for ``key`` from Sogou's search pages and
           store the ones that pass the filters below in ./txt/.

    Result pages 0-19 are requested in turn. Each link selected by
    '.wx-rb3 .txt-box a' is fetched and run through
    get_summary_from_html_by_sub. The article is skipped unless
      * the whitespace-free version of its text has more than 300 and fewer
        than 3000 characters, and
      * is_in_baidu() returns a falsy value for all three 10-character
        slices starting at offsets 50, 150 and 250 of that version.
    Accepted articles go to ./txt/<title>.txt, their URLs to
    ./txt/hrefs.txt, and the accepted total is printed when the crawl ends.

    @param key: search term, inserted unmodified into the request URL.
    """
    from common.utils import get_summary_from_html_by_sub

    # Hoisted out of the loops; the raw string keeps \s a regex class
    # instead of an unrecognised string escape.
    whitespace = re.compile(r'\s+')
    # Where the 10-character probes passed to is_in_baidu() begin.
    probe_starts = (50, 150, 250)

    count = 0
    collected = []

    for page in range(20):
        url = 'http://weixin.sogou.com/weixin?query=' + key + '&type=2&ie=utf8&page=' + str(page) + '&p=40040100&dp=1&w=01019900&dr=1'
        results = pq(requests.get(url).text)
        for link in results('.wx-rb3 .txt-box a'):
            # Single parenthesised argument: identical output for the
            # Python 2 print statement and the Python 3 print function.
            print(count)
            href = link.get("href")
            title = link.text_content()
            body = get_summary_from_html_by_sub(requests.get(href).text, max_num=990000, filter_nbsp=True)

            compact = whitespace.sub('', body)
            if not 300 < len(compact) < 3000:
                continue
            # Short-circuits on the first truthy lookup, probing in order.
            if any(is_in_baidu(compact[begin:begin + 10]) for begin in probe_starts):
                continue

            count += 1
            collected.append(u"%s\n" % href)
            # Titles may contain "/", which would be taken as a directory
            # separator and make open() raise.
            file_title = title.replace(u"/", u"_")
            # Context manager guarantees the handle is flushed and closed.
            with open((u"./txt/%s.txt" % file_title).encode("utf8"), "w") as out:
                out.write(body.encode("utf8"))

        # The URL list is rewritten in full once per result page.
        with open((u"./txt/hrefs.txt").encode("utf8"), "w") as out:
            out.write(u"".join(collected).encode("utf8"))

    print(u"total articles:%s" % count)
Exemple #4
0
 def get_summary(self):
     """
     @attention: Build the summary from this object's content.

     Passes self.content to common.utils.get_summary_from_html_by_sub and
     returns whatever that helper returns.
     """
     from common.utils import get_summary_from_html_by_sub
     raw_content = self.content
     summary = get_summary_from_html_by_sub(raw_content)
     return summary
Exemple #5
0
 def get_summary(self):
     """
     @note: Build the summary from this object's description field.

     Passes self.des to common.utils.get_summary_from_html_by_sub and
     returns the helper's result unchanged.
     """
     from common.utils import get_summary_from_html_by_sub as summarize
     description = self.des
     return summarize(description)
Exemple #6
0
 def get_summary(self):
     """
     @note: Produce a summary out of self.des.

     The work is done by common.utils.get_summary_from_html_by_sub; its
     return value is handed back as-is.
     """
     from common.utils import get_summary_from_html_by_sub
     des_value = self.des
     result = get_summary_from_html_by_sub(des_value)
     return result
Exemple #7
0
 def get_summary(self):
     """
     @attention: Build the summary from this object's summary field.

     Passes self.summary to common.utils.get_summary_from_html_by_sub and
     returns the helper's result unchanged.
     """
     from common.utils import get_summary_from_html_by_sub as summarize
     stored_summary = self.summary
     return summarize(stored_summary)