Example no. 1
    def _decorate_article(self, article):
        """在 parse_response 後執行,後處理其輸出"""

        # html post-process
        from lxml.html import tostring, fromstring
        from bs4 import BeautifulSoup
        from lib.util.net import normalize_url
        from lib.util.text import pack_string

        # article['content'] may be list of lxml doms
        if isinstance(article['content'], list):
            article['content'] = \
              fromstring('\n'.join([tostring(x, encoding=unicode) for x in article['content']]))

        # remove unwanted tags
        self.css_sel_drop_tree(article['content'], ['script'])

        # prettify html with BeautifulSoup
        html_bs4 = BeautifulSoup(tostring(article['content'],
                                          encoding=unicode)).body.next

        article['text'] = pack_string(html_bs4.text)
        article['html'] = pack_string(unicode(html_bs4))
        article["ctlr_classname"] = str(self.__class__)

        article['url'] = normalize_url(article['url'])
        article['url_read'] = normalize_url(article['url_read'])
        article['url_canonical'] = normalize_url(article['url_canonical'])

        self.move_out_of_meta(article, 'title')

        return article
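
A minimal usage sketch, assuming a hypothetical controller object that exposes both parse_response and the _decorate_article shown above; the helper name fetch_and_decorate is illustrative only and not part of the original code.

# Hypothetical sketch: wiring parse_response and _decorate_article together.
# 'ctlr' stands for any controller object exposing both methods (an assumption).

def fetch_and_decorate(ctlr, response):
    # parse_response is expected to return the raw article dict
    # (lxml dom(s) under 'content', plus the url fields and 'meta').
    article = ctlr.parse_response(response)
    # _decorate_article then strips unwanted tags, fills in 'text', 'html' and
    # 'ctlr_classname', normalizes the three url fields and moves 'title'
    # out of 'meta'.
    return ctlr._decorate_article(article)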
Example no. 2
def get_fresh_urls(urls, dbi=None):
  """篩選出 urls 中未曾成功抓取過者並回傳"""

  from lib.util.text import md5
  from lib.util.net import normalize_url

  if not urls: return []

  # Compare by MD5 of the normalized URL: `article__urls`.`hash` stores the raw
  # binary digest, hence UNHEX() when building the IN list and HEX() when
  # reading the matches back.
  url_md5 = [{'url': x, 'md5': md5(normalize_url(x))} for x in urls]
  hashes = "(" + (",".join(["UNHEX('%s')" % x['md5'] for x in url_md5 ])) + ")"
  sql = "SELECT HEX(`hash`) FROM `article__urls` WHERE `hash` IN %s" % hashes

  ret = set(DB.query(sql, dbi=dbi))

  output = []
  for x in url_md5:
    if (x['md5'].upper(),) not in ret:
      output.append(x['url'])

  return output
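
A brief usage sketch, assuming get_fresh_urls is importable from the module above and that the default dbi=None path reaches a live database; the candidate URLs are made up for illustration.

# Hypothetical usage: drop already-fetched URLs before queueing a crawl.
candidates = [
  'http://example.com/news/1',
  'http://example.com/news/2?utm_source=rss',
]

# Only URLs whose normalized-url MD5 is absent from `article__urls` come back.
for url in get_fresh_urls(candidates):
  print(url)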