Example #1
def feed_revisit(pool, dbi=None):
    """重下載必要的新聞,仿造 Base Ctlr :: dispatch_rss_2_0 meta
  並轉由 dispatch_response 處理

  @see db.list_revisits()
  @startpoint
  """
    import json
    import importlib

    from lib import db, DB, logger
    from lib.util.dt import to_timestamp

    # Use the caller's DB handle if one was given; otherwise open our own.
    if dbi is None:
        _dbi = DB()
    else:
        _dbi = dbi

    ctlr_cache = {}

    i_created_on = 0
    i_last_seen_on = 1
    i_pub_ts = 2
    i_feed_url = 3
    i_canonical_url = 4
    i_title = 5
    i_meta = 6
    i_ctlr = 7

    # logger.info("Found %d articles to revisit" % len(revisit_list))

    for x in db.list_recent_fetches(revisit_max_m(), dbi=_dbi):
        expired = need_revisit(x[i_created_on], x[i_last_seen_on])
        if not expired:
            continue

        if x[i_ctlr] not in ctlr_cache:
            (ns, cn) = x[i_ctlr].rsplit('.', 1)
            module = importlib.import_module(ns)
            ctlr_cache[x[i_ctlr]] = getattr(module, cn)()

        ctlr = ctlr_cache[x[i_ctlr]]
        meta = json.loads(x[i_meta])

        meta['feed_url'] = x[i_feed_url]
        meta['pub_date'] = to_timestamp(x[i_pub_ts])
        meta['title'] = x[i_title]

        logger.info('Revisiting %s, expired for %d min',
                    x[i_canonical_url],
                    expired,
                    extra={'classname': feed_revisit})
        pool.log_stats('with_revisit')
        pool.put("http://" + x[i_canonical_url],
                 ctlr.dispatch_response,
                 category="revisit",
                 meta=meta)

    # Close the connection only if we opened it ourselves.
    if dbi is None:
        _dbi.disconnect()
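
The helpers revisit_max_m() and need_revisit() are called above but live elsewhere in the module. A minimal sketch of what they might look like, assuming revisit windows are measured in minutes and the row timestamps are epoch seconds; the names, thresholds, and return shape here are illustrative assumptions, not the project's actual logic:

import time

# Sketch only -- the real helpers are defined elsewhere in the project.
def revisit_max_m():
    # Oldest fetch age (in minutes) still considered a revisit candidate.
    # 2880 (two days) is an illustrative value, not the project's setting.
    return 2880

def need_revisit(created_on, last_seen_on):
    # Roughly: newer articles get revisited more aggressively. Returns the
    # number of minutes overdue (truthy) or False when still fresh.
    now = time.time()
    age_m = (now - created_on) / 60
    idle_m = (now - last_seen_on) / 60
    gap_m = 30 if age_m < 720 else 240   # illustrative thresholds
    return int(idle_m - gap_m) if idle_m > gap_m else False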
Example #2
def feed_revisit(pool, dbi=None):
    """重下載必要的新聞,仿造 Base Ctlr :: dispatch_rss_2_0 meta
  並轉由 dispatch_response 處理

  @see db.list_revisits()
  @startpoint
  """
    import json
    import importlib

    from lib import db, DB, logger
    from lib.util.dt import to_timestamp

    if dbi is None:
        _dbi = DB()
    else:
        _dbi = dbi

    ctlr_cache = {}

    i_created_on = 0
    i_last_seen_on = 1
    i_pub_ts = 2
    i_feed_url = 3
    i_canonical_url = 4
    i_title = 5
    i_meta = 6
    i_ctlr = 7

    # logger.info("Found %d articles to revisit" % len(revisit_list))

    for x in db.list_recent_fetches(revisit_max_m(), dbi=_dbi):
        expired = need_revisit(x[i_created_on], x[i_last_seen_on])
        if not expired:
            continue

        if x[i_ctlr] not in ctlr_cache:
            (ns, cn) = x[i_ctlr].rsplit(".", 1)
            module = importlib.import_module(ns)
            ctlr_cache[x[i_ctlr]] = getattr(module, cn)()

        ctlr = ctlr_cache[x[i_ctlr]]
        meta = json.loads(x[i_meta])

        meta["feed_url"] = x[i_feed_url]
        meta["pub_date"] = to_timestamp(x[i_pub_ts])
        meta["title"] = x[i_title]

        logger.info("Revisiting %s, expired for %d min", x[i_canonical_url], expired, extra={"classname": feed_revisit})
        pool.log_stats("with_revisit")
        pool.put("http://" + x[i_canonical_url], ctlr.dispatch_response, category="revisit", meta=meta)

    if dbi is None:
        _dbi.disconnect()
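
For reference, a minimal sketch of how feed_revisit might be driven. Only the two pool methods the examples above actually call, put() and log_stats(), are stubbed; the DummyPool below is hypothetical and simply records what would be queued:

# Hypothetical driver -- the real pool is the project's fetcher/worker pool.
class DummyPool(object):
    def __init__(self):
        self.jobs = []
        self.stats = {}

    def log_stats(self, key):
        # counterpart of pool.log_stats('with_revisit') above
        self.stats[key] = self.stats.get(key, 0) + 1

    def put(self, url, callback, category=None, meta=None):
        # counterpart of pool.put(url, ctlr.dispatch_response, ...) above
        self.jobs.append((url, callback, category, meta))

pool = DummyPool()
feed_revisit(pool)   # dbi=None, so it opens and closes its own DB connection
print("%d articles queued for revisit" % len(pool.jobs))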
Example #3
    def dispatch_response(self, payload, pool, dbi):
        """
    處理 fetcher 傳回之資料,調用 parse_response 解析其內容並儲存。

    輸入 payload 格式為 {
      'src': 'RESPONSE_BODY',
      'meta': {
        'feed_url': '',
        'pub_date': 'str'
      }
    }
    輸出為 {
      'html': lxml tree
    }

    @endpoint
    """
        import lxml.html
        from lib import logger, util, db
        from lib.util.dt import to_timestamp
        from lib.util.text import to_unicode

        if not payload:
            pool.log_stats('error_fetch')
            return

        try:
            payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
        except KeyError:
            pass

        # DOM tree pre-processing
        try:
            # lxml handles HTML encoding detection itself
            html = lxml.html.fromstring(payload['src'])
            # convert to unicode before storing
            payload['src'] = to_unicode(payload['src'])
        except Exception:
            extra = {'classname': self.__class__}
            logger.warning("HTML parse error, url: %s",
                           payload['url_read'],
                           extra=extra)
            logger.info("Got: %s", payload['src'], extra=extra)
            pool.log_stats('error_parse')
            return

        # canonical url
        url_canonical = html.cssselect('link[rel=canonical]')
        payload['url_canonical'] = url_canonical[0].attrib['href'] \
          if len(url_canonical) > 0 else payload['url_read']

        # Remove the charset declaration: the content is guaranteed to be unicode,
        # and a stale charset <meta> could mislead the HTML parser downstream.
        tags = html.cssselect('meta[http-equiv=Content-Type]')
        if len(tags) > 0:
            payload['meta']['Content-Type'] = tags[0].attrib['content']
            for x in tags:
                x.drop_tree()

        payload['html'] = html

        self.move_out_of_meta(payload, 'feed_url')

        article = self.parse_response(payload)

        if article:
            # parsed successfully
            self._decorate_article(article)
            db.save_article(article, dbi=dbi)
            pool.log_stats('done_article')
        else:
            # TODO: still write to the article table
            db.save_response(payload, dbi=dbi)
            pool.log_stats('error_parse')
Example #4
  def dispatch_response(self, payload, pool, dbi):
    """
    處理 fetcher 傳回之資料,調用 parse_response 解析其內容並儲存。

    輸入 payload 格式為 {
      'src': 'RESPONSE_BODY',
      'meta': {
        'feed_url': '',
        'pub_date': 'str'
      }
    }
    輸出為 {
      'html': lxml tree
    }

    @endpoint
    """
    import lxml.html
    from lib import logger, util, db
    from lib.util.dt import to_timestamp
    from lib.util.text import to_unicode

    try: payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
    except KeyError: pass

    # DOM tree pre-processing
    try:
      html = lxml.html.fromstring(payload['src']) # lxml handles html encoding
      payload['src'] = to_unicode(payload['src']) # convert to unicode before storing
    except Exception:
      extra = {'classname': self.__class__}
      logger.warning("HTML parse error, url: %s", payload['url_read'], extra=extra)
      logger.info("Got: %s", payload['src'], extra=extra)
      pool.log_stats('error_parse')
      return

    # canonical url
    url_canonical = html.cssselect('link[rel=canonical]')
    payload['url_canonical'] = url_canonical[0].attrib['href'] \
      if len(url_canonical) > 0 else payload['url_read']

    # Remove the charset declaration: the content is guaranteed to be unicode,
    # and a stale charset <meta> could mislead the HTML parser downstream.
    tags = html.cssselect('meta[http-equiv=Content-Type]')
    if len(tags) > 0:
      payload['meta']['Content-Type'] = tags[0].attrib['content']
      for x in tags: x.drop_tree()

    payload['html'] = html

    self.move_out_of_meta(payload, 'feed_url')

    article = self.parse_response(payload)

    if article:
      # parsed successfully
      self._decorate_article(article)
      db.save_article(article, dbi=dbi)
      pool.log_stats('done_article')
    else:
      # TODO: still write to the article table
      db.save_response(payload, dbi=dbi)
      pool.log_stats('error_parse')
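
The meta that feed_revisit builds above ends up as payload['meta'] when the fetcher later calls dispatch_response. A sketch of the payload shape implied by the docstring plus the extra field the method reads (url_read); the values are invented for illustration:

# Illustrative payload only -- field names come from the code above,
# the values are made up.
payload = {
    'url_read': 'http://news.example.com/item/123',   # URL actually fetched
    'src': '<html>...</html>',                        # raw response body
    'meta': {
        'feed_url': 'http://news.example.com/rss',
        'pub_date': '2013-01-01 12:00:00',            # converted via to_timestamp()
        'title': 'Some headline',
    },
}
# dispatch_response() then adds 'pub_ts', 'url_canonical' and 'html', calls
# parse_response(), and saves either the parsed article or the raw response:
# ctlr.dispatch_response(payload, pool, dbi)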