Beispiel #1
0
def _save_hashtbl(tbl_name, data, dbi=None):
    """
  將 data 存入列表,接受 body, [body...] 或 [{'body':'body'}...] 的格式
  數量可能很大,因此硬寫不快取
  """
    from MySQLdb import escape_string
    from lib.util.text import md5

    if (type(data) is str):
        _data = {'md5': md5(data), 'body': data}
    elif (type(data) is dict):
        if 'md5' not in data:
            data['md5'] = md5(data['body'])
        _data = data
    elif (type(data[0]) is str):
        _data = [{'md5': md5(x), 'body': x} for x in data]
    else:
        _data = [{
            'md5': x['md5'] if 'md5' in x else md5(x['body']),
            'body': x['body']
        } for x in data]

    sql = "INSERT IGNORE INTO `%s` (`body`, `hash`) VALUES " % escape_string(
        tbl_name)
    sql += "(%(body)s, UNHEX(%(md5)s))"

    DB.execute(sql, _data, dbi=dbi)
Beispiel #2
0
def save_article(payload, dbi = None):
  """Store one article: its hashed bodies/URLs first, then the article row.

  Works on a deep copy of `payload`, so the caller's object is not changed.
  Keys read from `payload`: meta, pub_ts, html, text, src, url, url_read,
  url_canonical; the final INSERT additionally binds title, feed_url and
  ctlr_classname.

  Args:
    payload: Mapping describing the article (see keys above). `meta` is
      JSON-encoded and `pub_ts` is converted with `datetime.fromtimestamp`
      before being written.
    dbi: Passed through unchanged to `_save_hashtbl` and `DB.execute`.
  """
  from json import dumps
  from datetime import datetime
  from copy import deepcopy

  from lib.util.text import md5

  # Private copy: everything below adds or overwrites keys on `row` only.
  row = deepcopy(payload)
  row["meta"] = dumps(payload['meta'])
  row["pub_ts"] = datetime.fromtimestamp(payload["pub_ts"]).isoformat()

  # Content bodies and the hash table each one is stored in.
  content_tables = (
    ('html', 'article__htmls'),
    ('text', 'article__texts'),
    ('meta', 'article__meta'),
    ('src', 'article__srcs'),
  )
  # Hash everything before the first write, so a failure while hashing
  # leaves the database untouched.
  for field, _table in content_tables:
    row[field + '_md5'] = md5(row[field])
  for field, table in content_tables:
    _save_hashtbl(
      table, {'md5': row[field + '_md5'], 'body': row[field]}, dbi = dbi)

  # The three URL variants share one table and go in as a single batch.
  url_fields = ('url', 'url_read', 'url_canonical')
  for field in url_fields:
    row[field + '_md5'] = md5(row[field])
  url_rows = [
    {'md5': row[field + '_md5'], 'body': row[field]} for field in url_fields
  ]
  _save_hashtbl('article__urls', url_rows, dbi = dbi)

  # Article row itself; on a duplicate key only last_seen_on is refreshed.
  sql = (
    "INSERT INTO `articles` ("
    "`title`, `pub_ts`, `created_on`, "
    "`feed_id`, `ctlr_id`, "
    "`url_hash`, `url_read_hash`, `url_canonical_hash`, "
    "`meta_hash`, `html_hash`, `text_hash`, `src_hash`"
    ") VALUES("
    "%(title)s, %(pub_ts)s, CURRENT_TIMESTAMP, "
    "(SELECT `feed_id` FROM `feeds` WHERE `url` = %(feed_url)s), "
    "(SELECT `ctlr_id` FROM `ctlrs` WHERE `classname` = %(ctlr_classname)s), "
    "UNHEX(%(url_md5)s), UNHEX(%(url_read_md5)s), UNHEX(%(url_canonical_md5)s),"
    "UNHEX(%(meta_md5)s), UNHEX(%(html_md5)s), UNHEX(%(text_md5)s), UNHEX(%(src_md5)s)"
    ") ON DUPLICATE KEY UPDATE last_seen_on = CURRENT_TIMESTAMP"
  )

  DB.execute(sql, row, dbi = dbi)
Beispiel #3
0
def _save_hashtbl(tbl_name, data, dbi = None):
  """
  將 data 存入列表,接受 body, [body...] 或 [{'body':'body'}...] 的格式
  數量可能很大,因此硬寫不快取
  """
  from MySQLdb import escape_string
  from lib.util.text import md5

  if (type(data) is str):
    _data = {'md5': md5(data), 'body': data}
  elif (type(data) is dict):
    if 'md5' not in data:
      data['md5'] = md5(data['body'])
    _data = data
  elif (type(data[0]) is str):
    _data = [{'md5': md5(x), 'body': x} for x in data]
  else:
    _data = [{'md5': x['md5'] if 'md5' in x else md5(x['body']), 'body': x['body']} for x in data]

  sql = "INSERT IGNORE INTO `%s` (`body`, `hash`) VALUES " % escape_string(tbl_name)
  sql += "(%(body)s, UNHEX(%(md5)s))"

  DB.execute(sql, _data, dbi = dbi)
Beispiel #4
0
def save_response(payload, dbi = None):
  """Store a raw response in the `responses` table.

  Per the original author's note, responses are only written when parsing
  failed. NOTE(review): that note also says rows are "aggressively
  rewritten", but the statement is INSERT IGNORE, which presumably leaves an
  existing row untouched on a duplicate key -- confirm the intended behavior.

  Args:
    payload: Mapping providing `meta` (JSON-encoded before writing), `src`,
      `url` and `feed_url`. It is deep-copied, so the caller's object is
      left unchanged.
    dbi: Passed through unchanged to `DB.execute`.
  """
  from json import dumps
  from copy import deepcopy

  from lib.util.text import md5

  # deep copy so that we don't mess up the original payload
  _payload = deepcopy(payload)
  _payload["meta"] = dumps(payload['meta'])
  _payload["src_md5"] = md5(_payload['src'])

  # feed_id is resolved inside the statement from the feed's URL.
  sql = (
    "INSERT IGNORE INTO `responses` "
    "(`feed_id`, `url`, `src`, `src_hash`, `meta`) VALUES("
    "(SELECT `feed_id` FROM `feeds` WHERE `url` = %(feed_url)s), "
    "%(url)s, %(src)s, UNHEX(%(src_md5)s), %(meta)s"
    ")"
  )
  DB.execute(sql, _payload, dbi = dbi)
Beispiel #5
0
def save_response(payload, dbi=None):
    """Store a raw response in the `responses` table.

    Per the original author's note, responses are only written when parsing
    failed. NOTE(review): that note also says rows are "aggressively
    rewritten", but the statement is INSERT IGNORE, which presumably leaves
    an existing row untouched on a duplicate key -- confirm the intended
    behavior.

    Args:
        payload: Mapping providing `meta` (JSON-encoded before writing),
            `src`, `url` and `feed_url`. It is deep-copied, so the caller's
            object is left unchanged.
        dbi: Passed through unchanged to `DB.execute`.
    """
    from json import dumps
    from copy import deepcopy

    from lib.util.text import md5

    # deep copy so that we don't mess up the original payload
    _payload = deepcopy(payload)
    _payload["meta"] = dumps(payload['meta'])
    _payload["src_md5"] = md5(_payload['src'])

    # feed_id is resolved inside the statement from the feed's URL.
    sql = (
        "INSERT IGNORE INTO `responses` "
        "(`feed_id`, `url`, `src`, `src_hash`, `meta`) VALUES("
        "(SELECT `feed_id` FROM `feeds` WHERE `url` = %(feed_url)s), "
        "%(url)s, %(src)s, UNHEX(%(src_md5)s), %(meta)s"
        ")"
    )
    DB.execute(sql, _payload, dbi=dbi)
Beispiel #6
0
def get_fresh_urls(urls, dbi = None):
  """Return the entries of `urls` whose hash is not yet in `article__urls`.

  The original author describes this as filtering out URLs that have never
  been fetched successfully. Each URL is normalised, hashed with `md5`, and
  looked up by hash; input order and duplicates are preserved in the result.

  Args:
    urls: Collection of URL strings.
    dbi: Passed through unchanged to `DB.query`.

  Returns:
    A list of the URLs from `urls` with no matching hash row. Empty input
    yields an empty list (the same type as every other path).
  """
  # Nothing to look up -- and no reason to build an empty IN (...) clause.
  if not urls:
    return []

  from lib.util.text import md5
  from lib.util.net import normalize_url

  # md5 is assumed to return a hex digest string (it is fed to UNHEX and
  # upper-cased below) -- TODO confirm.
  hashed = [(url, md5(normalize_url(url))) for url in urls]
  hashes = "(" + ",".join("UNHEX('%s')" % digest for _, digest in hashed) + ")"
  sql = "SELECT HEX(`hash`) FROM `article__urls` WHERE `hash` IN %s" % hashes

  # Result rows are matched as 1-tuples holding the upper-case hex hash.
  known = set(DB.query(sql, dbi = dbi))

  return [url for url, digest in hashed if (digest.upper(),) not in known]
Beispiel #7
0
def save_article(payload, dbi=None):
    """Store one article: its hashed bodies/URLs first, then the article row.

    Works on a deep copy of `payload`, so the caller's object is not changed.
    Keys read from `payload`: meta, pub_ts, html, text, src, url, url_read,
    url_canonical; the final INSERT additionally binds title, feed_url and
    ctlr_classname.

    Args:
        payload: Mapping describing the article (see keys above). `meta` is
            JSON-encoded and `pub_ts` is converted with
            `datetime.fromtimestamp` before being written.
        dbi: Passed through unchanged to `_save_hashtbl` and `DB.execute`.
    """
    from json import dumps
    from datetime import datetime
    from copy import deepcopy

    from lib.util.text import md5

    # Private copy: everything below adds or overwrites keys on `row` only.
    row = deepcopy(payload)
    row["meta"] = dumps(payload['meta'])
    row["pub_ts"] = datetime.fromtimestamp(payload["pub_ts"]).isoformat()

    # Content bodies and the hash table each one is stored in.
    content_tables = (
        ('html', 'article__htmls'),
        ('text', 'article__texts'),
        ('meta', 'article__meta'),
        ('src', 'article__srcs'),
    )
    # Hash everything before the first write, so a failure while hashing
    # leaves the database untouched.
    for field, _table in content_tables:
        row[field + '_md5'] = md5(row[field])
    for field, table in content_tables:
        _save_hashtbl(
            table, {'md5': row[field + '_md5'], 'body': row[field]}, dbi=dbi)

    # The three URL variants share one table and go in as a single batch.
    url_fields = ('url', 'url_read', 'url_canonical')
    for field in url_fields:
        row[field + '_md5'] = md5(row[field])
    url_rows = [
        {'md5': row[field + '_md5'], 'body': row[field]}
        for field in url_fields
    ]
    _save_hashtbl('article__urls', url_rows, dbi=dbi)

    # Article row itself; on a duplicate key only last_seen_on is refreshed.
    sql = (
        "INSERT INTO `articles` ("
        "`title`, `pub_ts`, `created_on`, "
        "`feed_id`, `ctlr_id`, "
        "`url_hash`, `url_read_hash`, `url_canonical_hash`, "
        "`meta_hash`, `html_hash`, `text_hash`, `src_hash`"
        ") VALUES("
        "%(title)s, %(pub_ts)s, CURRENT_TIMESTAMP, "
        "(SELECT `feed_id` FROM `feeds` WHERE `url` = %(feed_url)s), "
        "(SELECT `ctlr_id` FROM `ctlrs` WHERE `classname` = %(ctlr_classname)s), "
        "UNHEX(%(url_md5)s), UNHEX(%(url_read_md5)s), UNHEX(%(url_canonical_md5)s),"
        "UNHEX(%(meta_md5)s), UNHEX(%(html_md5)s), UNHEX(%(text_md5)s), UNHEX(%(src_md5)s)"
        ") ON DUPLICATE KEY UPDATE last_seen_on = CURRENT_TIMESTAMP"
    )

    DB.execute(sql, row, dbi=dbi)