def _decorate_article(self, article):
    """Run after parse_response to post-process its output."""
    # html post-process
    from lxml.html import tostring, fromstring
    from bs4 import BeautifulSoup

    from lib.util.net import normalize_url
    from lib.util.text import pack_string

    # article['content'] may be a list of lxml doms; merge them into one tree
    if isinstance(article['content'], list):
        article['content'] = fromstring(
            '\n'.join([tostring(x, encoding=unicode) for x in article['content']]))

    # remove unwanted tags
    self.css_sel_drop_tree(article['content'], ['script'])

    # prettify html with BeautifulSoup
    html_bs4 = BeautifulSoup(
        tostring(article['content'], encoding=unicode)).body.next
    article['text'] = pack_string(html_bs4.text)
    article['html'] = pack_string(unicode(html_bs4))

    article['ctlr_classname'] = str(self.__class__)
    article['url'] = normalize_url(article['url'])
    article['url_read'] = normalize_url(article['url_read'])
    article['url_canonical'] = normalize_url(article['url_canonical'])

    self.move_out_of_meta(article, 'title')

    return article
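# Usage sketch (hypothetical, not an authoritative spec): `ctlr` is assumed to
# be an instance of this controller class, and the dict keys mirror the ones
# read above. Shows the expected shape of `article` going in and coming out.
#
#   from lxml.html import fromstring
#   article = {
#       'content': [fromstring(u'<div><p>body</p><script>x()</script></div>')],
#       'url': 'http://example.com/story?utm_source=rss',
#       'url_read': 'http://example.com/story',
#       'url_canonical': 'http://example.com/story',
#       'meta': {'title': u'Example headline'},
#   }
#   article = ctlr._decorate_article(article)
#   # article['text'] / article['html'] now hold the cleaned output, with
#   # <script> stripped, all three urls normalized, and 'title' moved out
#   # of article['meta'] to the top level.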
def get_fresh_urls(urls, dbi=None):
    """Return the subset of `urls` that has never been fetched successfully."""
    from lib.util.text import md5
    from lib.util.net import normalize_url

    if len(urls) == 0:
        # keep the return type consistent with the non-empty case (list)
        return []

    # hash the normalized urls and check them against the db in one query
    url_md5 = [{'url': x, 'md5': md5(normalize_url(x))} for x in urls]
    hashes = "(" + ",".join(["UNHEX('%s')" % x['md5'] for x in url_md5]) + ")"
    sql = "SELECT HEX(`hash`) FROM `article__urls` WHERE `hash` IN %s" % hashes
    ret = set(DB.query(sql, dbi=dbi))

    # DB.query yields one-tuples of hex digests; keep urls whose hash is absent
    output = []
    for x in url_md5:
        if (x['md5'].upper(),) not in ret:
            output.append(x['url'])
    return output
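# Usage sketch (hypothetical call site): `dbi` is whatever handle DB.query
# accepts, and queue_fetch is an assumed downstream scheduler, not part of
# this module. Illustrates filtering candidate urls before queuing fetches.
#
#   candidates = [
#       'http://example.com/a',
#       'http://example.com/a/',   # normalize_url may collapse this to a dup
#       'http://example.com/new-story',
#   ]
#   for url in get_fresh_urls(candidates, dbi=dbi):
#       queue_fetch(url)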