def article(content_file, info): soup = BeautifulSoup(content_file) # 选取内容部分 detail = str(soup.select("table.vwtb td")[0]).decode('utf8') # 转换 markdown detail = html2text.html2text(detail, 'http://' + utl.get_host(info['url'])) # 2013/08/06 大概是数据源不太规则,抓去过来的加重标题容易出现两行 ** 以后空行,再一行 ** 再加几个空格,修正 detail = re.compile(ur'''[\r\n\s]+\*+[\r\n]+\*+ +''').subn('', detail)[0] # 还存在 ** 后边有多个全角空格的情况,直接替换 detail = re.compile(ur'''\*\* +''').subn('**', detail)[0] # 空的 strong 标签会被解析为四个星号 detail = re.compile(ur'''\*\*\*\*''').subn('', detail)[0] article = {'detail': detail} ################### 后续还出现连续或者不连续 ** 的话很可能是 strong 包含空字符的问题,彻底改进 return article
def article(content_file, info):
    """Extract the article body from a fetched page and return it as Markdown.

    Each CSS selector in the candidate list is tried in turn; the first one
    that yields non-empty markup supplies the article body. If none matches,
    an empty string is handed to html2text.

    Args:
        content_file: page markup handed to BeautifulSoup.
        info: mapping whose 'url' entry is used to build the base URL that is
            passed to html2text.

    Returns:
        dict with a single key, 'detail', holding the Markdown text.
    """
    parsed = BeautifulSoup(content_file)

    # Content section: selectors to try, in order, until one produces markup.
    candidate_selectors = (
        "div.entry-content",
    )

    body_html = ''
    for css in candidate_selectors:
        matches = parsed.select(css)
        if not matches:
            continue
        body_html = str(matches[0]).decode('utf8')
        if body_html:
            break

    # Convert to Markdown.
    base_url = 'http://' + utl.get_host(info['url'])
    return {'detail': html2text.html2text(body_html, base_url)}
all_rows = rs.fetchall() if len(all_rows) < 1: utl.log("no source need to be spidered", 1) for row in all_rows: # 资源 id res_id = row['id'] # 模块名只用来定义逻辑,同一个模块名可能涉及多个 id , 比如 rss 模块 mod_name = row['name'] # 判断如果模块不存在,则报告一个错误以后继续其他操作 if not os.path.isfile(mod_name + '.py'): utl.log("!!!!!ERR: no module defined for " + mod_name, 1) continue mod = importlib.import_module(mod_name) utl.log("======== start " + mod_name + str(res_id) + " " + utl.get_host(row['url']), 1) #hicktodo 需要增加的字段,先用保留字段 row['url'] = row['url'] ### 更新下次抓取的发生时间 now_time = time.time() + 3600 * int(row['day_count']) new_time = time.strftime(gl.TIME, time.localtime(now_time)) # print row # sys.exit() # 获得数据 try: page_content = utl.down(row)