Beispiel #1
0
def article(content_file, info):
    soup = BeautifulSoup(content_file)
    # 选取内容部分
    detail = str(soup.select("table.vwtb td")[0]).decode('utf8')
    # 转换 markdown
    detail = html2text.html2text(detail, 'http://' + utl.get_host(info['url']))

    # 2013/08/06 大概是数据源不太规则,抓去过来的加重标题容易出现两行 ** 以后空行,再一行 ** 再加几个空格,修正
    detail =  re.compile(ur'''[\r\n\s]+\*+[\r\n]+\*+ +''').subn('', detail)[0]
    # 还存在 ** 后边有多个全角空格的情况,直接替换
    detail =  re.compile(ur'''\*\* +''').subn('**', detail)[0]
    # 空的 strong 标签会被解析为四个星号
    detail =  re.compile(ur'''\*\*\*\*''').subn('', detail)[0] 
    article = {'detail': detail}

    ################### 后续还出现连续或者不连续 ** 的话很可能是 strong 包含空字符的问题,彻底改进

    return article
Beispiel #2
0
def article(content_file, info):
    """Extract the article body from a fetched page and return it as Markdown.

    Args:
        content_file: Page markup; handed directly to ``BeautifulSoup``.
        info: Mapping whose ``'url'`` entry is run through ``utl.get_host``
            to build the ``http://<host>`` string given to html2text as its
            second argument (presumably the base URL -- confirm).

    Returns:
        dict: ``{'detail': <markdown text>}``. An earlier sketch of the
        result also listed ``author``, ``author_url``, ``summary`` and
        ``title`` keys; only ``detail`` is populated here. When no selector
        matches, an empty string is what gets converted.
    """
    soup = BeautifulSoup(content_file)

    # Candidate CSS selectors for the content node, tried in order until one
    # yields non-empty markup.
    selectors = [
        "div.entry-content",
    ]

    detail = ''
    for css in selectors:
        matches = soup.select(css)
        if not matches:
            continue
        detail = str(matches[0]).decode('utf8')
        if detail:
            break

    # Convert the HTML fragment to Markdown.
    base_url = 'http://' + utl.get_host(info['url'])
    return {'detail': html2text.html2text(detail, base_url)}
Beispiel #3
0
    all_rows = rs.fetchall()
    if len(all_rows) < 1:
        utl.log("no source need to be spidered", 1)

    for row in all_rows:
        # 资源 id
        res_id = row['id']
        # 模块名只用来定义逻辑,同一个模块名可能涉及多个 id , 比如 rss 模块
        mod_name = row['name']
        # 判断如果模块不存在,则报告一个错误以后继续其他操作
        if not os.path.isfile(mod_name + '.py'):
            utl.log("!!!!!ERR: no module defined for " + mod_name, 1)
            continue

        mod = importlib.import_module(mod_name)
        utl.log("======== start " + mod_name  + str(res_id) + " " + utl.get_host(row['url']), 1)
        #hicktodo 需要增加的字段,先用保留字段

        row['url'] = row['url']

        ### 更新下次抓取的发生时间
        now_time = time.time() + 3600 * int(row['day_count'])
        new_time = time.strftime(gl.TIME, time.localtime(now_time))


        # print row
        # sys.exit()

        # 获得数据
        try:
            page_content = utl.down(row)