# Esempio n. 1
# 0
#!/usr/bin/env python
# Esempio n. 2
# 0
def run_it(*args, **kwargs):
    """Crawl one seed URL: extract its links, persist them, and — when a
    matching rule template exists — extract structured content from the page.

    Keyword Args:
        uuid: seed UUID used to look up this seed's scraping rule templates.
        url:  the page URL to fetch and parse.
        uri:  seed URI (read but otherwise unused here).

    Raises:
        KeyError: if any of 'uuid', 'url', 'uri' is missing from kwargs.
    """
    uuid = kwargs['uuid']
    url = kwargs['url']
    uri = kwargs['uri']  # unused below; kept so a missing 'uri' key still raises

    # Load any rule templates configured for this seed.
    # NOTE(review): %-interpolated SQL is injection-prone; prefer the driver's
    # parameter binding (PEP 249 style) if applicationDb supports it.
    sql = '''
        SELECT  `uuid`,`charset`,`request_type`,`sub_uri`,`type`
        FROM `application`.`sys_seed_ruler_info`
        WHERE delete_flag = 0
        and seed_uuid = '%s'
    ''' % (uuid)
    res, datarule = applicationDb.read_sql(sql)

    print(datarule)

    # Pick the template whose sub_uri (column 3) is the LONGEST substring
    # match of the url — the most specific rule wins.
    lastrule = ()
    urllen = 0
    for rule_row in datarule:
        sub_uri = rule_row[3]
        if sub_uri in url and len(sub_uri) > urllen:
            lastrule = rule_row
            urllen = len(sub_uri)

    # Fetch the page source, honoring the matched rule's request type and
    # charset when one was found.
    htmlSource = HtmlSource()
    print("读取网页%s" % (url))
    if lastrule:
        html_text = htmlSource.get_html(url_p=url, type_p=lastrule[2], chartset_p=lastrule[1])
    else:
        html_text = htmlSource.get_html(url_p=url)

    rule = Rule()
    # Coarse URL extraction from the page via XPath.
    list_a = htmlSource.get_url_list_xpath(html_text)
    for a in list_a:
        print("原文:" + a)

    list_a = htmlSource.addr_clear(list_a)  # de-noise and de-duplicate
    for a in list_a:
        print("去噪点:" + a)
    list_a = htmlSource.addr_whole(list_a, url_root=rule.get_url_root(url))  # absolutize relative paths
    for a in list_a:
        print("补全路径:" + a)

    # TODO: check whether each url belongs to the current site;
    # insert with flag 0 if it does, discard it otherwise.

    # Persist the discovered links (trailing 0 = not-yet-crawled flag).
    for a in list_a:
        sql = '''
            INSERT INTO `result`.`sys_url_info`
            VALUES ('%s', '%s',0)
        ''' % (rule.get_md5_value(a), a)
        resultDb.write_sql(sql)
    print("网页链接提取完毕.")

    if lastrule:
        print("读取模板信息.")
        # Load the per-column extraction rules for the matched template.
        sql = '''
            SELECT `colum_name`,`ruler`,`type`,`app1`,`app2`,`arr`,`spl1`,`spl2`
            FROM `application`.`sys_seed_ruler_colum_info`
            where delete_flag = 0
            and ruler_uuid = '%s'
        ''' % (lastrule[0])
        res2, columrole = applicationDb.read_sql(sql)

        # With column rules present, run the page-type-specific extraction
        # and hand the result off for storage.
        if columrole:
            print(columrole)
            result = []
            if lastrule[4] == '0':  # '0' => detail page
                print("详细页面信息提取.")
                result = rule.html_content_analysis_detial(html_text=html_text, column=columrole, url=url)
            elif lastrule[4] == '1':  # '1' => listing page
                print("列表页面信息提取.")
                result = rule.html_content_analysis_list(html_text=html_text, column=columrole, url=url)

            # BUGFIX: the original referenced the undefined name `lastrole`
            # here (NameError at runtime); the intended variable is `lastrule`.
            rd = ResultData()
            rd.resultRefulence(rule_uuid=lastrule[0], result=result, type=lastrule[4])

    # Mark this url as processed (flag 2).
    sql = '''
        UPDATE `result`.`sys_url_info`
        SET `flag` = 2
        WHERE `url` = '%s'
    ''' % (url)
    resultDb.write_sql(sql)