Example #1
def coal_data_extract(response):
    """Extract the coal-mine fields from a detail page and return them as one
    record joined with the "\\001" control character.
    """
    sel = response.selector
    info = []
    name = sel.xpath(u"//strong[text()='煤矿名称:']/../text()").extract()  # coal mine name
    info = for_ominated_data(info, name[1:2])
    pro = sel.xpath(u"//strong[text()='企业性质:']/../text()").extract()  # enterprise ownership type
    info = for_ominated_data(info, pro[1:2])
    prov = sel.xpath(u"//strong[text()='省份:']/../text()").extract()  # province
    info = for_ominated_data(info, prov[1:2])
    cap = sel.xpath(u"//strong[text()='生产能力(万t/a):']/../text()").extract()  # production capacity (10,000 t/a)
    info = for_ominated_data(info, cap[1:2])
    gas_level = sel.xpath(u"//strong[text()='瓦斯等级:']/../text()").extract()  # gas (firedamp) grade
    info = for_ominated_data(info, gas_level[1:2])
    safe_day = sel.xpath(u"//strong[text()='安全生产天数:']/../text()").extract()  # days of safe production
    info = for_ominated_data(info, safe_day[1:2])
    return "\001".join(info)
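Both extractors lean on a helper, for_ominated_data, that is defined elsewhere in the project. Here is a minimal sketch of what it is assumed to do, based on how it is called above (append the sliced field to the record while keeping the column count stable when the XPath matched nothing); this is a hypothetical reconstruction, not the project's actual helper:

def for_ominated_data(info, field):
    """Append the first value of `field` to `info`, stripped of surrounding
    whitespace; append an empty string when `field` is an empty slice so the
    joined record keeps a fixed number of columns.
    (Hypothetical reconstruction -- the real helper lives elsewhere.)
    """
    if field:
        info.append(field[0].strip())
    else:
        info.append("")
    return info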
def non_coal_data_extract(response):
    """Extract the non-coal-enterprise fields from a detail page and return
    them as one record joined with the "\\001" control character; log the
    page URL if nothing was extracted.
    """
    sel = response.selector
    info = []
    name = sel.xpath(u"//strong[text()='企业名称:']/../text()").extract()  # enterprise name
    info = for_ominated_data(info, name[1:2])
    classi = sel.xpath(u"//strong[text()='所属行业:']//../text()").extract()  # industry sector
    info = for_ominated_data(info, classi[2:3])
    safe_le = sel.xpath(u"//strong[text()='安全生产标准化级别:']//../text()").extract()  # work-safety standardization level
    info = for_ominated_data(info, safe_le[2:3])
    type_l = sel.xpath(u"//strong[text()='类型:']//../text()").extract()  # type
    info = for_ominated_data(info, type_l[2:3])
    pub_date = sel.xpath(u"//strong[text()='公告时间:']//../text()").extract()  # announcement date
    info = for_ominated_data(info, pub_date[2:3])
    prov = sel.xpath(u"//strong[text()='所在省份:']/../text()").extract()  # province
    info = for_ominated_data(info, prov[1:2])
    info_re = "\001".join(info)
    if not info_re:
        # Nothing was extracted: record this page's URL for later inspection.
        # `log` is the legacy scrapy.log module, imported elsewhere in the project.
        log.msg("error in extract_non_coal %s" % response.url,
                level=log.ERROR)
    return info_re
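These functions are meant to run inside a Scrapy spider callback. A minimal usage sketch under that assumption follows; the spider name, start URL, and item field name are placeholders:

import scrapy

class SafetyRecordSpider(scrapy.Spider):
    name = "safety_records"                      # placeholder spider name
    start_urls = ["http://example.com/coal/1"]   # placeholder URL

    def parse(self, response):
        # Each extractor returns a single record with fields joined by the
        # "\001" control character, ready to be written out line by line.
        record = coal_data_extract(response)
        yield {"record": record}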