Beispiel #1
0
 def parse_detail(self, response):
     """Build a ``LawyerInfoItem`` from a Chongqing lawyer detail page.

     Text fields come from ``<span id="LabelN">`` elements; fields this
     callback does not scrape are stored as empty strings.

     :param response: detail-page response providing ``xpath`` and ``url``.
     :return: the populated item.
     """
     def label_text(label_id):
         # First text node of the span, or None when nothing matches.
         return response.xpath(
             '//span[@id="%s"]/text()' % label_id).extract_first()

     item = LawyerInfoItem()
     item["name"] = label_text("Label1")
     item["sex"] = ''
     item["nation"] = label_text("Label6")
     item["education"] = label_text("Label7")
     item["political_status"] = label_text("Label8")
     # Swap the host:port found in the page's image src before downloading.
     headurl = ''.join(
         response.xpath('//img[@id="Image1"]/@src').extract()).replace(
             '121.197.1.207:8001', '118.178.181.229:8000')
     if headurl:
         # The downloader's result is joined into one string; presumably the
         # stored image path(s) -- confirm against http_util.
         item["headurl"] = ''.join(
             http_util.downloadImage([headurl], 'lawyer_pics/chongqing'))
     else:
         # No portrait on the page: skip the download rather than hand the
         # downloader an empty URL.
         item["headurl"] = ''
     item["lawnumber"] = label_text("Label11")
     item["professional_status"] = ''
     item["personnel_type"] = ''
     item["start_time"] = ''
     item["get_time"] = label_text("Label12")
     item["cert_type"] = ''
     item["profession"] = ''
     item["ispartnership"] = ''
     item["firm"] = label_text("Label3")
     item["province"] = u"重庆"
     item["url"] = response.url
     item['collection'] = 'lawyers'
     return item
Beispiel #2
0
 def parse_detail(self, response):
     """Yield one ``LawyerInfoItem`` per entry of the JSON ``rows`` array.

     The response body is decoded as JSON. Fields that the payload is not
     read for are stored as empty strings.

     :param response: response whose body is a JSON document with ``rows``.
     :return: generator of populated items.
     """
     payload = json.loads(response.body_as_unicode())
     for row in payload['rows']:
         record = LawyerInfoItem()
         # NOTE(review): the name is read from the 'ID' key -- confirm that
         # this key really carries the lawyer's name.
         record['name'] = row['ID']
         gender = row['xingB']
         # Gender is stored as "0" for 男, "1" for 女 and "" otherwise.
         record['sex'] = (
             "0" if gender == u"男" else "1" if gender == u"女" else "")
         record['personnel_type'] = row['zhiYLB']
         record['firm'] = row['zhiYJG']
         record['lawnumber'] = row['zhiYZH']
         for blank in ('get_time', 'start_time'):
             record[blank] = ''
         record['province'] = u'贵州'
         for blank in ('education', 'cert_type', 'headurl', 'ispartnership',
                       'nation', 'political_status', 'professional_status',
                       'profession'):
             record[blank] = ''
         record['url'] = response.url
         record['collection'] = 'lawyers'
         yield record
Beispiel #3
0
    def pase_item_details(self, response):
        """Build a ``LawyerInfoItem`` from a Zhejiang lawyer detail page.

        Values are read from cells of a nested table; each value is the
        concatenation of all text nodes the query matches. Fields this
        callback does not scrape are stored as empty strings.

        :param response: detail-page response providing ``xpath`` and ``url``.
        :return: the populated item.
        """
        def joined_text(query):
            # Concatenate every matched text node ("" when nothing matches).
            return "".join(response.xpath(query).extract())

        detail_item = LawyerInfoItem()
        # Page URL
        detail_item["url"] = response.url
        # Province (unicode literal, like the other literals compared below)
        detail_item["province"] = u"浙江"
        # Name
        detail_item["name"] = joined_text(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[1]/td[2]/text()')
        # Gender: "0" for 男, "1" for 女, "" otherwise
        sex = joined_text(
            '//table[4]//tr/td[1]/table/tr/td[2]/table[3]/tr[4]/td[2]/text()')
        if sex == u"男":
            detail_item["sex"] = "0"
        elif sex == u"女":
            detail_item["sex"] = "1"
        else:
            detail_item["sex"] = ""

        # Ethnicity
        detail_item["nation"] = ""
        # Education
        detail_item["education"] = ""
        # Political status
        detail_item["political_status"] = ""
        # Portrait path
        detail_item["headurl"] = ""
        # Lawyer licence number
        detail_item["lawnumber"] = joined_text(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[7]/td[2]/text()')
        # Professional status: 0 - normal, 1 - revoked
        detail_item["professional_status"] = ""
        # Personnel type (e.g. full-time).
        # The original query was an absolute path with ``tbody`` steps that
        # none of the other queries against the same table use, so it is
        # likely to match nothing in the raw HTML. Try the tbody-free form
        # first and keep the original query as a fallback.
        personnel_type = joined_text(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[6]/td[2]/text()')
        if not personnel_type:
            personnel_type = joined_text(
                '/html/body/table[4]/tbody/tr/td[1]/table/tbody/tr/td[2]/table[3]/tbody/tr[6]/td[2]/text()'
            )
        detail_item["personnel_type"] = personnel_type
        # Date of first practice
        detail_item["start_time"] = ""
        # Date the qualification certificate was obtained
        detail_item["get_time"] = ""
        # Certificate type
        detail_item["cert_type"] = ""
        # Speciality
        detail_item["profession"] = ''
        # Partner flag: 0 - no, 1 - yes
        detail_item["ispartnership"] = ""
        # Law firm
        detail_item["firm"] = joined_text(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[2]/td[2]/a/text()')
        detail_item['collection'] = 'lawyers'
        return detail_item
Beispiel #4
0
 def parse_list(self, response):
     """Yield one ``LawyerInfoItem`` per entry of ``data.items`` in the JSON body.

     Null JSON values are handled explicitly: a null ``usersex`` or
     ``practiceyear`` yields an empty string, and a null or empty ``image``
     skips the portrait download.

     :param response: response whose body is a JSON document.
     :return: generator of populated items.
     """
     data = json.loads(response.body_as_unicode())
     for dc in data['data']['items']:
         item = LawyerInfoItem()
         item['name'] = dc['username']
         usersex = dc['usersex']
         # Always set the field so every yielded item has the same keys.
         item['sex'] = int(usersex) if usersex is not None else ''
         item['personnel_type'] = self.search_personnel_type(
             dc['lawyertype'])
         item['firm'] = dc['lawofficename']
         item['lawnumber'] = dc['workcardnum']
         item['get_time'] = ''
         practiceyear = dc['practiceyear']
         item['start_time'] = '' if practiceyear is None else practiceyear
         item['province'] = u'天津'
         item['education'] = self.search_edu(dc['cultuerlev'])
         item['cert_type'] = ''
         item['headurl'] = ''
         image = dc['image']
         # str(None) is 'None', which would otherwise be appended to the
         # photo URL and downloaded; treat a null image as "no portrait".
         imagesrc = '' if image is None else str(image)
         if imagesrc != '':
             headurl = "http://111.160.0.142:8091/lawyer/resources/photo/" + imagesrc
             item['headurl'] = ''.join(
                 http_util.downloadImage([headurl], 'lawyer_pics/tianjin'))
         item['ispartnership'] = ''
         item['nation'] = ''
         item['political_status'] = ''
         # 0 only when officeresult is the string "0"; anything else is 1.
         item['professional_status'] = 0 if dc['officeresult'] == "0" else 1
         item['profession'] = ''
         item['url'] = (
             'http://111.160.0.142:8091/lawyer/home/lawyer-detail.html?id={0}'
             .format(dc['lawyerid']))
         item['collection'] = 'lawyers'
         yield item
Beispiel #5
0
 def parse_detail(self, response):
     """Yield a ``LawyerInfoItem`` for each table row that carries a name.

     Rows are taken from the table containing the "序号" header cell,
     skipping its first row. Rows whose second cell holds no usable text
     are ignored. Only name and licence number are scraped; the remaining
     fields are stored as empty strings.

     :param response: listing-page response providing ``xpath`` and ``url``.
     :return: generator of populated items.
     """
     # Matches each single character that is neither whitespace nor a
     # literal '+'; joining the matches therefore strips those characters.
     # Raw string so that ``\s`` is not an invalid string escape.
     keep_chars = r'[^\s+]'
     rows = response.xpath(
         u'//td[text()="序号"]/ancestor::table[1]/tr[position()>1]')
     for x in rows:
         name = ''.join(x.xpath('td[2]/text()').re(keep_chars))
         if name == '':
             # Nothing to report for this row; do not build an item.
             continue
         item = LawyerInfoItem()
         item['name'] = name
         item['sex'] = ''
         item['personnel_type'] = ''
         item['firm'] = ''
         item['lawnumber'] = ''.join(
             x.xpath('td[3]/text()').re(keep_chars))
         item['get_time'] = ''
         item['start_time'] = ''
         item['province'] = u'新疆'
         item['education'] = ''
         item['cert_type'] = ''
         item['headurl'] = ''
         item['ispartnership'] = ''
         item['nation'] = ''
         item['political_status'] = ''
         item['professional_status'] = ''
         item['profession'] = ''
         item['url'] = response.url
         item['collection'] = 'lawyers'
         yield item
Beispiel #6
0
    def parse_lawyer_item(self, response):
        """Build a ``LawyerInfoItem`` from a Beijing lawyer detail page.

        Text fields are read from ``ess_ctr742_LawyerView_*`` spans. A
        missing portrait or missing date no longer raises or leaves the
        field unset: the corresponding item field is set to ``''``.

        :param response: detail-page response providing ``xpath`` and ``url``.
        :return: the populated item.
        """
        def view_text(suffix):
            # First text node of the span, or None when nothing matches.
            return response.xpath(
                '//span[@id="ess_ctr742_LawyerView_%s"]/text()' % suffix
            ).extract_first()

        item = LawyerInfoItem()
        item["name"] = view_text("lblName")
        # 0 only for 男; any other value, including a missing span, gives 1.
        item["sex"] = 0 if view_text("lblSex") == u"男" else 1
        item["nation"] = view_text("lblFolk")
        item["education"] = view_text("lblEdu")
        item["political_status"] = view_text("lblParty")
        image_src = response.xpath(
            '//img[@id="ess_ctr742_LawyerView_Image1"]/@src').extract_first()
        if image_src:
            item["headurl"] = ''.join(
                http_util.downloadImage(
                    ["http://app.bjsf.gov.cn" + image_src],
                    'lawyer_pics/beijing'))
        else:
            # extract_first() gives None when the image is absent, and
            # concatenating None to the host string would raise TypeError.
            item["headurl"] = ''
        item["lawnumber"] = view_text("lblCertificate_Code")
        # 0 only for 执业; any other value, including a missing span, gives 1.
        item["professional_status"] = (
            0 if view_text("lblStatus") == u"执业" else 1)
        item["personnel_type"] = view_text("lblPerson_Type")
        # Dates are stored with '-' in place of '/'; '' when the span is
        # missing so the field is always present on the item.
        start_time = view_text("lblFirst_Date")
        item["start_time"] = (
            start_time.replace("/", "-") if start_time is not None else '')
        get_time = view_text("lblCompetency_Date")
        item["get_time"] = (
            get_time.replace("/", "-") if get_time is not None else '')
        item["cert_type"] = view_text("lblCompetency_Type")
        item["profession"] = ''
        # 0 only for 否; any other value, including a missing span, gives 1.
        item["ispartnership"] = (
            0 if view_text("lblIsCopartner") == u"否" else 1)
        item["firm"] = view_text("lblLo_Name")
        item["province"] = u"北京"
        item['collection'] = 'lawyers'
        item["url"] = response.url

        return item
Beispiel #7
0
 def pase_item_details(self, response):
     """Build a ``LawyerInfoItem`` from a Shanxi lawyer detail page.

     Values are read from cells of the table under ``#Form1``; each value
     is the concatenation of all text nodes in the addressed cell.

     :param response: detail-page response providing ``xpath`` and ``url``.
     :return: the populated item.
     """
     def cell(row, col):
         # Joined text of table cell (row, col); "" when nothing matches.
         return "".join(response.xpath(
             '//*[@id="Form1"]/div[3]/div[2]/table/tr[%d]/td[%d]/text()'
             % (row, col)).extract())

     detail_item = LawyerInfoItem()
     detail_item["url"] = response.url
     # Province
     detail_item["province"] = u"山西"
     # Name
     detail_item["name"] = cell(1, 2)
     # Gender: "0" for 男, "1" for 女, "" otherwise
     sex = cell(1, 4)
     if sex == u"男":
         detail_item["sex"] = "0"
     elif sex == u"女":
         detail_item["sex"] = "1"
     else:
         detail_item["sex"] = ""
     # Ethnicity
     detail_item["nation"] = cell(2, 2)
     # Education
     detail_item["education"] = cell(2, 4)
     # Political status
     detail_item["political_status"] = cell(3, 4)
     # Portrait path
     headurl = response.xpath(
         '//*[@id="Form1"]/div[3]/div[2]/table/tr[1]/td[5]/img/@src'
     ).extract_first()
     if not headurl:
         detail_item["headurl"] = ""
     else:
         # The original used "host".join([src]), which treats the host as
         # a separator and so never prepends it to a single src. Prefix the
         # host explicitly unless the src is already absolute.
         if not headurl.startswith(("http://", "https://")):
             headurl = "http://sx.sxlawyer.cn" + headurl
         detail_item["headurl"] = ''.join(
             http_util.downloadImage([headurl], 'lawyer_pics/shanxi'))
     # Lawyer licence number
     detail_item["lawnumber"] = cell(6, 4)
     # Professional status: "0" when the cell reads 在职, "1" otherwise
     if cell(7, 2) == u"在职":
         detail_item["professional_status"] = "0"
     else:
         detail_item["professional_status"] = "1"
     # Personnel type (e.g. full-time)
     detail_item["personnel_type"] = cell(6, 2)
     # Date of first practice
     detail_item["start_time"] = cell(10, 2)
     # Date the qualification certificate was obtained
     detail_item["get_time"] = cell(8, 2)
     # Certificate type
     detail_item["cert_type"] = ""
     # Speciality
     detail_item["profession"] = ""
     # Partner flag: "1" for 是, "0" for 否, "" otherwise.
     # NOTE(review): this reads row 2 / column 1, the same row as the
     # ethnicity and education values -- confirm this is the intended cell.
     ispartnership = cell(2, 1)
     if ispartnership == u"是":
         detail_item["ispartnership"] = "1"
     elif ispartnership == u"否":
         detail_item["ispartnership"] = "0"
     else:
         detail_item["ispartnership"] = ""
     # Law firm
     detail_item["firm"] = cell(8, 4)
     detail_item['collection'] = 'lawyers'
     return detail_item
Beispiel #8
0
 def parse_detail(self, response):
     """Build a ``LawyerInfoItem`` from a Shanghai lawyer detail page.

     Most values are the first text node of a numbered ``<li>`` in the
     ``#detail01`` info list. Fields this callback does not scrape are
     stored as empty strings.

     :param response: detail-page response providing ``xpath`` and ``url``.
     :return: the populated item.
     """
     # Template for the n-th entry of the info list.
     info_li = ('//div[@id="detail01"]/div[@class="body"]'
                '/ul[@class="info-list clearfix"]/li[%d]/text()')

     def first(query):
         # First match of the query, or None when there is none.
         return response.xpath(query).extract_first()

     record = LawyerInfoItem()
     record["name"] = first(
         '//div[@class="list-item page"]/dl[@class="user-info"]'
         '/dd[@class="name"]/text()')
     # 0 only for 男; every other value maps to 1.
     if first(info_li % 2) == u"男":
         record["sex"] = 0
     else:
         record["sex"] = 1
     record["nation"] = first(info_li % 4)
     record["education"] = first(info_li % 5)
     record["political_status"] = first(info_li % 7)
     avatar_srcs = response.xpath('//dt[@class="avatar"]/img/@src').extract()
     downloaded = http_util.downloadImage(avatar_srcs, 'lawyer_pics/shanghai')
     record["headurl"] = ''.join(downloaded)
     record["lawnumber"] = first(info_li % 1)
     # 0 only for 正常; every other value maps to 1.
     if first('//ul[@class="user-credit"]/li/div/text()') == u"正常":
         record["professional_status"] = 0
     else:
         record["professional_status"] = 1
     record["personnel_type"] = first(info_li % 6)
     record["start_time"] = first(info_li % 9)
     record["get_time"] = first(info_li % 10)
     for blank in ("cert_type", "profession", "ispartnership"):
         record[blank] = ''
     record["firm"] = first(
         '//dl[@class="user-info"]/dd[@class="info"][2]/a/text()')
     record["province"] = u"上海"
     record["url"] = response.url
     record['collection'] = 'lawyers'
     return record
Beispiel #9
0
    def parseList(self, response):
        """Yield one ``LawyerInfoItem`` per ``<tr>`` of the listing table.

        A fresh item is created for every row. The original reused a single
        item instance, so every yielded reference pointed at the same object
        and later rows overwrote the values of earlier ones.

        :param response: listing-page response providing ``xpath`` and ``url``.
        :return: generator of populated items.
        """
        # Matches each single non-whitespace character; joining the matches
        # strips all whitespace. Raw string so ``\s`` is not an invalid
        # string escape.
        non_space = r'[^\s]'
        page_url = str(response.url)

        for tr in response.xpath('//table/tbody/tr'):
            item = LawyerInfoItem()
            # Page URL
            item['url'] = page_url
            # Name
            item['name'] = "".join(tr.xpath('td[7]/text()').re(non_space))
            # Lawyer licence number
            item['lawnumber'] = "".join(
                tr.xpath('td[8]/text()').re(non_space))
            # Province
            item['province'] = u"湖北"
            # Law firm
            item['firm'] = "".join(tr.xpath('td[1]/text()').re(non_space))
            # Gender
            item['sex'] = ''
            # Ethnicity
            item['nation'] = ''
            # Education
            item['education'] = ''
            # Political status
            item['political_status'] = ''
            # Portrait path
            item['headurl'] = ''
            # Professional status: 0 - normal, 1 - revoked
            item['professional_status'] = ''
            # Personnel type (e.g. full-time)
            item['personnel_type'] = ''
            # Date of first practice
            item['start_time'] = ''
            # Date the qualification certificate was obtained
            item['get_time'] = ''
            # Certificate type
            item['cert_type'] = ''
            # Speciality
            item['profession'] = ''
            # Partner flag: 0 - no, 1 - yes
            item['ispartnership'] = ''
            item['collection'] = 'lawyers'
            yield item
0
 def parse_detail(self, response):
     """Build a ``LawyerInfoItem`` from a Shaanxi lawyer detail page.

     Values are the first text node of fixed table cells. A page without
     a portrait yields an empty ``headurl`` instead of raising.

     :param response: detail-page response providing ``xpath`` and ``url``.
     :return: the populated item.
     """
     def first_text(query):
         # First match of the query, or None when there is none.
         return response.xpath(query).extract_first()

     item = LawyerInfoItem()
     item["name"] = first_text("//table/tr[1]/td[1]/text()")
     # Gender: 0 for 男, 1 for 女, "" otherwise (compared after stripping).
     sex = first_text("//table/tr[1]/td[2]/text()")
     if sex is not None:
         sex = sex.strip()
     if sex == u"男":
         sex = 0
     elif sex == u"女":
         sex = 1
     else:
         sex = ""
     item["sex"] = sex
     item["education"] = first_text("//table/tr[5]/td[2]/text()")
     item["political_status"] = first_text("//table/tr[2]/td[2]/text()")
     image_src = first_text('//table/tr[1]/td[3]/img/@src')
     if image_src:
         headurl = 'http://www.sxsf.gov.cn' + image_src
         item["headurl"] = ''.join(
             http_util.downloadImage([headurl], 'lawyer_pics/sshanxi'))
     else:
         # extract_first() gives None when the image is absent, and
         # concatenating None to the host string would raise TypeError.
         item["headurl"] = ''
     item["lawnumber"] = first_text("//table/tr[3]/td[1]/text()")
     item["get_time"] = first_text("//table/tr[4]/td[1]/text()")
     item["cert_type"] = first_text("//table/tr[3]/td[2]/text()")
     # Specialities: all text nodes of the cell joined, then split on '、'
     # into a list.
     item["profession"] = ''.join(
         response.xpath("//table/tr[9]/td[1]/text()").extract()).split(u'、')
     item["firm"] = first_text("//table/tr[10]/td[1]/a/text()")
     item['collection'] = 'lawyers'
     item['province'] = u'陕西'
     item['url'] = response.url
     return item
Beispiel #11
0
 def pase_item_details(self, response):
     """Build a ``LawyerInfoItem`` from a Guangdong lawyer detail page.

     Values are read from the ``value`` attribute of named ``<input>``
     elements. Nothing is produced (``None`` is returned) unless the
     response status is 200.

     :param response: detail-page response providing ``status``, ``xpath``
         and ``url``.
     :return: the populated item, or ``None`` for a non-200 response.
     """
     if response.status != 200:
         return None

     def joined(query):
         # Concatenate every match of the query ("" when nothing matches).
         return "".join(response.xpath(query).extract())

     record = LawyerInfoItem()
     record["url"] = response.url
     # Province
     record["province"] = u"广东"
     # Name
     record["name"] = joined('//input[@name="textfield22"]/@value')
     # Gender: "0" for 男, "1" for 女, "" otherwise
     gender = joined('//input[@name="textfield224"]/@value')
     record["sex"] = (
         "0" if gender == u"男" else "1" if gender == u"女" else "")
     # Ethnicity
     record["nation"] = joined('//input[@name="textfield222"]/@value')
     # Education
     record["education"] = joined('//input[@name="textfield225"]/@value')
     # Political status
     record["political_status"] = ""
     # Portrait path
     record["headurl"] = ""
     # Lawyer licence number
     record["lawnumber"] = joined('//input[@name="textfield223"]/@value')
     # Professional status: "0" when the field reads 正常, "1" otherwise
     status_text = joined('//input[@name="textfield529"]/@value')
     record["professional_status"] = "0" if status_text == u"正常" else "1"
     # Personnel type (e.g. full-time)
     record["personnel_type"] = ""
     # Date of first practice
     record["start_time"] = ''
     # Date the qualification certificate was obtained
     record["get_time"] = joined('//input[@name="textfield227"][1]/@value')
     # Certificate type, speciality and partner flag are not scraped here
     for blank in ("cert_type", "profession", "ispartnership"):
         record[blank] = ""
     # Law firm
     record["firm"] = joined('//input[@name="textfield229"]/@value')
     record['collection'] = 'lawyers'
     return record