def parse_detail(self, response):
    """Build one LawyerInfoItem from a Chongqing lawyer detail page.

    Values are read from ``<span id="LabelN">`` nodes; fields with no
    matching query on this page are stored as empty strings.

    :param response: response for the detail page.
    :return: the populated LawyerInfoItem.
    """

    def label_text(label_id):
        # First text node of the <span> carrying this id, or None.
        query = '//span[@id="%s"]/text()' % label_id
        return response.xpath(query).extract_first()

    item = LawyerInfoItem()
    item["name"] = label_text("Label1")
    item["sex"] = ''
    item["nation"] = label_text("Label6")
    item["education"] = label_text("Label7")
    item["political_status"] = label_text("Label8")

    # The portrait src has its host:port swapped before it is downloaded.
    raw_src = ''.join(response.xpath('//img[@id="Image1"]/@src').extract())
    portrait_url = raw_src.replace('121.197.1.207:8001',
                                   '118.178.181.229:8000')
    stored = http_util.downloadImage([portrait_url], 'lawyer_pics/chongqing')
    item["headurl"] = ''.join(stored)

    item["lawnumber"] = label_text("Label11")
    item["professional_status"] = ''
    item["personnel_type"] = ''
    item["start_time"] = ''
    item["get_time"] = label_text("Label12")
    item["cert_type"] = ''
    item["profession"] = ''
    item["ispartnership"] = ''
    item["firm"] = label_text("Label3")
    item["province"] = u"重庆"
    item["url"] = response.url
    item['collection'] = 'lawyers'
    return item
def parse_detail(self, response):
    """Yield one LawyerInfoItem per entry of the JSON body's ``rows`` list.

    :param response: response whose body is a JSON document with a
        ``rows`` list of dicts.
    :return: generator of LawyerInfoItem objects.
    """
    payload = json.loads(response.body_as_unicode())
    for row in payload['rows']:
        item = LawyerInfoItem()
        item['name'] = row['ID']

        # Gender code: "0" for 男, "1" for 女, "" for anything else.
        gender = row['xingB']
        item['sex'] = "0" if gender == u"男" else (
            "1" if gender == u"女" else "")

        item['personnel_type'] = row['zhiYLB']
        item['firm'] = row['zhiYJG']
        item['lawnumber'] = row['zhiYZH']

        for blank_field in ('get_time', 'start_time'):
            item[blank_field] = ''
        item['province'] = u'贵州'
        # Not present in this payload; stored as empty strings.
        for blank_field in ('education', 'cert_type', 'headurl',
                            'ispartnership', 'nation', 'political_status',
                            'professional_status', 'profession'):
            item[blank_field] = ''

        item['url'] = response.url
        item['collection'] = 'lawyers'
        yield item
def pase_item_details(self, response):
    """Build a LawyerInfoItem from a Zhejiang lawyer detail page.

    Fields that no query on this page covers are stored as empty strings.

    :param response: response for the detail page.
    :return: the populated LawyerInfoItem.
    """
    detail_item = LawyerInfoItem()
    # Page URL
    detail_item["url"] = response.url
    # Province
    detail_item["province"] = u"浙江"
    # Name
    detail_item["name"] = "".join(
        response.xpath(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[1]/td[2]/text()'
        ).extract())
    # Gender: "0" for 男, "1" for 女, "" otherwise
    sex = "".join(
        response.xpath(
            '//table[4]//tr/td[1]/table/tr/td[2]/table[3]/tr[4]/td[2]/text()'
        ).extract())
    if sex == u"男":
        detail_item["sex"] = "0"
    elif sex == u"女":
        detail_item["sex"] = "1"
    else:
        detail_item["sex"] = ""
    # Ethnicity
    detail_item["nation"] = ""
    # Education
    detail_item["education"] = ""
    # Political affiliation
    detail_item["political_status"] = ""
    # Portrait path
    detail_item["headurl"] = ""
    # Practising licence number
    detail_item["lawnumber"] = "".join(
        response.xpath(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[7]/td[2]/text()'
        ).extract())
    # Practice status: 0 = normal, 1 = cancelled
    detail_item["professional_status"] = ""
    # Personnel type (e.g. full-time).
    # Addressed like the sibling queries, without <tbody> steps: browsers
    # show <tbody> in their DOM view, but Scrapy queries the raw markup, and
    # the other queries here reach the same tables through table/tr directly.
    detail_item["personnel_type"] = "".join(
        response.xpath(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[6]/td[2]/text()'
        ).extract())
    # First practice date
    detail_item["start_time"] = ""
    # Qualification certificate date
    detail_item["get_time"] = ""
    # Certificate type
    detail_item["cert_type"] = ""
    # Specialty
    detail_item["profession"] = ''
    # Partner flag: 0 = no, 1 = yes
    detail_item["ispartnership"] = ""
    # Law firm
    detail_item["firm"] = "".join(
        response.xpath(
            '//table[4]/tr/td[1]/table/tr/td[2]/table[3]/tr[2]/td[2]/a/text()'
        ).extract())
    detail_item['collection'] = 'lawyers'
    return detail_item
def parse_list(self, response):
    """Yield one LawyerInfoItem per entry of the Tianjin JSON listing.

    The entries are read from ``data.items`` of the JSON body. A portrait
    is downloaded only when the entry actually names an image file.

    :param response: response whose body is the JSON listing.
    :return: generator of LawyerInfoItem objects.
    """
    data = json.loads(response.body_as_unicode())
    for dc in data['data']['items']:
        item = LawyerInfoItem()
        item['name'] = dc['username']

        # Always set the field so a null gender does not leave it missing.
        usersex = dc['usersex']
        item['sex'] = int(usersex) if usersex is not None else ''

        item['personnel_type'] = self.search_personnel_type(dc['lawyertype'])
        item['firm'] = dc['lawofficename']
        item['lawnumber'] = dc['workcardnum']
        item['get_time'] = ''

        practice_year = dc['practiceyear']
        item['start_time'] = '' if practice_year is None else practice_year

        item['province'] = u'天津'
        item['education'] = self.search_edu(dc['cultuerlev'])
        item['cert_type'] = ''

        # A JSON null must not be stringified into the file name "None".
        item['headurl'] = ''
        image_name = dc['image']
        if image_name is not None and image_name != '':
            headurl = (
                u"http://111.160.0.142:8091/lawyer/resources/photo/{0}"
            ).format(image_name)
            item['headurl'] = ''.join(
                http_util.downloadImage([headurl], 'lawyer_pics/tianjin'))

        item['ispartnership'] = ''
        item['nation'] = ''
        item['political_status'] = ''
        # 0 when officeresult is the string "0", otherwise 1.
        item['professional_status'] = 0 if dc['officeresult'] == "0" else 1
        item['profession'] = ''
        item['url'] = (
            'http://111.160.0.142:8091/lawyer/home/lawyer-detail.html?id={0}'
        ).format(dc['lawyerid'])
        item['collection'] = 'lawyers'
        yield item
def parse_detail(self, response):
    """Yield one LawyerInfoItem per named row of the Xinjiang listing table.

    The table is located through its header cell whose text is "序号";
    the first row is skipped, as are rows whose name cell is empty.
    Only name and licence number come from the page; the remaining
    fields are stored as empty strings.

    :param response: response for the listing page.
    :return: generator of LawyerInfoItem objects.
    """
    # Raw string so that \s reaches the regex engine instead of being an
    # invalid string escape. The class keeps every character that is
    # neither whitespace nor a literal '+'.
    keep_chars = r'[^\s+]'

    rows = response.xpath(
        u'//td[text()="序号"]/ancestor::table[1]/tr[position()>1]')
    for x in rows:
        name = ''.join(x.xpath('td[2]/text()').re(keep_chars))
        if not name:
            continue

        item = LawyerInfoItem()
        item['name'] = name
        item['sex'] = ''
        item['personnel_type'] = ''
        item['firm'] = ''
        item['lawnumber'] = ''.join(x.xpath('td[3]/text()').re(keep_chars))
        item['get_time'] = ''
        item['start_time'] = ''
        item['province'] = u'新疆'
        item['education'] = ''
        item['cert_type'] = ''
        item['headurl'] = ''
        item['ispartnership'] = ''
        item['nation'] = ''
        item['political_status'] = ''
        item['professional_status'] = ''
        item['profession'] = ''
        item['url'] = response.url
        item['collection'] = 'lawyers'
        yield item
def parse_lawyer_item(self, response):
    """Build a LawyerInfoItem from a Beijing lawyer detail page.

    Text values come from ``<span>`` nodes whose ids start with
    ``ess_ctr742_LawyerView_lbl``. A missing portrait or date is stored
    as an empty string.

    :param response: response for the detail page.
    :return: the populated LawyerInfoItem.
    """

    def span_text(suffix):
        # First text node of the span with the given id suffix, or None.
        query = '//span[@id="ess_ctr742_LawyerView_lbl%s"]/text()' % suffix
        return response.xpath(query).extract_first()

    item = LawyerInfoItem()
    item["name"] = span_text("Name")
    # NOTE: any value other than 男, including a missing node, maps to 1.
    item["sex"] = 0 if span_text("Sex") == u"男" else 1
    item["nation"] = span_text("Folk")
    item["education"] = span_text("Edu")
    item["political_status"] = span_text("Party")

    # extract_first() returns None when there is no portrait; concatenating
    # that with the host would raise TypeError and lose the whole item.
    img_src = response.xpath(
        '//img[@id="ess_ctr742_LawyerView_Image1"]/@src').extract_first()
    if img_src:
        item["headurl"] = ''.join(
            http_util.downloadImage(["http://app.bjsf.gov.cn" + img_src],
                                    'lawyer_pics/beijing'))
    else:
        item["headurl"] = ''

    item["lawnumber"] = span_text("Certificate_Code")
    # 0 when the status text is 执业, otherwise 1.
    item["professional_status"] = 0 if span_text("Status") == u"执业" else 1
    item["personnel_type"] = span_text("Person_Type")

    # Dates are rewritten from slash- to dash-separated form.
    start_time = span_text("First_Date")
    item["start_time"] = (
        start_time.replace("/", "-") if start_time is not None else '')
    get_time = span_text("Competency_Date")
    item["get_time"] = (
        get_time.replace("/", "-") if get_time is not None else '')

    item["cert_type"] = span_text("Competency_Type")
    item["profession"] = ''
    # 0 when the partner text is 否, otherwise 1.
    item["ispartnership"] = 0 if span_text("IsCopartner") == u"否" else 1
    item["firm"] = span_text("Lo_Name")
    item["province"] = u"北京"
    item['collection'] = 'lawyers'
    item["url"] = response.url
    return item
def pase_item_details(self, response):
    """Build a LawyerInfoItem from a Shanxi lawyer detail page.

    All values are read from cells of the table under
    ``//*[@id="Form1"]/div[3]/div[2]``.

    :param response: response for the detail page.
    :return: the populated LawyerInfoItem.
    """

    def cell(row, col):
        # Concatenated text nodes of the cell at the given row and column.
        query = ('//*[@id="Form1"]/div[3]/div[2]/table/tr[%d]/td[%d]/text()'
                 % (row, col))
        return "".join(response.xpath(query).extract())

    detail_item = LawyerInfoItem()
    detail_item["url"] = response.url
    # Province
    detail_item["province"] = u"山西"
    # Name
    detail_item["name"] = cell(1, 2)
    # Gender: "0" for 男, "1" for 女, "" otherwise
    sex = cell(1, 4)
    if sex == u"男":
        detail_item["sex"] = "0"
    elif sex == u"女":
        detail_item["sex"] = "1"
    else:
        detail_item["sex"] = ""
    # Ethnicity
    detail_item["nation"] = cell(2, 2)
    # Education
    detail_item["education"] = cell(2, 4)
    # Political affiliation
    detail_item["political_status"] = cell(3, 4)
    # Portrait path
    headurl = "".join(
        response.xpath(
            '//*[@id="Form1"]/div[3]/div[2]/table/tr[1]/td[5]/img/@src'
        ).extract())
    if headurl == "":
        detail_item["headurl"] = ""
    else:
        # The host is prepended to the src; str.join would only use it as
        # a separator and leave a single relative path without a host.
        if not headurl.startswith(("http://", "https://")):
            headurl = "http://sx.sxlawyer.cn" + headurl
        detail_item["headurl"] = ''.join(
            http_util.downloadImage([headurl], 'lawyer_pics/shanxi'))
    # Practising licence number
    detail_item["lawnumber"] = cell(6, 4)
    # Practice status: "0" when the cell reads 在职, otherwise "1"
    if cell(7, 2) == u"在职":
        detail_item["professional_status"] = "0"
    else:
        detail_item["professional_status"] = "1"
    # Personnel type (e.g. full-time)
    detail_item["personnel_type"] = cell(6, 2)
    # First practice date
    detail_item["start_time"] = cell(10, 2)
    # Qualification certificate date
    detail_item["get_time"] = cell(8, 2)
    # Certificate type
    detail_item["cert_type"] = ""
    # Specialty
    detail_item["profession"] = ""
    # Partner flag: "1" for 是, "0" for 否, "" otherwise
    # NOTE(review): tr[2]/td[1] sits directly before the ethnicity value
    # cell (tr[2]/td[2]); confirm this is really the partner cell.
    ispartnership = cell(2, 1)
    if ispartnership == u"是":
        detail_item["ispartnership"] = "1"
    elif ispartnership == u"否":
        detail_item["ispartnership"] = "0"
    else:
        detail_item["ispartnership"] = ""
    # Law firm
    detail_item["firm"] = cell(8, 4)
    detail_item['collection'] = 'lawyers'
    return detail_item
def parse_detail(self, response):
    """Build one LawyerInfoItem from a Shanghai lawyer detail page.

    Most values are read from the ``<li>`` entries of the info list
    inside ``div#detail01``.

    :param response: response for the detail page.
    :return: the populated LawyerInfoItem.
    """
    info_query = ('//div[@id="detail01"]/div[@class="body"]'
                  '/ul[@class="info-list clearfix"]/li[%d]/text()')

    def info(position):
        # First text node of the info-list entry at this position, or None.
        return response.xpath(info_query % position).extract_first()

    def first(query):
        return response.xpath(query).extract_first()

    item = LawyerInfoItem()
    item["name"] = first(
        '//div[@class="list-item page"]/dl[@class="user-info"]'
        '/dd[@class="name"]/text()')
    # 0 when the entry reads 男, otherwise 1.
    if info(2) == u"男":
        item["sex"] = 0
    else:
        item["sex"] = 1
    item["nation"] = info(4)
    item["education"] = info(5)
    item["political_status"] = info(7)

    portrait_srcs = response.xpath('//dt[@class="avatar"]/img/@src').extract()
    item["headurl"] = ''.join(
        http_util.downloadImage(portrait_srcs, 'lawyer_pics/shanghai'))

    item["lawnumber"] = info(1)
    # 0 when the credit entry reads 正常, otherwise 1.
    credit = first('//ul[@class="user-credit"]/li/div/text()')
    if credit == u"正常":
        item["professional_status"] = 0
    else:
        item["professional_status"] = 1
    item["personnel_type"] = info(6)
    item["start_time"] = info(9)
    item["get_time"] = info(10)
    item["cert_type"] = ''
    item["profession"] = ''
    item["ispartnership"] = ''
    item["firm"] = first(
        '//dl[@class="user-info"]/dd[@class="info"][2]/a/text()')
    item["province"] = u"上海"
    item["url"] = response.url
    item['collection'] = 'lawyers'
    return item
def parseList(self, response):
    """Yield one LawyerInfoItem per row of the Hubei listing table.

    Only firm, name and licence number are read from the row; the
    remaining fields are stored as empty strings.

    :param response: response for the listing page.
    :return: generator of LawyerInfoItem objects.
    """
    # Raw string so that \s reaches the regex engine; the class keeps
    # every non-whitespace character.
    non_space = r'[^\s]'
    page_url = str(response.url)

    for tr in response.xpath('//table/tbody/tr'):
        # A fresh item per row: reusing one object would make every
        # yielded item alias the same, later overwritten, instance.
        item = LawyerInfoItem()
        # Page URL
        item['url'] = page_url
        # Name
        item['name'] = "".join(tr.xpath('td[7]/text()').re(non_space))
        # Practising licence number
        item['lawnumber'] = "".join(tr.xpath('td[8]/text()').re(non_space))
        # Province
        item['province'] = u"湖北"
        # Law firm
        item['firm'] = "".join(tr.xpath('td[1]/text()').re(non_space))
        # Gender
        item['sex'] = ''
        # Ethnicity
        item['nation'] = ''
        # Education
        item['education'] = ''
        # Political affiliation
        item['political_status'] = ''
        # Portrait path
        item['headurl'] = ''
        # Practice status: 0 = normal, 1 = cancelled
        item['professional_status'] = ''
        # Personnel type (e.g. full-time)
        item['personnel_type'] = ''
        # First practice date
        item['start_time'] = ''
        # Qualification certificate date
        item['get_time'] = ''
        # Certificate type
        item['cert_type'] = ''
        # Specialty
        item['profession'] = ''
        # Partner flag: 0 = no, 1 = yes
        item['ispartnership'] = ''
        item['collection'] = 'lawyers'
        yield item
def parse_detail(self, response):
    """Build one LawyerInfoItem from a Shaanxi lawyer detail page.

    Values are read positionally from table cells. A page without a
    portrait yields an empty ``headurl``.

    :param response: response for the detail page.
    :return: the populated LawyerInfoItem.
    """

    def first(query):
        # First match of the query, or None.
        return response.xpath(query).extract_first()

    item = LawyerInfoItem()
    item["name"] = first("//table/tr[1]/td[1]/text()")

    # Gender: 0 for 男, 1 for 女, "" for anything else or a missing cell.
    sex = first("//table/tr[1]/td[2]/text()")
    if sex is not None:
        sex = sex.strip()
    if sex == u"男":
        sex = 0
    elif sex == u"女":
        sex = 1
    else:
        sex = ""
    item["sex"] = sex

    item["education"] = first("//table/tr[5]/td[2]/text()")
    item["political_status"] = first("//table/tr[2]/td[2]/text()")

    # extract_first() returns None when there is no portrait; concatenating
    # that with the host would raise TypeError and lose the whole item.
    img_src = first('//table/tr[1]/td[3]/img/@src')
    if img_src:
        headurl = 'http://www.sxsf.gov.cn' + img_src
        item["headurl"] = ''.join(
            http_util.downloadImage([headurl], 'lawyer_pics/sshanxi'))
    else:
        item["headurl"] = ''

    item["lawnumber"] = first("//table/tr[3]/td[1]/text()")
    item["get_time"] = first("//table/tr[4]/td[1]/text()")
    item["cert_type"] = first("//table/tr[3]/td[2]/text()")
    # Specialties are stored as a list, split on the '、' separator.
    item["profession"] = ''.join(
        response.xpath("//table/tr[9]/td[1]/text()").extract()).split(u'、')
    item["firm"] = first("//table/tr[10]/td[1]/a/text()")
    item['collection'] = 'lawyers'
    item['province'] = u'陕西'
    item['url'] = response.url
    return item
def pase_item_details(self, response):
    """Build a LawyerInfoItem from a Guangdong lawyer detail page.

    Values are read from the ``value`` attribute of named ``<input>``
    elements.

    :param response: response for the detail page.
    :return: the populated LawyerInfoItem, or None when the response
        status is not 200.
    """
    if response.status != 200:
        return None

    def input_value(query):
        # Concatenated results of the given @value query.
        return "".join(response.xpath(query).extract())

    record = LawyerInfoItem()
    record["url"] = response.url
    # Province
    record["province"] = u"广东"
    # Name
    record["name"] = input_value('//input[@name="textfield22"]/@value')
    # Gender: "0" for 男, "1" for 女, "" otherwise
    gender = input_value('//input[@name="textfield224"]/@value')
    record["sex"] = "0" if gender == u"男" else (
        "1" if gender == u"女" else "")
    # Ethnicity
    record["nation"] = input_value('//input[@name="textfield222"]/@value')
    # Education
    record["education"] = input_value('//input[@name="textfield225"]/@value')
    # Political affiliation
    record["political_status"] = ""
    # Portrait path
    record["headurl"] = ""
    # Practising licence number
    record["lawnumber"] = input_value('//input[@name="textfield223"]/@value')
    # Practice status: "0" when the field reads 正常, otherwise "1"
    status_text = input_value('//input[@name="textfield529"]/@value')
    record["professional_status"] = "0" if status_text == u"正常" else "1"
    # Personnel type (e.g. full-time)
    record["personnel_type"] = ""
    # First practice date
    record["start_time"] = ''
    # Qualification certificate date
    record["get_time"] = input_value(
        '//input[@name="textfield227"][1]/@value')
    # Certificate type
    record["cert_type"] = ""
    # Specialty
    record["profession"] = ""
    # Partner flag: 0 = no, 1 = yes
    record["ispartnership"] = ""
    # Law firm
    record["firm"] = input_value('//input[@name="textfield229"]/@value')
    record['collection'] = 'lawyers'
    return record