def parse_detail(self, data):
    """Build a lawyer record from one entry of the Shandong JSON listing.

    Args:
        data: dict describing one lawyer. ``cell_phone``, ``accountcode``,
            ``accountorg``, ``email`` and ``address`` are optional;
            ``user_name``, ``sex``, ``cityname`` and ``picImg`` are read
            unconditionally once the record is accepted.

    Returns:
        The populated dict, or None when the licence number is missing, is
        not 17 characters long, or ``find_lawyer_by_lawlumber`` returns
        something other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    # ``or ''`` also covers an explicit null so the regex never sees None.
    uiphone = data.get('cell_phone') or ''
    # A comma inside a character class is a literal comma, so the old
    # ``[3,4,5,6,7,8]`` also accepted "1,..."; use a real range instead.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    item['UILawNumber'] = data.get('accountcode')
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    item['UIName'] = data['user_name']
    item["ProvinceCode"] = self.provincode
    item['LawOrg'] = data.get('accountorg')
    item['UIEmail'] = data.get('email')
    item["UISignature"] = None
    item["Address"] = data.get('address')
    # A source value of 1 is stored as 0; every other value is stored as 1.
    item['UISex'] = 0 if data['sex'] == 1 else 1
    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code((data['cityname'])))

    # Portrait storage path
    dirname = "shandong"
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            [data['picImg']],
            '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    return item
def parse_detail(self, response):
    """Turn a Chongqing lawyer detail page into a ``LawyerInfoItem``.

    Text fields come from the ``LabelN`` spans of the page; fields the page
    does not provide are stored as empty strings. The portrait address has
    its host swapped before it is handed to ``http_util.downloadImage``.
    """
    def first(query):
        # First match of the XPath query, or None when nothing matches.
        return response.xpath(query).extract_first()

    portrait_src = ''.join(
        response.xpath('//img[@id="Image1"]/@src').extract())
    portrait_url = portrait_src.replace('121.197.1.207:8001',
                                        '118.178.181.229:8000')

    fields = [
        ("name", first('//span[@id="Label1"]/text()')),
        ("sex", ''),
        ("nation", first('//span[@id="Label6"]/text()')),
        ("education", first('//span[@id="Label7"]/text()')),
        ("political_status", first('//span[@id="Label8"]/text()')),
        ("headurl", ''.join(
            http_util.downloadImage([portrait_url],
                                    'lawyer_pics/chongqing'))),
        ("lawnumber", first('//span[@id="Label11"]/text()')),
        ("professional_status", ''),
        ("personnel_type", ''),
        ("start_time", ''),
        ("get_time", first('//span[@id="Label12"]/text()')),
        ("cert_type", ''),
        ("profession", ''),
        ("ispartnership", ''),
        ("firm", first('//span[@id="Label3"]/text()')),
        ("province", u"重庆"),
        ("url", response.url),
        ('collection', 'lawyers'),
    ]

    item = LawyerInfoItem()
    for key, value in fields:
        item[key] = value
    return item
def parse_list(self, response):
    """Yield one ``LawyerInfoItem`` per record of the Tianjin JSON listing.

    The response body is parsed as JSON and the records are read from
    ``data['data']['items']``. Fields the listing does not provide are
    stored as empty strings.

    Yields:
        LawyerInfoItem: one item per listed lawyer.
    """
    data = json.loads(response.body_as_unicode())
    for dc in data['data']['items']:
        item = LawyerInfoItem()
        item['name'] = dc['username']
        # 'sex' is simply left unset when the source has no value for it.
        if dc['usersex'] is not None:
            item['sex'] = int(dc['usersex'])
        item['personnel_type'] = self.search_personnel_type(dc['lawyertype'])
        item['firm'] = dc['lawofficename']
        item['lawnumber'] = dc['workcardnum']
        item['get_time'] = ''
        item['start_time'] = (
            '' if dc['practiceyear'] is None else dc['practiceyear'])
        item['province'] = u'天津'
        item['education'] = self.search_edu(dc['cultuerlev'])
        item['cert_type'] = ''

        item['headurl'] = ''
        # A null image must not be stringified to 'None'; that used to
        # trigger a download of ".../photo/None".
        raw_image = dc['image']
        imagesrc = '' if raw_image is None else str(raw_image)
        if imagesrc != '':
            headurl = "http://111.160.0.142:8091/lawyer/resources/photo/" + imagesrc
            item['headurl'] = ''.join(
                http_util.downloadImage([headurl], 'lawyer_pics/tianjin'))

        item['ispartnership'] = ''
        item['nation'] = ''
        item['political_status'] = ''
        # The string "0" maps to status 0; anything else maps to 1.
        item['professional_status'] = 0 if dc['officeresult'] == "0" else 1
        item['profession'] = ''
        item['url'] = 'http://111.160.0.142:8091/lawyer/home/lawyer-detail.html?id={0}'.format(
            (dc['lawyerid']))
        item['collection'] = 'lawyers'
        yield item
def parse_detail(self, response):
    """Build a lawyer record from a JSON detail response (sd.12348.gov.cn).

    Args:
        response: response whose body is a JSON object with the keys
            ``telnum``, ``licenseno``, ``name``, ``lawfirmname``,
            ``lawyerinfo``, ``zhuangchang``, ``lawfirmaddress`` and
            ``logourl``; ``response.meta['cityname']`` names the city.

    Returns:
        The populated dict, or None when the licence number is missing, is
        not 17 characters long, or ``find_lawyer_by_lawlumber`` returns
        something other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    data = json.loads(response.body_as_unicode())
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    # A JSON null would make re.findall raise; treat it as "no phone".
    uiphone = data['telnum'] or ''
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    item['UILawNumber'] = data['licenseno']
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    item['UIName'] = data['name']
    item["ProvinceCode"] = self.provincode
    item['LawOrg'] = data['lawfirmname']
    item['UIEmail'] = None
    item["UISignature"] = data['lawyerinfo']
    item['fiil_str'] = field_info_dic.find_field_by_name(data['zhuangchang'])
    item["Address"] = data['lawfirmaddress']
    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code(
            (response.meta['cityname'])))

    # Portrait storage path
    dirname = self.name
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            ["http://sd.12348.gov.cn" + data['logourl']],
            '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    item['url'] = response.url
    return item
def parse_detail(self, data, cityname):
    """Build a lawyer record from one entry of the Ningxia JSON listing.

    Args:
        data: dict describing one lawyer. ``sjhm`` (phone), ``zyzh``
            (licence number), ``deptName``, ``dzyx``, ``grjj``, ``lxdz``,
            ``xbMc`` and ``ywzcmc`` are optional; ``lsxm`` and ``lszp`` are
            read unconditionally once the record is accepted.
        cityname: city name passed to ``find_area_by_name_return_code``.

    Returns:
        The populated dict, or None when the licence number is missing, is
        not 17 characters long, or ``find_lawyer_by_lawlumber`` returns
        something other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    # ``or ''`` also covers an explicit null so the regex never sees None.
    uiphone = data.get('sjhm') or ''
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    item['UILawNumber'] = data.get('zyzh')
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"],)) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    item['UIName'] = data['lsxm']
    item["ProvinceCode"] = self.provincode
    item['LawOrg'] = data.get('deptName')
    item['UIEmail'] = data.get('dzyx')
    item["UISignature"] = data.get('grjj')

    if 'lxdz' in data:
        # Strip line breaks and spaces from the address.
        item["Address"] = data['lxdz'].replace("\r", '').replace(
            "\n", '').replace(' ', '')
    else:
        item["Address"] = None

    if 'xbMc' in data:
        item['UISex'] = 0 if data['xbMc'] == u'男' else 1
    else:
        item['UISex'] = None

    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code((cityname)))

    # 'fiil_str' is only set when the source lists any practice areas.
    fiil_str = data.get('ywzcmc')
    if fiil_str is not None:
        item['fiil_str'] = field_info_dic.find_field_by_name(
            fiil_str.split(u","))

    # Portrait storage path
    dirname = "ningxia"
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            ['http://nx.12348.gov.cn/flfw-xt/views/picture/' + data['lszp']],
            '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    return item
def parse_lawyer_item(self, response):
    """Turn a Beijing lawyer detail page into a ``LawyerInfoItem``.

    Values are read from the ``ess_ctr742_LawyerView_*`` spans of the page.
    ``start_time`` and ``get_time`` are only set when the page provides
    them, with "/" replaced by "-".

    Returns:
        LawyerInfoItem: the populated item.
    """
    def first(query):
        # First match of the XPath query, or None when nothing matches.
        return response.xpath(query).extract_first()

    item = LawyerInfoItem()
    item["name"] = first('//span[@id="ess_ctr742_LawyerView_lblName"]/text()')
    item["sex"] = 0 if first(
        '//span[@id="ess_ctr742_LawyerView_lblSex"]/text()') == u"男" else 1
    item["nation"] = first(
        '//span[@id="ess_ctr742_LawyerView_lblFolk"]/text()')
    item["education"] = first(
        '//span[@id="ess_ctr742_LawyerView_lblEdu"]/text()')
    item["political_status"] = first(
        '//span[@id="ess_ctr742_LawyerView_lblParty"]/text()')

    # Without an <img src> there is nothing to download; concatenating the
    # host with None used to raise TypeError and lose the whole record.
    portrait_src = first('//img[@id="ess_ctr742_LawyerView_Image1"]/@src')
    if portrait_src is None:
        item["headurl"] = ''
    else:
        item["headurl"] = ''.join(
            http_util.downloadImage(
                ["http://app.bjsf.gov.cn" + portrait_src],
                'lawyer_pics/beijing'))

    item["lawnumber"] = first(
        '//span[@id="ess_ctr742_LawyerView_lblCertificate_Code"]/text()')
    item["professional_status"] = 0 if first(
        '//span[@id="ess_ctr742_LawyerView_lblStatus"]/text()'
    ) == u"执业" else 1
    item["personnel_type"] = first(
        '//span[@id="ess_ctr742_LawyerView_lblPerson_Type"]/text()')

    start_time = first(
        '//span[@id="ess_ctr742_LawyerView_lblFirst_Date"]/text()')
    if start_time is not None:
        item["start_time"] = start_time.replace("/", "-")

    get_time = first(
        '//span[@id="ess_ctr742_LawyerView_lblCompetency_Date"]/text()')
    if get_time is not None:
        item["get_time"] = get_time.replace("/", "-")

    item["cert_type"] = first(
        '//span[@id="ess_ctr742_LawyerView_lblCompetency_Type"]/text()')
    item["profession"] = ''
    # Only the literal "no" text maps to 0; anything else maps to 1.
    item["ispartnership"] = 0 if first(
        '//span[@id="ess_ctr742_LawyerView_lblIsCopartner"]/text()'
    ) == u"否" else 1
    item["firm"] = first(
        '//span[@id="ess_ctr742_LawyerView_lblLo_Name"]/text()')
    item["province"] = u"北京"
    item['collection'] = 'lawyers'
    item["url"] = response.url
    return item
def pase_item_details(self, response):
    """Turn a Shanxi lawyer detail page into a ``LawyerInfoItem``.

    Every value is read from a cell of the details table located at
    ``//*[@id="Form1"]/div[3]/div[2]/table``; missing cells yield "".

    Returns:
        LawyerInfoItem: the populated item.
    """
    table = '//*[@id="Form1"]/div[3]/div[2]/table/'

    def cell(path):
        # Concatenated results of one query below the details table.
        return "".join(response.xpath(table + path).extract())

    detail_item = LawyerInfoItem()
    detail_item["url"] = response.url
    # Province
    detail_item["province"] = u"山西"
    # Name
    detail_item["name"] = cell('tr[1]/td[2]/text()')

    # Sex: "0" male, "1" female, "" when the cell holds anything else
    sex = cell('tr[1]/td[4]/text()')
    if sex == u"男":
        detail_item["sex"] = "0"
    elif sex == u"女":
        detail_item["sex"] = "1"
    else:
        detail_item["sex"] = ""

    # Ethnic group
    detail_item["nation"] = cell('tr[2]/td[2]/text()')
    # Education
    detail_item["education"] = cell('tr[2]/td[4]/text()')
    # Political affiliation
    detail_item["political_status"] = cell('tr[3]/td[4]/text()')

    # Portrait path
    headurl = cell('tr[1]/td[5]/img/@src')
    if headurl == "":
        detail_item["headurl"] = ""
    else:
        # The old code called "http://sx.sxlawyer.cn".join([...]); str.join
        # only inserts the separator BETWEEN elements, so a single src was
        # passed on without the host. Prefix it explicitly, unless the src
        # is already absolute.
        if not headurl.startswith(('http://', 'https://')):
            headurl = "http://sx.sxlawyer.cn" + headurl
        detail_item["headurl"] = ''.join(
            http_util.downloadImage([headurl], 'lawyer_pics/shanxi'))

    # Practising licence number
    detail_item["lawnumber"] = cell('tr[6]/td[4]/text()')

    # Practice status: 0 = normal, 1 = cancelled
    professional_status = cell('tr[7]/td[2]/text()')
    if professional_status == u"在职":
        detail_item["professional_status"] = "0"
    else:
        detail_item["professional_status"] = "1"

    # Personnel type (e.g. full-time)
    detail_item["personnel_type"] = cell('tr[6]/td[2]/text()')
    # Date of first practice
    detail_item["start_time"] = cell('tr[10]/td[2]/text()')
    # Date the qualification was obtained
    detail_item["get_time"] = cell('tr[8]/td[2]/text()')
    # Certificate type
    detail_item["cert_type"] = ""
    # Speciality
    detail_item["profession"] = ""

    # Partner flag: 0 = no, 1 = yes
    # NOTE(review): tr[2]/td[1] sits in the same row as the ethnic-group and
    # education cells -- confirm this really is the partner cell.
    ispartnership = cell('tr[2]/td[1]/text()')
    if ispartnership == u"是":
        detail_item["ispartnership"] = "1"
    elif ispartnership == u"否":
        detail_item["ispartnership"] = "0"
    else:
        detail_item["ispartnership"] = ""

    # Law firm
    detail_item["firm"] = cell('tr[8]/td[4]/text()')
    detail_item['collection'] = 'lawyers'
    return detail_item
def parse_detail(self, response):
    """Build a lawyer record from an "fws" lawyer detail page.

    Args:
        response: detail page response; ``response.meta['province']`` names
            the province.

    Returns:
        The populated dict, or None when the licence number is not 17
        characters long or ``find_lawyer_by_lawlumber`` returns something
        other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    def compact(selector):
        # Keep every character that is neither whitespace nor '+'.
        # (Raw string: same pattern as before, without the invalid-escape
        # warning that a plain '\s' literal produces.)
        return ''.join(selector.re(r'[^\s+]'))

    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    uiphone = compact(response.xpath(
        '/html/body/div[1]/div[1]/div/div/div[2]/div[2]/text()'))
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    item['UILawNumber'] = compact(response.xpath(
        "/html/body/div[1]/div[1]/div/div/div[2]/div[3]/text()"))
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    # Drop the "lawyer" title suffix from the displayed name.
    item['UIName'] = compact(response.xpath(
        "/html/body/div[1]/div[1]/div/div/div[2]/div[1]/div[1]/text()"
    )).replace(u'律师', '')
    item['LawOrg'] = compact(response.xpath(
        '/html/body/div[1]/div[1]/div/div/div[2]/div[4]/text()'))
    item['UIEmail'] = None
    item["UISignature"] = compact(
        response.css('#about::text')).replace("\t", '')
    item["Address"] = compact(response.xpath(
        '/html/body/div[1]/div[1]/div/div/div[2]/div[5]/text()'))
    item["ProvinceCode"] = ''.join(
        self.areaData.find_area_by_name_return_code(
            (response.meta['province'])))
    item["CityCode"] = None

    fiil_str = ''.join(
        response.xpath(
            '/html/body/div[1]/div[1]/div/div/div[3]/span/text()'
        ).extract()).replace('\r', '').replace('\t', '').replace('\n', '')
    item['fiil_str'] = field_info_dic.find_field_by_name(fiil_str.split(" "))

    # Portrait storage path
    dirname = 'fws'
    head_url = ''.join(response.css('.lshil3-1-1 img::attr(src)').extract())
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            [head_url],
            '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    # Fall back to the shared placeholder when nothing was stored.
    if not item["UIPic"]:
        item["UIPic"] = '/APPFile/head.jpg'
    item['url'] = response.url
    return item
def parse_detail(self, response):
    """Build a lawyer record from a Heilongjiang (hl.12348.gov.cn) page.

    Args:
        response: detail page response; ``response.meta['cityname']`` names
            the city.

    Returns:
        The populated dict, or None when the licence number is not 17
        characters long or ``find_lawyer_by_lawlumber`` returns something
        other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    def compact(query):
        # Text of the query with all whitespace removed. The old pattern
        # '[^s]' (missing backslash) removed the letter "s" instead of
        # whitespace, which then had to be stripped with replace() calls.
        return ''.join(response.xpath(query).re(r'[^\s]'))

    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    # Parenthesised form prints the same under Python 2 and Python 3.
    print(response.url)
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    uiphone = compact('/html/body/div[2]/div/div[3]/div/dl/dd/li[6]/text()')
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    item['UILawNumber'] = compact(
        '/html/body/div[2]/div/div[3]/div/dl/dd/li[3]/text()')
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    item['UIName'] = compact(
        '/html/body/div[2]/div/div[3]/div/dl/dd/h1/text()')
    item["ProvinceCode"] = self.provincode
    item['LawOrg'] = compact(
        '/html/body/div[2]/div/div[3]/div/dl/dd/li[4]/a/text()')
    item['UIEmail'] = None
    item["UISignature"] = compact('//*[@id="news_content_0"]/text()')
    item["Address"] = None
    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code(
            (response.meta['cityname'])))

    # Portrait storage path
    dirname = 'hlj'
    headurl = response.xpath(
        '/html/body/div[2]/div/div[3]/div/dl/dt/img/@src').extract_first()
    if headurl is None:
        # No <img src> on the page: host + None used to raise TypeError.
        # NOTE(review): '' is used as "no picture" -- confirm downstream.
        item["UIPic"] = ''
    else:
        item["UIPic"] = ''.join(
            http_util.downloadImage(
                ["http://hl.12348.gov.cn" + headurl],
                '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    item['url'] = response.url
    return item
def parse_lawyer_item(self, response):
    """Build a lawyer record from a lawyer profile page.

    Args:
        response: profile page response; ``response.meta['province']`` and
            ``response.meta['city']`` name the lawyer's area.

    Returns:
        The populated dict, or None unless all of these hold: the licence
        number is 17 characters long, ``find_lawyer_by_lawlumber`` returns
        None for it, and a mobile number was found on the page.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    item = {}
    zhiye = response.xpath(
        "//dl[@class='information_practice information_practice_new']")
    item["UILawNumber"] = ''.join(
        zhiye.xpath(u'dd/span[contains(text(),"律师证编号:")]/text()').extract()
    ).replace(u'执业律师 (律师证编号:', '').replace(u')', '').replace(' ', '')

    uiphone = ''.join(response.css('.right_consult_phone a::text').extract())
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
    item["UIPhone"] = uiphone if match_count else None

    # Continue only when the licence number is not stored yet.
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None
                and item["UIPhone"] is not None)
    if not accepted:
        return None

    # Drop spaces and the "lawyer" title suffix from the displayed name.
    item["UIName"] = ''.join(
        response.xpath('//h1[@class="lvshi_info_name"]/text()').extract()
    ).replace(' ', '').replace(u"律师", '')
    item["LawOrg"] = response.xpath(
        '//p[@class="lvshi_info_add"]/text()').extract_first()
    item["Address"] = ''.join(
        response.css('.information_practice_dd::text').extract()
    ).replace(' ', '')
    item["UIEmail"] = None

    desc = ''.join(
        response.xpath("//p[@class='information_info']/span/text()").extract()
    ).replace(u"\xa0", '')
    # Strip leftover markup and attributes, then line breaks and spaces.
    desc = re.sub(
        r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)',
        '', desc).replace("\r", '').replace("\n", '').replace(' ', '')
    if desc == '':
        item["UISignature"] = None
    else:
        item["UISignature"] = desc.replace(u"\xa0", '').replace(
            "\t", '').replace("\n", '').replace(' ', '').replace(
                u'&', '').replace('...', '')

    item["ProvinceCode"] = ''.join(
        self.areaData.find_area_by_name_return_code(
            (response.meta['province'])))
    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code(
            (response.meta['city'])))
    item["UIID"] = str(uuid.uuid1()).replace('-', '')
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            ["http:" + ''.join(
                response.css('.lvshi_info_pic a img::attr(src)').extract())],
            '/AppFile/' + item["UIID"] + '/head'))
    item["url"] = response.url
    return item
def parse_detail(self, response):
    """Turn a Shanghai lawyer detail page into a ``LawyerInfoItem``.

    Most values are list entries of the ``detail01`` info list; the name,
    firm and portrait come from the ``user-info`` header. Fields the page
    does not provide are stored as empty strings.
    """
    def first(query):
        # First match of the XPath query, or None when nothing matches.
        return response.xpath(query).extract_first()

    sex_text = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[2]/text()'
    )
    status_text = first('//ul[@class="user-credit"]/li/div/text()')
    portrait_sources = response.xpath(
        '//dt[@class="avatar"]/img/@src').extract()

    item = LawyerInfoItem()
    item["name"] = first(
        '//div[@class="list-item page"]/dl[@class="user-info"]/dd[@class="name"]/text()'
    )
    if sex_text == u"男":
        item["sex"] = 0
    else:
        item["sex"] = 1
    item["nation"] = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[4]/text()'
    )
    item["education"] = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[5]/text()'
    )
    item["political_status"] = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[7]/text()'
    )
    stored_portrait = http_util.downloadImage(portrait_sources,
                                              'lawyer_pics/shanghai')
    item["headurl"] = ''.join(stored_portrait)
    item["lawnumber"] = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[1]/text()'
    )
    if status_text == u"正常":
        item["professional_status"] = 0
    else:
        item["professional_status"] = 1
    item["personnel_type"] = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[6]/text()'
    )
    item["start_time"] = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[9]/text()'
    )
    item["get_time"] = first(
        '//div[@id="detail01"]/div[@class="body"]/ul[@class="info-list clearfix"]/li[10]/text()'
    )
    for blank_field in ("cert_type", "profession", "ispartnership"):
        item[blank_field] = ''
    item["firm"] = first(
        '//dl[@class="user-info"]/dd[@class="info"][2]/a/text()')
    item["province"] = u"上海"
    item["url"] = response.url
    item['collection'] = 'lawyers'
    return item
def parse_detail(self, response):
    """Build a lawyer record from a Sichuan lawyer detail page.

    Args:
        response: detail page response; ``response.meta['cityname']`` names
            the city.

    Returns:
        The populated dict, or None when the licence number is not 17
        characters long or ``find_lawyer_by_lawlumber`` returns something
        other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    def joined(query):
        # All matches of the query concatenated, tabs and newlines removed.
        return ''.join(response.xpath(query).extract()).replace(
            '\t', '').replace('\n', '')

    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    uiphone = joined('/html/body/div[3]/table/tbody/tr[5]/td[4]/text()')
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    # Strip the label, the closing bracket and non-breaking spaces.
    item['UILawNumber'] = ''.join(
        response.css('.font18::text').extract()
    ).replace(u'执业证号 (', '').replace(u')', '').replace(u"\xa0", '')
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    item['UIName'] = ''.join(response.css('.font28::text').extract())
    item["ProvinceCode"] = self.provincode
    item['LawOrg'] = ''.join(
        response.css('.lsjjxg3::text').extract()
    ).replace('\t', '').replace('\n', '')
    item['UIEmail'] = joined(
        '/html/body/div[3]/table/tbody/tr[6]/td[4]/text()')
    item["UISignature"] = None
    item["Address"] = None
    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code(
            (response.meta['cityname'])))

    # Portrait storage path
    dirname = 'sichuan'
    head_url = ''.join(
        response.xpath(
            '/html/body/div[3]/table/tbody/tr[1]/td[1]/img/@src').extract())
    # NOTE(review): the image host is "sd.12348.gov.cn" although the files
    # are stored under "sichuan" -- confirm this host is intended.
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            ["http://sd.12348.gov.cn/" + head_url],
            '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    item['url'] = response.url
    return item
def parse_detail(self, response):
    """Build a lawyer record from a Guangxi (gx.12348.gov.cn) detail page.

    Args:
        response: detail page response; ``response.meta['cityname']`` names
            the city.

    Returns:
        The populated dict, or None when the licence number is not 17
        characters long or ``find_lawyer_by_lawlumber`` returns something
        other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    def clean(value):
        # extract_first() yields None for a missing node; the old code then
        # crashed on ``None.replace``. Treat a missing node as empty text.
        return (value or '').replace('\t', '').replace('\r', '').replace(
            '\n', '')

    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    uiphone = clean(response.css('.zynx::text').extract_first())
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    item['UILawNumber'] = clean(response.css('.zyzh::text').extract_first())
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    item['UIName'] = clean(response.xpath(
        "//div[@class='row ryjs-top-name']/h3/text()").extract_first())
    item["ProvinceCode"] = self.provincode
    item['LawOrg'] = clean(response.css(".zyjg a::text").extract_first())
    item['UIEmail'] = None
    item["UISignature"] = None
    item['fiil_str'] = field_info_dic.find_field_by_name(
        ''.join(response.css("#ywzc::attr(value)").extract()).split(u","))
    item["Address"] = clean(response.xpath(
        "/html/body/div[1]/div[4]/div/div[2]/div[2]/div[1]/div[5]/span/text()"
    ).extract_first())
    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code(
            (response.meta['cityname'])))

    # Portrait storage path
    dirname = 'guangxi'
    # Remove every ".." from the src before appending it to the host.
    headurl = "http://gx.12348.gov.cn" + ''.join(
        response.xpath('//img[@id="img-billid"]/@src').extract()
    ).replace('..', '')
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            [headurl],
            '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    item['url'] = response.url
    return item
def parse_detail(self, response):
    """Build a lawyer record from a Liaoning lawyer detail page.

    Values are read from "label:value" cells of the table inside
    ``div.zi35``; the part after the first colon is kept.

    Returns:
        The populated dict, or None when the licence number is not 17
        characters long or ``find_lawyer_by_lawlumber`` returns something
        other than None for it.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    table = response.xpath('//div[@class="zi35"]/table')

    def value_of(path):
        # Whitespace-free cell text, split on ':'; element [1] is the
        # value. A cell without a colon used to raise IndexError and is
        # now reported as an empty value.
        parts = "".join(table.xpath(path).re(r'[^\s]')).split(u':')
        return parts[1] if len(parts) > 1 else ''

    # Target columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],
    # [UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
    item = {}
    item["UIID"] = str(uuid.uuid1()).replace('-', '')

    uiphone = value_of('tr[7]/td/text()')
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))

    item['UILawNumber'] = value_of('tr[11]/td[1]/text()')
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None)
    if not accepted:
        return None

    item["UIPhone"] = uiphone if match_count else None
    item['UIName'] = value_of('tr[1]/td[1]/text()')
    item["ProvinceCode"] = ''.join(
        self.areaData.find_area_by_name_return_code((u'辽宁')))
    item['LawOrg'] = value_of('tr[2]/td/text()')
    item['UIEmail'] = value_of('tr[14]/td/text()')
    item["UISignature"] = None
    item['FIID'] = None
    item["Address"] = None
    item["CityCode"] = None

    # Portrait storage path
    dirname = 'liaoning'
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            ["http://218.60.145.124:8080/lnlxoa/govhall" + "".join(
                table.xpath('tr[1]/td[2]/img/@src').re(r'[^\s]'))],
            '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
    item['url'] = response.url
    return item
def parse_detail(self, response):
    """Turn a Shaanxi (www.sxsf.gov.cn) detail page into a ``LawyerInfoItem``.

    Values are read from fixed cells of the page's table.

    Returns:
        LawyerInfoItem: the populated item.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes the male/female mapping runs after (not inside) the
    ``is not None`` check, so a missing cell is stored as "" -- confirm.
    """
    def first(query):
        # First match of the XPath query, or None when nothing matches.
        return response.xpath(query).extract_first()

    item = LawyerInfoItem()
    item["name"] = first("//table/tr[1]/td[1]/text()")

    # Sex: 0 male, 1 female, "" for anything else.
    sex = first("//table/tr[1]/td[2]/text()")
    if sex is not None:
        sex = sex.strip()
    if sex == u"男":
        sex = 0
    elif sex == u"女":
        sex = 1
    else:
        sex = ""
    item["sex"] = sex

    item["education"] = first("//table/tr[5]/td[2]/text()")
    item["political_status"] = first("//table/tr[2]/td[2]/text()")

    # Without an <img src> there is nothing to download; concatenating the
    # host with None used to raise TypeError and lose the whole record.
    portrait_src = first('//table/tr[1]/td[3]/img/@src')
    if portrait_src is None:
        item["headurl"] = ''
    else:
        headurl = 'http://www.sxsf.gov.cn' + portrait_src
        item["headurl"] = ''.join(
            http_util.downloadImage([headurl], 'lawyer_pics/sshanxi'))

    item["lawnumber"] = first("//table/tr[3]/td[1]/text()")
    item["get_time"] = first("//table/tr[4]/td[1]/text()")
    item["cert_type"] = first("//table/tr[3]/td[2]/text()")
    # Stored as a list: the cell text is split on the enumeration comma.
    item["profession"] = ''.join(
        response.xpath("//table/tr[9]/td[1]/text()").extract()).split(u'、')
    item["firm"] = first("//table/tr[10]/td[1]/a/text()")
    item['collection'] = 'lawyers'
    item['province'] = u'陕西'
    item['url'] = response.url
    return item
def parse_lawyer_item(self, response):
    """Build a lawyer record from a lawyer.fabang.com profile page.

    Returns:
        The populated dict, or None unless all of these hold: the licence
        number is 17 characters long, ``find_lawyer_by_lawlumber`` returns
        None for it, and a mobile number was found on the page.

    NOTE(review): the block structure was reconstructed from a flattened
    source; it assumes every statement after the guard, including the
    ``return``, belonged to the guarded branch -- confirm.
    """
    item = {}
    item["UILawNumber"] = ''.join(
        response.xpath(u'//p[contains(text(),"执业证号:")]/text()').extract()
    ).replace(' ', '').replace(u'执业证号:', '')

    uiphone = ''.join(
        response.xpath('//strong[@class="mobile"]/text()').extract()
    ).replace(' ', '')
    # A comma inside a character class is a literal comma; use a range.
    match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
    item["UIPhone"] = uiphone if match_count else None

    # Continue only when the licence number is not stored yet.
    # (The old guard compared against the undefined name ``no`` and raised
    # NameError whenever the first three conditions held.)
    accepted = (item["UILawNumber"] is not None
                and len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None
                and item["UIPhone"] is not None)
    if not accepted:
        return None

    # Drop spaces and the "lawyer" title suffix from the displayed name.
    item["UIName"] = ''.join(
        response.xpath('//strong[@class="lawyername"]/text()').extract()
    ).replace(' ', '').replace(u"律师", '')
    item["LawOrg"] = response.xpath(
        '//p[@class="jigou"][1]/a/text()').extract_first()
    item["Address"] = ''.join(
        response.xpath(
            u'//p[contains(text(),"地\xa0\xa0\xa0\xa0址:")]/text()').extract()
    ).replace(' ', '').replace(u'地\xa0\xa0\xa0\xa0址:', '')
    item["UIEmail"] = ''.join(
        response.xpath(
            u'//p[contains(text(),"邮\xa0\xa0\xa0\xa0箱:")]/text()').extract()
    ).replace(' ', '').replace(u'邮\xa0\xa0\xa0\xa0箱:', '')
    fiil_str = ''.join(
        response.xpath(u'//p[contains(text(),"专长领域:")]/text()').extract()
    ).replace(' ', '').replace(u'专长领域:', '')

    desc = ''.join(
        response.xpath("//div[@class='content'][last()]/*").extract()
    ).replace(u"\xa0", '')
    # Strip markup and attributes, then line breaks and spaces.
    desc = re.sub(
        r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)',
        '', desc).replace("\r", '').replace("\n", '').replace(' ', '')
    # Keep the text from the "share to" marker onwards when it is present.
    # str.index raises ValueError for a missing marker (it never returns
    # -1), so str.find is the call that matches the intended check.
    s_start_index = max(desc.find(u'分享到:'), 0)
    if desc == '':
        item["UISignature"] = None
    else:
        item["UISignature"] = desc[s_start_index:].replace(
            u'分享到:', '').replace(u"\xa0", '').replace("\t", '').replace(
                "\n", '').replace(' ', '').replace(u'&', '').replace(
                    '...', '')

    # "<province> <city>" text; both parts are looked up separately.
    # NOTE(review): a missing node or a single-word value still raises here.
    province_city = response.xpath(
        '//div[@class="info_nm SG_txtc "]/text()'
    ).extract_first().replace("\r", '').replace("\n", '').split(" ")
    item["ProvinceCode"] = ''.join(
        self.areaData.find_area_by_name_return_code((province_city[0])))
    item["CityCode"] = ''.join(
        self.areaData.find_area_by_name_return_code((province_city[1])))
    item['fiil_str'] = field_info_dic.find_field_by_name(
        fiil_str.split(u"\xa0"))
    item["UIID"] = str(uuid.uuid1()).replace('-', '')
    item["UIPic"] = ''.join(
        http_util.downloadImage(
            ["http://lawyer.fabang.com" + ''.join(
                response.css('.info_img_area img::attr(src)').extract())],
            '/AppFile/' + item["UIID"] + '/head'))
    item["url"] = response.url
    return item