Example #1
0
 def parse_detail(self, data):
     """Build a lawyer record from one entry of the Shandong data feed.

     Args:
         data: dict describing one lawyer.  ``user_name``, ``sex``,
             ``cityname`` and ``picImg`` are read unconditionally once the
             licence number has been accepted; every other key is optional.

     Returns:
         dict with the UIID, UIPhone, UIName, UIEmail, UIPic, UILawNumber,
         LawOrg, ProvinceCode, CityCode, Address, UISignature and UISex
         fields, or None when the licence number is missing, is not 17
         characters long, or ``find_lawyer_by_lawlumber`` returns a match
         for it.
     """
     law_number = data.get('accountcode')
     if law_number is None or len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number, )) is not None:
         return None

     # ``or ''`` also covers a key that is present but null.
     uiphone = data.get('cell_phone') or ''

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # Keep the phone only if it contains an 11-digit mobile number
     # (1, then 3-8, then nine digits).  The previous character class
     # "[3,4,5,6,7,8]" also accepted a literal comma.
     if re.search(r'1[3-8][0-9]{9}', uiphone):
         item["UIPhone"] = uiphone
     else:
         item["UIPhone"] = None
     item['UIName'] = data['user_name']
     item["ProvinceCode"] = self.provincode
     item['LawOrg'] = data.get('accountorg')
     item['UIEmail'] = data.get('email')
     item["UISignature"] = None
     item["Address"] = data.get('address')
     item['UISex'] = 0 if data['sex'] == 1 else 1
     item["CityCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(data['cityname']))
     # Avatar path
     dirname = "shandong"
     item["UIPic"] = ''.join(
         http_util.downloadImage(
             [data['picImg']],
             '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     return item
Example #2
0
 def parse_detail(self, response):
     """Turn a Chongqing lawyer detail page into a LawyerInfoItem.

     The text fields come from the ``<span id="LabelN">`` elements of the
     page; fields the page does not provide are stored as empty strings.
     The portrait is fetched through ``http_util.downloadImage`` after the
     host in its URL has been swapped.
     """

     def span_text(label_id):
         # First text node of <span id="..."> (None when it is absent).
         return response.xpath(
             '//span[@id="%s"]/text()' % label_id).extract_first()

     scraped = (
         ("name", "Label1"),
         ("nation", "Label6"),
         ("education", "Label7"),
         ("political_status", "Label8"),
         ("lawnumber", "Label11"),
         ("get_time", "Label12"),
         ("firm", "Label3"),
     )
     blank = ("sex", "professional_status", "personnel_type", "start_time",
              "cert_type", "profession", "ispartnership")

     item = LawyerInfoItem()
     for field, label_id in scraped:
         item[field] = span_text(label_id)
     for field in blank:
         item[field] = ''

     raw_src = ''.join(response.xpath('//img[@id="Image1"]/@src').extract())
     portrait_url = raw_src.replace('121.197.1.207:8001',
                                    '118.178.181.229:8000')
     stored = http_util.downloadImage([portrait_url], 'lawyer_pics/chongqing')
     item["headurl"] = ''.join(stored)

     item["province"] = u"重庆"
     item["url"] = response.url
     item['collection'] = 'lawyers'
     return item
Example #3
0
 def parse_list(self, response):
     """Yield one LawyerInfoItem per lawyer in the Tianjin JSON listing.

     Args:
         response: response whose body is a JSON document with the
             lawyers under ``data.items``.

     Yields:
         LawyerInfoItem for every entry.  ``sex`` is only set when the feed
         supplies a value; ``headurl`` stays '' when the entry has no image.
     """
     photo_base = "http://111.160.0.142:8091/lawyer/resources/photo/"
     data = json.loads(response.body_as_unicode())
     for dc in data['data']['items']:
         item = LawyerInfoItem()
         item['name'] = dc['username']
         if dc['usersex'] is not None:
             item['sex'] = int(dc['usersex'])
         item['personnel_type'] = self.search_personnel_type(
             dc['lawyertype'])
         item['firm'] = dc['lawofficename']
         item['lawnumber'] = dc['workcardnum']
         item['get_time'] = ''
         if dc['practiceyear'] is None:
             item['start_time'] = ''
         else:
             item['start_time'] = dc['practiceyear']
         item['province'] = u'天津'
         item['education'] = self.search_edu(dc['cultuerlev'])
         item['cert_type'] = ''
         item['headurl'] = ''
         # A null image used to be stringified to 'None' and downloaded as
         # ".../photo/None"; only a real file name triggers a download now.
         image_name = dc['image']
         if image_name is not None and image_name != '':
             headurl = photo_base + u'{0}'.format(image_name)
             item['headurl'] = ''.join(
                 http_util.downloadImage([headurl], 'lawyer_pics/tianjin'))
         item['ispartnership'] = ''
         item['nation'] = ''
         item['political_status'] = ''
         item['professional_status'] = 0 if dc['officeresult'] == "0" else 1
         item['profession'] = ''
         item[
             'url'] = 'http://111.160.0.142:8091/lawyer/home/lawyer-detail.html?id={0}'.format(
                 (dc['lawyerid']))
         item['collection'] = 'lawyers'
         yield item
Example #4
0
 def parse_detail(self, response):
     """Build a lawyer record from a JSON detail response.

     Args:
         response: response whose body is a JSON object with the keys
             ``licenseno``, ``telnum``, ``name``, ``lawfirmname``,
             ``lawyerinfo``, ``zhuangchang``, ``lawfirmaddress`` and
             ``logourl``; ``response.meta['cityname']`` supplies the city.

     Returns:
         dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
         fiil_str / url fields, or None when the licence number is null, is
         not 17 characters long, or ``find_lawyer_by_lawlumber`` returns a
         match for it.
     """
     data = json.loads(response.body_as_unicode())

     law_number = data['licenseno']
     if law_number is None or len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number, )) is not None:
         return None

     # A null phone used to crash re.findall before the record could even
     # be rejected; treat it as "no phone".
     uiphone = data['telnum']
     has_mobile = bool(uiphone) and re.search(r'1[3-8][0-9]{9}',
                                              uiphone) is not None

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # The mobile pattern no longer accepts a comma as second character.
     item["UIPhone"] = uiphone if has_mobile else None
     item['UIName'] = data['name']
     item["ProvinceCode"] = self.provincode
     item['LawOrg'] = data['lawfirmname']
     item['UIEmail'] = None
     item["UISignature"] = data['lawyerinfo']
     item['fiil_str'] = field_info_dic.find_field_by_name(
         data['zhuangchang'])
     item["Address"] = data['lawfirmaddress']
     item["CityCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(
             response.meta['cityname']))
     # Avatar path
     dirname = self.name
     item["UIPic"] = ''.join(
         http_util.downloadImage(
             ["http://sd.12348.gov.cn" + data['logourl']],
             '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     item['url'] = response.url
     return item
Example #5
0
 def parse_detail(self, data, cityname):
     """Build a lawyer record from one entry of the Ningxia data feed.

     Args:
         data: dict describing one lawyer.  ``lsxm`` (name) and ``lszp``
             (portrait file) are read unconditionally once the licence
             number has been accepted; every other key is optional.
         cityname: city name passed to
             ``areaData.find_area_by_name_return_code``.

     Returns:
         dict with the UI* / LawOrg / ProvinceCode / CityCode / Address
         fields (plus ``fiil_str`` when ``ywzcmc`` is present), or None
         when the licence number is missing, is not 17 characters long, or
         ``find_lawyer_by_lawlumber`` returns a match for it.
     """
     law_number = data.get('zyzh')
     if law_number is None or len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number,)) is not None:
         return None

     # ``or ''`` also covers a key that is present but null.
     uiphone = data.get('sjhm') or ''

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
     # "[3,4,5,6,7,8]" also accepted a literal comma.
     if re.search(r'1[3-8][0-9]{9}', uiphone):
         item["UIPhone"] = uiphone
     else:
         item["UIPhone"] = None
     item['UIName'] = data['lsxm']
     item["ProvinceCode"] = self.provincode
     item['LawOrg'] = data.get('deptName')
     item['UIEmail'] = data.get('dzyx')
     item["UISignature"] = data.get('grjj')

     address = data.get('lxdz')
     if address is not None:
         address = address.replace("\r", '').replace("\n", '').replace(' ', '')
     item["Address"] = address

     if 'xbMc' in data:
         item['UISex'] = 0 if data['xbMc'] == u'男' else 1
     else:
         item['UISex'] = None

     item["CityCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(cityname))

     fiil_str = data.get('ywzcmc')
     if fiil_str is not None:
         item['fiil_str'] = field_info_dic.find_field_by_name(
             fiil_str.split(u","))

     # Avatar path
     dirname = "ningxia"
     item["UIPic"] = ''.join(
         http_util.downloadImage(
             ['http://nx.12348.gov.cn/flfw-xt/views/picture/' + data['lszp']],
             '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     return item
Example #6
0
    def parse_lawyer_item(self, response):
        """Turn a Beijing lawyer detail page into a LawyerInfoItem.

        All text fields are read from ``<span>`` elements whose ids start
        with ``ess_ctr742_LawyerView_``.  ``start_time`` and ``get_time``
        are only set when the page provides them (with "/" replaced by
        "-").  ``headurl`` is '' when the page has no portrait; previously
        a missing ``<img>`` raised TypeError while building the URL.

        Returns:
            The populated LawyerInfoItem.
        """

        def span(suffix):
            # First text node of the span with the given id suffix, or None.
            return response.xpath(
                '//span[@id="ess_ctr742_LawyerView_%s"]/text()' % suffix
            ).extract_first()

        item = LawyerInfoItem()
        item["name"] = span('lblName')
        # Anything other than the "male" label (including a missing value)
        # is stored as 1.
        item["sex"] = 0 if span('lblSex') == u"男" else 1
        item["nation"] = span('lblFolk')
        item["education"] = span('lblEdu')
        item["political_status"] = span('lblParty')

        img_src = response.xpath(
            '//img[@id="ess_ctr742_LawyerView_Image1"]/@src').extract_first()
        if img_src is None:
            item["headurl"] = ''
        else:
            item["headurl"] = ''.join(
                http_util.downloadImage(["http://app.bjsf.gov.cn" + img_src],
                                        'lawyer_pics/beijing'))

        item["lawnumber"] = span('lblCertificate_Code')
        item["professional_status"] = 0 if span('lblStatus') == u"执业" else 1
        item["personnel_type"] = span('lblPerson_Type')

        start_time = span('lblFirst_Date')
        if start_time is not None:
            item["start_time"] = start_time.replace("/", "-")
        get_time = span('lblCompetency_Date')
        if get_time is not None:
            item["get_time"] = get_time.replace("/", "-")

        item["cert_type"] = span('lblCompetency_Type')
        item["profession"] = ''
        item["ispartnership"] = 0 if span('lblIsCopartner') == u"否" else 1
        item["firm"] = span('lblLo_Name')
        item["province"] = u"北京"
        item['collection'] = 'lawyers'
        item["url"] = response.url

        return item
Example #7
0
 def pase_item_details(self, response):
     """Turn a Shanxi lawyer detail page into a LawyerInfoItem.

     Every value is read from a cell of the table under
     ``//*[@id="Form1"]/div[3]/div[2]/table``; a missing cell yields ''.
     Coded fields are stored as the strings "0"/"1" (or '' when the page
     text is not one of the expected labels).

     Returns:
         The populated LawyerInfoItem.
     """

     def cell(path):
         # Joined text/attribute values of one table node ('' when absent).
         return "".join(
             response.xpath(
                 '//*[@id="Form1"]/div[3]/div[2]/table/' + path).extract())

     detail_item = LawyerInfoItem()
     detail_item["url"] = response.url
     # Province
     detail_item["province"] = u"山西"
     # Name
     detail_item["name"] = cell('tr[1]/td[2]/text()')
     # Gender: "0" male, "1" female, '' unknown
     detail_item["sex"] = {u"男": "0", u"女": "1"}.get(
         cell('tr[1]/td[4]/text()'), "")
     # Ethnicity
     detail_item["nation"] = cell('tr[2]/td[2]/text()')
     # Education
     detail_item["education"] = cell('tr[2]/td[4]/text()')
     # Political status
     detail_item["political_status"] = cell('tr[3]/td[4]/text()')
     # Avatar path
     headurl = cell('tr[1]/td[5]/img/@src')
     if headurl == "":
         detail_item["headurl"] = ""
     else:
         # The host used to be passed as the *separator* of str.join, so it
         # was never prepended to the image path.  Prefix it explicitly,
         # leaving an already absolute URL untouched.
         if not headurl.startswith(('http://', 'https://')):
             headurl = "http://sx.sxlawyer.cn" + headurl
         detail_item["headurl"] = ''.join(
             http_util.downloadImage([headurl], 'lawyer_pics/shanxi'))
     # Practising licence number
     detail_item["lawnumber"] = cell('tr[6]/td[4]/text()')
     # Practice status: "0" active, "1" anything else
     if cell('tr[7]/td[2]/text()') == u"在职":
         detail_item["professional_status"] = "0"
     else:
         detail_item["professional_status"] = "1"
     # Personnel type (e.g. full-time)
     detail_item["personnel_type"] = cell('tr[6]/td[2]/text()')
     # Date of first practice
     detail_item["start_time"] = cell('tr[10]/td[2]/text()')
     # Date the qualification certificate was obtained
     detail_item["get_time"] = cell('tr[8]/td[2]/text()')
     # Certificate type
     detail_item["cert_type"] = ""
     # Specialty
     detail_item["profession"] = ""
     # Partner flag: "1" yes, "0" no, '' unknown
     # NOTE(review): this reads tr[2]/td[1], the cell just before the one
     # used for the ethnicity value - confirm it is the intended cell.
     detail_item["ispartnership"] = {u"是": "1", u"否": "0"}.get(
         cell('tr[2]/td[1]/text()'), "")
     # Law firm
     detail_item["firm"] = cell('tr[8]/td[4]/text()')
     detail_item['collection'] = 'lawyers'
     return detail_item
Example #8
0
 def parse_detail(self, response):
     """Build a lawyer record from an HTML detail page ('fws' source).

     Args:
         response: detail page; ``response.meta['province']`` supplies the
             province name used for the ProvinceCode lookup.

     Returns:
         dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
         fiil_str / url fields, or None when the scraped licence number is
         not 17 characters long or ``find_lawyer_by_lawlumber`` returns a
         match for it.
     """

     def squeeze(selector):
         # Keeps every character except whitespace and '+', exactly as the
         # original pattern did; written as a raw string so "\s" is not an
         # invalid string escape.
         return ''.join(selector.re(r'[^\s+]'))

     card = '/html/body/div[1]/div[1]/div/div/div[2]'

     law_number = squeeze(response.xpath(card + '/div[3]/text()'))
     if len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number, )) is not None:
         return None

     uiphone = squeeze(response.xpath(card + '/div[2]/text()'))

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
     # "[3,4,5,6,7,8]" also accepted a literal comma.
     if re.search(r'1[3-8][0-9]{9}', uiphone):
         item["UIPhone"] = uiphone
     else:
         item["UIPhone"] = None
     item['UIName'] = squeeze(
         response.xpath(card + '/div[1]/div[1]/text()')).replace(u'律师', '')
     item['LawOrg'] = squeeze(response.xpath(card + '/div[4]/text()'))
     item['UIEmail'] = None
     item["UISignature"] = squeeze(response.css('#about::text'))
     item["Address"] = squeeze(response.xpath(card + '/div[5]/text()'))
     item["ProvinceCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(
             response.meta['province']))
     item["CityCode"] = None

     fiil_str = ''.join(
         response.xpath(
             '/html/body/div[1]/div[1]/div/div/div[3]/span/text()').extract())
     fiil_str = fiil_str.replace('\r', '').replace('\t', '').replace('\n', '')
     item['fiil_str'] = field_info_dic.find_field_by_name(
         fiil_str.split(" "))

     # Avatar path
     dirname = 'fws'
     head_url = ''.join(
         response.css('.lshil3-1-1 img::attr(src)').extract())
     item["UIPic"] = ''.join(
         http_util.downloadImage(
             [head_url],
             '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     # Fall back to the shared default portrait when nothing was stored.
     # NOTE(review): this path is spelled "/APPFile/" while downloads use
     # "/AppFile/" - confirm the casing is intended.
     if not item["UIPic"]:
         item["UIPic"] = '/APPFile/head.jpg'
     item['url'] = response.url
     return item
 def parse_detail(self, response):
     """Build a lawyer record from an HTML detail page ('hlj' source).

     Args:
         response: detail page; ``response.meta['cityname']`` supplies the
             city name used for the CityCode lookup.

     Returns:
         dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
         url fields, or None when the scraped licence number is not 17
         characters long or ``find_lawyer_by_lawlumber`` returns a match
         for it.
     """

     def squeeze(query):
         # Text of the matched nodes with all whitespace removed.  The
         # original pattern was '[^s]', which dropped the letter "s" rather
         # than whitespace and needed follow-up replace() calls.
         return ''.join(response.xpath(query).re(r'\S'))

     # Debug trace kept from the original; parenthesised so the line parses
     # under both Python 2 and Python 3.
     print(response.url)

     card = '/html/body/div[2]/div/div[3]/div/dl'

     law_number = squeeze(card + '/dd/li[3]/text()')
     if len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number, )) is not None:
         return None

     uiphone = squeeze(card + '/dd/li[6]/text()')

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
     # "[3,4,5,6,7,8]" also accepted a literal comma.
     if re.search(r'1[3-8][0-9]{9}', uiphone):
         item["UIPhone"] = uiphone
     else:
         item["UIPhone"] = None
     item['UIName'] = squeeze(card + '/dd/h1/text()')
     item["ProvinceCode"] = self.provincode
     item['LawOrg'] = squeeze(card + '/dd/li[4]/a/text()')
     item['UIEmail'] = None
     item["UISignature"] = squeeze('//*[@id="news_content_0"]/text()')
     item["Address"] = None
     item["CityCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(
             response.meta['cityname']))

     # Avatar path
     dirname = 'hlj'
     headurl = response.xpath(card + '/dt/img/@src').extract_first()
     if headurl is None:
         # No <img> on the page: skip the download instead of failing on
         # "host" + None.
         item["UIPic"] = ''
     else:
         item["UIPic"] = ''.join(
             http_util.downloadImage(
                 ["http://hl.12348.gov.cn" + headurl],
                 '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     item['url'] = response.url
     return item
Example #10
0
    def parse_lawyer_item(self, response):
        """Build a lawyer record from an HTML profile page.

        Args:
            response: profile page; ``response.meta['province']`` and
                ``response.meta['city']`` supply the names used for the
                area-code lookups.

        Returns:
            dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
            url fields, or None when the licence number is not 17
            characters long, the page shows no mobile number, or
            ``find_lawyer_by_lawlumber`` returns a match for the licence
            number.
        """
        zhiye = response.xpath(
            "//dl[@class='information_practice information_practice_new']")
        law_number = ''.join(
            zhiye.xpath(u'dd/span[contains(text(),"律师证编号:")]/text()').extract(
            )).replace(u'执业律师 (律师证编号:', '').replace(u')', '').replace(' ', '')
        if len(law_number) != 17:
            return None

        uiphone = ''.join(
            response.css('.right_consult_phone a::text').extract())
        # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
        # "[3,4,5,6,7,8]" also accepted a literal comma.
        if re.search(r'1[3-8][0-9]{9}', uiphone) is None:
            return None

        # Only continue when the licence number is not in the database yet;
        # this lookup now runs after the cheap phone check.
        if self.userInfoInfoData.find_lawyer_by_lawlumber(
                (law_number, )) is not None:
            return None

        item = {}
        item["UILawNumber"] = law_number
        item["UIPhone"] = uiphone
        item["UIName"] = ''.join(
            response.xpath('//h1[@class="lvshi_info_name"]/text()').
            extract()).replace(' ', '').replace(u"律师", '')
        item["LawOrg"] = response.xpath(
            '//p[@class="lvshi_info_add"]/text()').extract_first()
        item["Address"] = ''.join(
            response.css('.information_practice_dd::text').extract()).replace(
                ' ', '')
        item["UIEmail"] = None

        desc = ''.join(
            response.xpath("//p[@class='information_info']/span/text()").
            extract()).replace(u"\xa0", '')
        # Strip any markup remnants and layout whitespace from the bio.
        desc = re.sub(
            r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)',
            '', desc).replace("\r", '').replace("\n", '').replace(' ', '')
        if desc == '':
            item["UISignature"] = None
        else:
            item["UISignature"] = desc.replace(u"\xa0", '').replace(
                "\t", '').replace("\n", '').replace(' ', '').replace(
                    u'&amp;', '').replace('...', '')

        item["ProvinceCode"] = ''.join(
            self.areaData.find_area_by_name_return_code(
                response.meta['province']))
        item["CityCode"] = ''.join(
            self.areaData.find_area_by_name_return_code(
                response.meta['city']))
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        item["UIPic"] = ''.join(
            http_util.downloadImage([
                "http:" + ''.join(
                    response.css('.lvshi_info_pic a img::attr(src)').extract())
            ], '/AppFile/' + item["UIID"] + '/head'))
        item["url"] = response.url
        return item
Example #11
0
 def parse_detail(self, response):
     """Turn a Shanghai lawyer detail page into a LawyerInfoItem.

     Most values are the first text node of a numbered ``<li>`` inside the
     ``#detail01`` info list; fields the page does not provide are stored
     as empty strings.
     """

     def first(query):
         # First match of an XPath query, or None when nothing matches.
         return response.xpath(query).extract_first()

     def info(position):
         # Text of the n-th entry of the "info-list" block.
         return first(
             '//div[@id="detail01"]/div[@class="body"]'
             '/ul[@class="info-list clearfix"]/li[%d]/text()' % position)

     item = LawyerInfoItem()
     item["name"] = first(
         '//div[@class="list-item page"]/dl[@class="user-info"]'
         '/dd[@class="name"]/text()')

     if info(2) == u"男":
         item["sex"] = 0
     else:
         item["sex"] = 1

     item["nation"] = info(4)
     item["education"] = info(5)
     item["political_status"] = info(7)

     portrait_sources = response.xpath(
         '//dt[@class="avatar"]/img/@src').extract()
     stored = http_util.downloadImage(portrait_sources,
                                      'lawyer_pics/shanghai')
     item["headurl"] = ''.join(stored)

     item["lawnumber"] = info(1)

     credit = first('//ul[@class="user-credit"]/li/div/text()')
     if credit == u"正常":
         item["professional_status"] = 0
     else:
         item["professional_status"] = 1

     item["personnel_type"] = info(6)
     item["start_time"] = info(9)
     item["get_time"] = info(10)

     for blank_field in ("cert_type", "profession", "ispartnership"):
         item[blank_field] = ''

     item["firm"] = first(
         '//dl[@class="user-info"]/dd[@class="info"][2]/a/text()')
     item["province"] = u"上海"
     item["url"] = response.url
     item['collection'] = 'lawyers'
     return item
Example #12
0
 def parse_detail(self, response):
     """Build a lawyer record from an HTML detail page ('sichuan' source).

     Args:
         response: detail page; ``response.meta['cityname']`` supplies the
             city name used for the CityCode lookup.

     Returns:
         dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
         url fields, or None when the scraped licence number is not 17
         characters long or ``find_lawyer_by_lawlumber`` returns a match
         for it.
     """

     def joined(selector):
         # All matches concatenated into one string ('' when none).
         return ''.join(selector.extract())

     law_number = joined(response.css('.font18::text')).replace(
         u'执业证号 (', '').replace(u')', '').replace(u"\xa0", '')
     if len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number, )) is not None:
         return None

     uiphone = joined(
         response.xpath('/html/body/div[3]/table/tbody/tr[5]/td[4]/text()')
     ).replace('\t', '').replace('\n', '')

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
     # "[3,4,5,6,7,8]" also accepted a literal comma.
     if re.search(r'1[3-8][0-9]{9}', uiphone):
         item["UIPhone"] = uiphone
     else:
         item["UIPhone"] = None
     item['UIName'] = joined(response.css('.font28::text'))
     item["ProvinceCode"] = self.provincode
     item['LawOrg'] = joined(response.css('.lsjjxg3::text')).replace(
         '\t', '').replace('\n', '')
     item['UIEmail'] = joined(
         response.xpath('/html/body/div[3]/table/tbody/tr[6]/td[4]/text()')
     ).replace('\t', '').replace('\n', '')
     item["UISignature"] = None
     item["Address"] = None
     item["CityCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(
             response.meta['cityname']))

     # Avatar path
     dirname = 'sichuan'
     head_url = joined(
         response.xpath(
             '/html/body/div[3]/table/tbody/tr[1]/td[1]/img/@src'))
     # NOTE(review): images are fetched from sd.12348.gov.cn although they
     # are stored under 'sichuan' - confirm the host is correct.
     item["UIPic"] = ''.join(
         http_util.downloadImage(
             ["http://sd.12348.gov.cn/" + head_url],
             '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     item['url'] = response.url
     return item
Example #13
0
 def parse_detail(self, response):
     """Build a lawyer record from an HTML detail page ('guangxi' source).

     Args:
         response: detail page; ``response.meta['cityname']`` supplies the
             city name used for the CityCode lookup.

     Returns:
         dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
         fiil_str / url fields, or None when the scraped licence number is
         not 17 characters long or ``find_lawyer_by_lawlumber`` returns a
         match for it.  Text nodes missing from the page are stored as ''.
     """

     def clean(text):
         # extract_first() returns None when nothing matches; the original
         # then failed on None.replace().  Treat a missing node as ''.
         return (text or '').replace('\t', '').replace('\r', '').replace(
             '\n', '')

     law_number = clean(response.css('.zyzh::text').extract_first())
     if len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number, )) is not None:
         return None

     uiphone = clean(response.css('.zynx::text').extract_first())

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
     # "[3,4,5,6,7,8]" also accepted a literal comma.
     if re.search(r'1[3-8][0-9]{9}', uiphone):
         item["UIPhone"] = uiphone
     else:
         item["UIPhone"] = None
     item['UIName'] = clean(
         response.xpath(
             "//div[@class='row ryjs-top-name']/h3/text()").extract_first())
     item["ProvinceCode"] = self.provincode
     item['LawOrg'] = clean(response.css(".zyjg a::text").extract_first())
     item['UIEmail'] = None
     item["UISignature"] = None
     item['fiil_str'] = field_info_dic.find_field_by_name(''.join(
         response.css("#ywzc::attr(value)").extract()).split(u","))
     item["Address"] = clean(
         response.xpath(
             "/html/body/div[1]/div[4]/div/div[2]/div[2]/div[1]/div[5]/span/text()"
         ).extract_first())
     item["CityCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(
             response.meta['cityname']))

     # Avatar path
     dirname = 'guangxi'
     headurl = "http://gx.12348.gov.cn" + ''.join(
         response.xpath('//img[@id="img-billid"]/@src').extract()
     ).replace('..', '')
     item["UIPic"] = ''.join(
         http_util.downloadImage(
             [headurl],
             '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     item['url'] = response.url
     return item
Example #14
0
 def parse_detail(self, response):
     """Build a lawyer record from an HTML detail page ('liaoning' source).

     Each value is scraped from a "label:value" cell of the table inside
     ``div.zi35``; the part after the first colon is kept.

     Returns:
         dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
         FIID / url fields, or None when the scraped licence number is not
         17 characters long or ``find_lawyer_by_lawlumber`` returns a match
         for it.
     """
     table = response.xpath('//div[@class="zi35"]/table')

     def value_of(query):
         # Cell text without whitespace, reduced to the segment after the
         # first colon.  A cell without a colon used to raise IndexError;
         # it now yields ''.
         parts = "".join(table.xpath(query).re(r'[^\s]')).split(u':')
         return parts[1] if len(parts) > 1 else ''

     law_number = value_of('tr[11]/td[1]/text()')
     if len(law_number) != 17:
         return None
     if self.userInfoInfoData.find_lawyer_by_lawlumber(
             (law_number, )) is not None:
         return None

     uiphone = value_of('tr[7]/td/text()')

     item = {}
     item["UIID"] = str(uuid.uuid1()).replace('-', '')
     item['UILawNumber'] = law_number
     # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
     # "[3,4,5,6,7,8]" also accepted a literal comma.
     if re.search(r'1[3-8][0-9]{9}', uiphone):
         item["UIPhone"] = uiphone
     else:
         item["UIPhone"] = None
     item['UIName'] = value_of('tr[1]/td[1]/text()')
     item["ProvinceCode"] = ''.join(
         self.areaData.find_area_by_name_return_code(u'辽宁'))
     item['LawOrg'] = value_of('tr[2]/td/text()')
     item['UIEmail'] = value_of('tr[14]/td/text()')
     item["UISignature"] = None
     item['FIID'] = None
     item["Address"] = None
     item["CityCode"] = None

     # Avatar path
     dirname = 'liaoning'
     item["UIPic"] = ''.join(
         http_util.downloadImage([
             "http://218.60.145.124:8080/lnlxoa/govhall" +
             "".join(table.xpath('tr[1]/td[2]/img/@src').re(r'[^\s]'))
         ], '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
     item['url'] = response.url
     return item
Example #15
0
 def parse_detail(self, response):
     """Turn a Shaanxi lawyer detail page into a LawyerInfoItem.

     Values are the first text node of fixed ``//table/tr[r]/td[c]`` cells.
     ``sex`` is 0 for male, 1 for female and '' otherwise; ``profession``
     is the specialty cell split on the enumeration comma, i.e. a list.
     ``headurl`` is '' when the page has no portrait; previously a missing
     ``<img>`` raised TypeError while building the URL.

     Returns:
         The populated LawyerInfoItem.
     """

     def cell(row, col, tail='text()'):
         # First match for one table cell, or None when it is absent.
         return response.xpath(
             "//table/tr[%d]/td[%d]/%s" % (row, col, tail)).extract_first()

     item = LawyerInfoItem()
     item["name"] = cell(1, 1)

     sex = cell(1, 2)
     if sex is not None:
         sex = sex.strip()
     item["sex"] = {u"男": 0, u"女": 1}.get(sex, "")

     item["education"] = cell(5, 2)
     item["political_status"] = cell(2, 2)

     img_src = cell(1, 3, 'img/@src')
     if img_src is None:
         item["headurl"] = ''
     else:
         headurl = 'http://www.sxsf.gov.cn' + img_src
         item["headurl"] = ''.join(
             http_util.downloadImage([headurl], 'lawyer_pics/sshanxi'))

     item["lawnumber"] = cell(3, 1)
     item["get_time"] = cell(4, 1)
     item["cert_type"] = cell(3, 2)
     item["profession"] = ''.join(
         response.xpath("//table/tr[9]/td[1]/text()").extract()).split(u'、')
     item["firm"] = response.xpath(
         "//table/tr[10]/td[1]/a/text()").extract_first()
     item['collection'] = 'lawyers'
     item['province'] = u'陕西'
     item['url'] = response.url
     return item
Example #16
0
    def parse_lawyer_item(self, response):
        """Build a lawyer record from a lawyer.fabang.com profile page.

        Returns:
            dict with the UI* / LawOrg / ProvinceCode / CityCode / Address /
            fiil_str / url fields, or None when the licence number is not
            17 characters long, the page shows no mobile number, or
            ``find_lawyer_by_lawlumber`` returns a match for the licence
            number.

        Raises:
            AttributeError / IndexError: when the "province city" heading is
                missing or has fewer than two space-separated parts
                (unchanged from the original).
        """
        law_number = ''.join(
            response.xpath(
                u'//p[contains(text(),"执业证号:")]/text()').extract()).replace(
                    ' ', '').replace(u'执业证号:', '')
        if len(law_number) != 17:
            return None

        uiphone = ''.join(
            response.xpath(
                '//strong[@class="mobile"]/text()').extract()).replace(
                    ' ', '')
        # Mobile pattern: 1, then 3-8, then nine digits.  The previous class
        # "[3,4,5,6,7,8]" also accepted a literal comma.  The original
        # compared the phone with the undefined name ``no`` (NameError);
        # the intended check is "a phone number was found".
        if re.search(r'1[3-8][0-9]{9}', uiphone) is None:
            return None

        # Only continue when the licence number is not in the database yet.
        if self.userInfoInfoData.find_lawyer_by_lawlumber(
                (law_number, )) is not None:
            return None

        item = {}
        item["UILawNumber"] = law_number
        item["UIPhone"] = uiphone
        item["UIName"] = ''.join(
            response.xpath(
                '//strong[@class="lawyername"]/text()').extract()).replace(
                    ' ', '').replace(u"律师", '')

        item["LawOrg"] = response.xpath(
            '//p[@class="jigou"][1]/a/text()').extract_first()
        item["Address"] = ''.join(
            response.xpath(
                u'//p[contains(text(),"地\xa0\xa0\xa0\xa0址:")]/text()').
            extract()).replace(' ', '').replace(u'地\xa0\xa0\xa0\xa0址:', '')
        item["UIEmail"] = ''.join(
            response.xpath(
                u'//p[contains(text(),"邮\xa0\xa0\xa0\xa0箱:")]/text()').
            extract()).replace(' ', '').replace(u'邮\xa0\xa0\xa0\xa0箱:', '')
        fiil_str = ''.join(
            response.xpath(u'//p[contains(text(),"专长领域:")]/text()').
            extract()).replace(' ', '').replace(u'专长领域:', '')

        desc = ''.join(
            response.xpath(
                "//div[@class='content'][last()]/*").extract()).replace(
                    u"\xa0", '')
        # Strip tags/attributes and layout whitespace from the bio markup.
        desc = re.sub(
            r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)',
            '', desc).replace("\r", '').replace("\n", '').replace(' ', '')
        # Keep only the text from the share-widget marker onwards.
        # str.find returns -1 when the marker is absent, whereas the
        # original str.index call raised ValueError in that case.
        s_start_index = max(desc.find(u'分享到:'), 0)
        if desc == '':
            item["UISignature"] = None
        else:
            item["UISignature"] = desc[s_start_index:].replace(
                u'分享到:', '').replace(u"\xa0", '').replace(
                    "\t", '').replace("\n", '').replace(' ', '').replace(
                        u'&amp;', '').replace('...', '')

        province_city = response.xpath(
            '//div[@class="info_nm SG_txtc "]/text()').extract_first(
            ).replace("\r", '').replace("\n", '').split(" ")
        item["ProvinceCode"] = ''.join(
            self.areaData.find_area_by_name_return_code(province_city[0]))
        item["CityCode"] = ''.join(
            self.areaData.find_area_by_name_return_code(province_city[1]))
        item['fiil_str'] = field_info_dic.find_field_by_name(
            fiil_str.split(u"\xa0"))
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        item["UIPic"] = ''.join(
            http_util.downloadImage([
                "http://lawyer.fabang.com" + ''.join(
                    response.css('.info_img_area img::attr(src)').extract())
            ], '/AppFile/' + item["UIID"] + '/head'))
        item["url"] = response.url
        return item