Example #1
class LawtimeQuestionSpider(scrapy.Spider):
    name = "lawtime_q"
    start_urls = ["http://www.lawtime.cn"]
    areaData = AreaData()
    questionData = QuestionData()

    def parse(self, response):
        list_url = 'http://www.lawtime.cn/ask/browse_t2_p{0}.html'
        for p in range(3, 118):
            yield scrapy.Request(url=list_url.format(str(p)),
                                 callback=self.parse_question_next_page,
                                 errback=self.handle_error)

    def parse_question_next_page(self, response):
        for li in response.css(".tab-body li"):
            province_city = ''.join(li.css(".info::text").extract())
            detail_url = li.css("a::attr(href)").extract_first()
            yield scrapy.Request(url=detail_url,
                                 meta={'province_city': province_city},
                                 callback=self.parse_detail,
                                 errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        province_city = response.meta['province_city'].split('-')
        title = ''.join(response.css('.title::text').extract()).replace(
            ' ', '').replace(u'最佳答案其他', '')
        content = ''.join(response.css('.content::text').extract()).replace(
            ' ', '').replace(u'最佳答案其他', '')
        item["Content"] = content if content != '' else title
        if item["Content"] != '':
            item["ID"] = str(uuid.uuid1()).replace('-', '')
            qinfo = ''.join(response.css('.question .info::text').extract())
            item["UserName"] = qinfo[0:qinfo.index(u' ')]
            item["CreateTime"] = qinfo[qinfo.index('-'):].replace(
                u'\u3000', '').lstrip('-')
            item["UserHeadUrl"] = '/APPFile/userhead.jpg'
            province = province_city[0].replace('[', '')
            city = province_city[1].replace(']', '')
            item["ProvinceCode"] = None
            item["CityCode"] = None
            if province != None:
                prodata = self.areaData.find_area_by_name_return_code(
                    province)
                if prodata != None:
                    item["ProvinceCode"] = ''.join(prodata)
            if city != None:
                citydata = self.areaData.find_area_by_name_return_code(city)
                if citydata != None:
                    item["CityCode"] = ''.join(citydata)
            item['FIID'] = None
            item["url"] = response.url
            #ID,UserName,UserHeadUrl,Content,CreateTime,ProvinceCode,CityCode,FIID
            self.questionData.insert_free_question((
                item["ID"],
                item["UserName"],
                item["UserHeadUrl"],
                item["Content"],
                item["CreateTime"],
                item["ProvinceCode"],
                item["CityCode"],
                item["FIID"],
            ))

            item['Replys'] = []
            for r in response.xpath('//div[@class="answer-item"]'):
                reply = {}
                reply["ID"] = str(uuid.uuid1()).replace('-', '')
                reply['Content'] = ''.join(
                    r.css('.answer-w p:nth-child(1)::text').extract())
                reply['CreateTime'] = ''.join(
                    r.css('.time:nth-child(1)::text').extract())
                # UIID is left empty for now; after the import finishes, the
                # database assigns the 律小脉 account:
                # e40b0d4f5bdc4732a3bdc8c66d4269c3
                #ID,UIID,Content,QID,RID,CreateTime,IsDel
                reply['UIID'] = None
                reply['RID'] = None
                reply['QID'] = item["ID"]
                reply['IsDel'] = 0
                item['Replys'].append(reply)
                self.questionData.insert_free_reply(
                    (reply["ID"], reply['UIID'], reply['Content'],
                     reply['QID'], reply['RID'], reply['CreateTime'],
                     reply['IsDel']))
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
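# AreaData and QuestionData come from the project's data-access layer and are
# not shown in these snippets. insert_free_question / insert_free_reply take
# positional tuples in the column order spelled out in the inline comments
# above. A hypothetical sqlite3 stand-in honouring that order (table names
# and column types are assumptions):
import sqlite3

class FakeQuestionData(object):
    def __init__(self, path=':memory:'):
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS FreeQuestion (ID TEXT, UserName TEXT,'
            ' UserHeadUrl TEXT, Content TEXT, CreateTime TEXT,'
            ' ProvinceCode TEXT, CityCode TEXT, FIID TEXT)')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS FreeReply (ID TEXT, UIID TEXT,'
            ' Content TEXT, QID TEXT, RID TEXT, CreateTime TEXT,'
            ' IsDel INTEGER)')

    def insert_free_question(self, row):
        # row: (ID, UserName, UserHeadUrl, Content, CreateTime,
        #       ProvinceCode, CityCode, FIID)
        self.conn.execute(
            'INSERT INTO FreeQuestion VALUES (?,?,?,?,?,?,?,?)', row)
        self.conn.commit()

    def insert_free_reply(self, row):
        # row: (ID, UIID, Content, QID, RID, CreateTime, IsDel)
        self.conn.execute(
            'INSERT INTO FreeReply VALUES (?,?,?,?,?,?,?)', row)
        self.conn.commit()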
Example #2
class SiChuanLawyerSpider(scrapy.Spider):
    name = "sichuan_law_spider"
    start_urls = ["http://fwpt.scsf.gov.cn/lsfw/lsfw.shtml"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '51'
    baseurl = "http://fwpt.scsf.gov.cn/lsfw/lsfwlist.shtml"

    def parse(self, response):
        for item in response.css(".dropdown a"):
            prostr = ''.join(item.xpath("@onclick").extract())
            citycode = prostr.replace(u"sfjdjg('", '').replace(u"')", '')
            cityname = ''.join(item.xpath("text()").extract())
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={
                    'areacode': citycode,
                    'cityname': cityname
                },
                formdata={
                    "page": '1',
                    'fydm': citycode,
                    'kplb': '2'
                })

    def parseAjaxPageList(self, response):
        pagecount = int(''.join(
            response.xpath(u'//a[last()]/@onclick').extract()).replace(
                u'query(', '').replace(u')', ''))
        for page in range(1, pagecount):
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta={
                    'areacode': response.meta['areacode'],
                    'cityname': response.meta['cityname']
                },
                formdata={
                    "page": str(page),
                    'fydm': response.meta['areacode'],
                    'kplb': '2'
                })

    def parseAjaxList(self, response):
        for i in response.xpath(
                "//div[@class='synopsis_N fl']/a/@href").extract():
            detail_url = 'http://fwpt.scsf.gov.cn/' + i
            yield scrapy.Request(
                url=detail_url,
                method="GET",
                dont_filter=True,
                callback=self.parse_detail,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    # Detail page
    def parse_detail(self, response):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = ''.join(
            response.xpath('/html/body/div[3]/table/tbody/tr[5]/td[4]/text()').
            extract()).replace('\t', '').replace('\n', '')
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.css('.font18::text').extract()).replace(
                u'执业证号 (', '').replace(u')', '').replace(u"\xa0", '')
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = ''.join(response.css('.font28::text').extract())
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = ''.join(
                response.css('.lsjjxg3::text').extract()).replace('\t',
                                                                  '').replace(
                                                                      '\n', '')
            item['UIEmail'] = ''.join(
                response.xpath(
                    '/html/body/div[3]/table/tbody/tr[6]/td[4]/text()').
                extract()).replace('\t', '').replace('\n', '')
            item["UISignature"] = None
            item["Address"] = None
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Avatar image path
            dirname = 'sichuan'
            head_url = ''.join(
                response.xpath(
                    '/html/body/div[3]/table/tbody/tr[1]/td[1]/img/@src').
                extract())
            item["UIPic"] = ''.join(
                http_util.downloadImage(["http://sd.12348.gov.cn/" + head_url],
                                        '/AppFile/' + dirname + "/" +
                                        item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
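# This and the following lawyer spiders all gate on the same inline Chinese
# mobile-number pattern before keeping a phone value. The same gate as a
# standalone sketch (assuming 11-digit numbers whose second digit is 3-8,
# as the spiders' pattern does):
import re

MOBILE_RE = re.compile(r'1[3-8][0-9]{9}')

def first_mobile(text):
    # First mobile-looking substring, or None when nothing matches.
    m = MOBILE_RE.search(text or '')
    return m.group(0) if m else None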
Example #3
#!/usr/bin/python
#-*- coding: utf-8 -*-

from dal.service.AreaData import AreaData
import sys
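# Python 2-only hack: re-expose sys.setdefaultencoding (hidden after
# interpreter start-up) so implicit str/unicode conversions default to UTF-8.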
reload(sys)
sys.setdefaultencoding('utf-8')
if __name__ == "__main__":
    areaData = AreaData()
    print areaData.find_area_by_name_return_code((u'北京', ))
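# AreaData.find_area_by_name_return_code belongs to the project's DAL and is
# not shown here. Its contract, inferred from the call sites, is area name in,
# area code (or None) out; some callers pass a (name,) tuple, others a bare
# string. A hypothetical in-memory stand-in with placeholder codes:
class FakeAreaData(object):
    _codes = {u'北京': '11', u'辽宁': '21'}  # placeholder rows

    def find_area_by_name_return_code(self, name):
        if isinstance(name, tuple):  # accept the (name,) form too
            name = name[0]
        return self._codes.get(name)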
Example #4
class HeBeiLawyerSpider(scrapy.Spider):
    name = "hebei_law_spider"
    start_urls = ["http://he.12348.gov.cn/skywcm/webpage/search/index.jsp"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '13'
    baseurl = "http://he.12348.gov.cn/skywcm/webpage/search/search_do.jsp"

    def parse(self, response):
        isflag = 0
        for item in response.xpath("//dl[@class='searchTab1']/dd"):
            if isflag < 3:
                isflag = isflag + 1
                continue
            citycode = item.xpath('@data').extract_first().replace(
                u"{districtcode:'", "").replace(u"'}", '')
            cityname = item.xpath('a/text()').extract_first()
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={
                    'areacode': citycode,
                    'cityname': cityname
                },
                formdata={
                    "pageNum": '1',
                    'pageSize': str(self.pagesize),
                    'districtcode': citycode,
                    'type': '2',
                    'businessType': '1',
                    'pkid': '0',
                    't': str(int(time.time()))
                })

    def parseAjaxPageList(self, response):
        data = json.loads(response.body_as_unicode())
        pagecount = int(data['pageCount'])
        for page in range(1, pagecount):
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta={'cityname': response.meta["cityname"]},
                formdata={
                    "pageNum": str(page),
                    'pageSize': str(self.pagesize),
                    'districtcode': response.meta['areacode'],
                    'type': '2',
                    'businessType': '1',
                    'pkid': '0',
                    't': str(int(time.time()))
                })

    def parseAjaxList(self, response):
        data = json.loads(response.body_as_unicode())
        for item in data['datas']:
            item['cityname'] = response.meta['cityname']
            yield self.parse_detail(item)

    # Detail page
    def parse_detail(self, data):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = data.get('cell_phone', '')
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = data.get('accountcode')
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = data['user_name']
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = data.get('accountorg')
            item['UIEmail'] = data.get('email')
            item["UISignature"] = None
            item["Address"] = data.get('address')
            item['UISex'] = 0 if data['sex'] == 1 else 1
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (data['cityname'])))
            # Avatar image path
            dirname = "hebei"
            item["UIPic"] = ''.join(
                http_util.downloadImage([data['picImg']],
                                        '/AppFile/' + dirname + "/" +
                                        item["UIID"] + '/head'))
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #5
class HeiLongJiangLawyerSpider(scrapy.Spider):
    name = "hlj_law_spider"
    start_urls = [
        "http://hl.12348.gov.cn/gfpt/public/gfpt/ggflfw/wsbs/ls/tolist?dqbm=23"
    ]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '23'
    baseurl = "http://hl.12348.gov.cn/gfpt/public/gfpt/ggflfw/wsbs/ls/listlsry"

    def parse(self, response):
        for item in response.css(
                "#shiqu_second li a::attr(onclick)").extract():
            prostr = item.split(u',')
            citycode = prostr[0].replace(u'xzShi(', '').replace(u"'", '')
            cityname = prostr[1].replace(u"'", '')
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={
                    'citycode': citycode,
                    'cityname': cityname
                },
                formdata={
                    "dqPage": '1',
                    'countSize': str(self.pagesize),
                    'startSize': '1',
                    'dqbm': citycode,
                    'type': '1',
                    'rymc': u'请输入关键词'
                })

    def parseAjaxPageList(self, response):
        yieldlist = []
        data = json.loads(response.body_as_unicode())
        pagecount = int(data['countPage'])
        yieldlist.extend(self.parseAjaxList(response))
        for page in range(2, pagecount):
            countSize = self.pagesize * page
            startSize = self.pagesize + page
            yieldlist.append(
                scrapy.FormRequest(
                    url=self.baseurl,
                    method="POST",
                    headers={'X-Requested-With': 'XMLHttpRequest'},
                    dont_filter=True,
                    callback=self.parseAjaxList,
                    errback=self.handle_error,
                    meta={
                        'citycode': response.meta['citycode'],
                        'cityname': response.meta['cityname']
                    },
                    formdata={
                        "dqPage": str(page),
                        'countSize': str(countSize),
                        'startSize': str(startSize),
                        'dqbm': response.meta['citycode'],
                        'type': '1',
                        'rymc': u'请输入关键词'
                    }))
        return yieldlist

    def parseAjaxList(self, response):
        data = json.loads(response.body_as_unicode())
        detail_url = 'http://hl.12348.gov.cn/gfpt/public/gfpt/ggflfw/wsbs/ls/ryDetail?rybm={0}'
        for item in data['lsrylist']:
            yield scrapy.Request(
                url=detail_url.format(item[0]),
                method="GET",
                dont_filter=True,
                meta={'cityname': response.meta['cityname']},
                errback=self.handle_error,
                callback=self.parse_detail,
            )

    # Detail page
    def parse_detail(self, response):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        print response.url
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = ''.join(
            response.xpath(
                '/html/body/div[2]/div/div[3]/div/dl/dd/li[6]/text()').re(
                    '[^\s]')).replace(' ', '').replace('\t',
                                                      '').replace('\n', '')
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.xpath(
                '/html/body/div[2]/div/div[3]/div/dl/dd/li[3]/text()').re(
                    '[^\s]')).replace(' ', '').replace('\t',
                                                      '').replace('\n', '')
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = ''.join(
                response.xpath(
                    '/html/body/div[2]/div/div[3]/div/dl/dd/h1/text()').re(
                        '[^\s]')).replace(' ',
                                         '').replace('\t',
                                                     '').replace('\n', '')
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = ''.join(
                response.xpath(
                    '/html/body/div[2]/div/div[3]/div/dl/dd/li[4]/a/text()').
                re('[^\s]')).replace(' ', '').replace('\t',
                                                     '').replace('\n', '')
            item['UIEmail'] = None
            item["UISignature"] = ''.join(
                response.xpath('//*[@id="news_content_0"]/text()').re(
                    '[^\s]')).replace(' ', '').replace('\t', '').replace(
                        '\n', '').replace('\r', '')
            item["Address"] = None
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Avatar image path
            dirname = 'hlj'
            headurl = response.xpath(
                '/html/body/div[2]/div/div[3]/div/dl/dt/img/@src'
            ).extract_first()
            item["UIPic"] = ''.join(
                http_util.downloadImage(["http://hl.12348.gov.cn" + headurl],
                                        '/AppFile/' + dirname + "/" +
                                        item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
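# http_util.downloadImage is a project helper used by the lawyer spiders'
# detail parsers; its source is not part of these snippets. A hypothetical
# minimal stand-in, inferred from the call sites (a list of image URLs plus
# a save-path prefix in, an iterable of stored paths out; every detail here
# is an assumption):
import os
import requests

def downloadImage(urls, save_prefix):
    saved = []
    for i, url in enumerate(urls):
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            continue
        path = '%s_%d.jpg' % (save_prefix, i)
        dirpart = os.path.dirname(path)
        if dirpart and not os.path.isdir(dirpart):
            os.makedirs(dirpart)
        with open(path, 'wb') as f:
            f.write(resp.content)
        saved.append(path)
    return saved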
Example #6
class GuangXiLawyerSpider(scrapy.Spider):
    name = "guangxi_law_spider"
    start_urls = ["http://gx.12348.gov.cn/lssws/index.jhtml"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '45'
    baseurl = "http://gx.12348.gov.cn/lssws/index_1.jhtml?qkey=mscms.ms.getLSList&args_code={0}"

    def parse(self, response):
        isflag = 0
        for item in response.css(".content-inquiry-city button"):
            if isflag == 0:
                isflag = 1
                continue
            citycode = item.xpath("@q").extract_first()[0:4]
            cityname = item.xpath("text()").extract_first()
            city_href = self.baseurl.format(citycode)
            yield scrapy.FormRequest(
                url=city_href,
                method="GET",
                dont_filter=True,
                callback=self.parsePageList,
                errback=self.handle_error,
                meta={
                    'cityname': cityname,
                    'citycode': citycode
                },
            )

    def parsePageList(self, response):
        pagecountstr = response.css('#totalnum::attr(value)').extract_first()
        pagecount = (int(pagecountstr) - 1) / (self.pagesize + 1)
        page_next_url = "http://gx.12348.gov.cn/lssws/index_{0}.jhtml?qkey=mscms.ms.getLSList&args_code={1}"
        for page in range(1, pagecount):
            yield scrapy.FormRequest(
                url=page_next_url.format(str(page), response.meta['citycode']),
                method="GET",
                dont_filter=True,
                callback=self.parseList,
                errback=self.handle_error,
                meta=response.meta,
            )

    def parseList(self, response):
        for item in response.css(
                '.search-results-box a::attr(href)').extract():
            detail_url = "http://gx.12348.gov.cn" + item.replace('..', '')
            yield scrapy.FormRequest(
                url=detail_url,
                method="GET",
                dont_filter=True,
                callback=self.parse_detail,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    # Detail page
    def parse_detail(self, response):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = response.css('.zynx::text').extract_first().replace(
            '\t', '').replace('\r', '').replace('\n', '')
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = response.css(
            '.zyzh::text').extract_first().replace('\t', '').replace(
                '\r', '').replace('\n', '')
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = response.xpath(
                "//div[@class='row ryjs-top-name']/h3/text()").extract_first(
                ).replace('\t', '').replace('\r', '').replace('\n', '')
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = response.css(
                ".zyjg a::text").extract_first().replace('\t', '').replace(
                    '\r', '').replace('\n', '')
            item['UIEmail'] = None
            item["UISignature"] = None
            item['fiil_str'] = field_info_dic.find_field_by_name(''.join(
                response.css("#ywzc::attr(value)").extract()).split(u","))
            item["Address"] = response.xpath(
                "/html/body/div[1]/div[4]/div/div[2]/div[2]/div[1]/div[5]/span/text()"
            ).extract_first().replace('\t', '').replace('\r',
                                                        '').replace('\n', '')
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Avatar image path
            dirname = 'guangxi'
            headurl = "http://gx.12348.gov.cn" + ''.join(
                response.xpath('//img[@id="img-billid"]/@src').extract()
            ).replace('..', '')
            item["UIPic"] = ''.join(
                http_util.downloadImage([headurl], '/AppFile/' + dirname +
                                        "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
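# parsePageList above derives the page total as (totalnum - 1) / (pagesize + 1).
# If #totalnum holds the total record count, the conventional arithmetic is
# ceiling division; a minimal sketch of that form for comparison:
def page_count(total_records, page_size=20):
    # 81 records at 20 per page -> 5 pages, with no float division.
    return (total_records + page_size - 1) // page_size

assert page_count(81, 20) == 5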
Example #7
class JiangXiLawyerSpider(scrapy.Spider):
    name = "jiangxi_law_spider"
    start_urls = ["http://lawnew.jxsf.gov.cn/views/lawyerInfo/findLawyer.jsp"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '38'
    baseurl = "http://lawnew.jxsf.gov.cn/flfw-jx/portEmpLs/queryLSList?{0}"

    def parse(self, response):
        province_city_dic = [
            {'citycode': '3601', 'cityname': u'南昌市'},
            {'citycode': '3602', 'cityname': u'景德镇市'},
            {'citycode': '3603', 'cityname': u'萍乡市'},
            {'citycode': '3604', 'cityname': u'九江市'},
            {'citycode': '3605', 'cityname': u'新余市'},
            {'citycode': '3606', 'cityname': u'鹰潭市'},
            {'citycode': '3607', 'cityname': u'赣州市'},
            {'citycode': '3608', 'cityname': u'吉安市'},
            {'citycode': '3609', 'cityname': u'宜春市'},
            {'citycode': '3610', 'cityname': u'抚州市'},
            {'citycode': '3611', 'cityname': u'上饶市'},
        ]
        for item in province_city_dic:
            news_url = self.baseurl.format("pageSize=" + str(self.pagesize) +
                                           "&pageNum=1&city=" +
                                           item['citycode'] +
                                           "&selInfo=&ywzc=&_=" +
                                           str(int(time.time())))
            yield scrapy.Request(
                url=news_url,
                method="GET",
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={'cityname': item['cityname']},
            )

    def parseAjaxPageList(self, response):
        data = json.loads(response.body_as_unicode())
        pagecount = int(data['content']['pages'])
        for page in range(1, pagecount):
            page_url = re.sub(r'&pageNum=\d+', '&pageNum=' + str(page),
                              response.url)
            detail_url = re.sub(r'&_=.*', '&_=' + str(int(time.time())),
                                page_url)
            yield scrapy.Request(
                url=detail_url,
                method="GET",
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    def parseAjaxList(self, response):
        data = json.loads(response.body_as_unicode())['content']['list']
        for item in data:
            yield self.parse_detail(item, response.meta['cityname'])

    # Detail page
    def parse_detail(self, data, cityname):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = data.get('sjhm', '')
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = data.get('zyzh')
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = data['lsxm']
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = data.get('swsmc')
            item['UIEmail'] = data.get('dzyx')
            item["UISignature"] = data.get('grjj')
            address = data.get('deptAdress')
            item["Address"] = None if address == None else address.replace(
                "\r", '').replace("\n", '').replace(' ', '')
            sex = data.get('xb')
            item['UISex'] = None if sex == None else (0 if sex == u'男' else 1)
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(cityname))
            fiil_str = data.get('ywzcmc')
            if fiil_str != None:
                item['fiil_str'] = field_info_dic.find_field_by_name(
                    fiil_str.split(u","))
            # Avatar image path
            dirname = "jiangxi"
            item["UIPic"] = ''.join(
                http_util.downloadImage([
                    'http://lawnew.jxsf.gov.cn/flfw-jx/views/picture/' +
                    data['lszp']
                ], '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
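# parseAjaxPageList above pages through results by rewriting the query string
# in place; the same two substitutions in isolation, on a URL shaped like the
# one parse() builds:
import re
import time

url = ('http://lawnew.jxsf.gov.cn/flfw-jx/portEmpLs/queryLSList?pageSize=20'
       '&pageNum=1&city=3601&selInfo=&ywzc=&_=1500000000')
url = re.sub(r'&pageNum=\d+', '&pageNum=2', url)
url = re.sub(r'&_=.*', '&_=' + str(int(time.time())), url)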
Example #8
class ZhaoFaLawyerSpider(scrapy.Spider):
    name = "zhaofa_lawyer"
    start_urls = ["http://china.findlaw.cn/beijing/lawyer"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()

    def parse(self, response):
        provinceSet = []
        urlmetadata = []
        child_city_url = 'http://china.findlaw.cn/area_front/index.php?c=ajax&a=getChildCity'
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
        }
        for item in response.xpath(
                "//select[@id='province']/option/@value").extract():
            if item != '':
                provinceSet.append(item)
        for pro in provinceSet:
            res = requests.post(child_city_url,
                                data={
                                    'areacode': pro,
                                    'profPy': None,
                                    'typeid': '0'
                                },
                                headers=headers)
            data = res.json()
            for key in data['data']:
                try:
                    urlmetadata.append({
                        'url': data['data'][key]['url'],
                        'province': data['data'][key]['province'],
                        'city': data['data'][key]['city']
                    })
                except Exception:
                    pass
        for item in urlmetadata:
            yield scrapy.Request(url=item['url'],
                                 meta={
                                     'province': item['province'],
                                     'city': item['city']
                                 },
                                 callback=self.parse_lawyer_next_page,
                                 errback=self.handle_error)

    def parse_lawyer_next_page(self, response):
        last_page_url = ''.join(
            response.xpath(
                "//div[@class='common-pagination']/a[last()]/@href").extract())
        self.parse_lawyer_list(response)
        if last_page_url != '':
            pagecount = int(
                re.match(r'.*/p_(?P<page>\d+)/', last_page_url).group('page'))
            for page in range(2, pagecount):
                list_url = response.url + '/p_' + str(page)
                yield scrapy.Request(url=list_url,
                                     meta={
                                         'province': response.meta['province'],
                                         'city': response.meta['city']
                                     },
                                     callback=self.parse_lawyer_list,
                                     errback=self.handle_error)

    def parse_lawyer_list(self, response):
        for item in response.css(".sr-list li"):
            detail_url = item.css(".lawyer_name::attr(href)").extract_first()
            yield scrapy.Request(url=detail_url,
                                 meta={
                                     'province': response.meta['province'],
                                     'city': response.meta['city']
                                 },
                                 callback=self.parse_lawyer_item,
                                 errback=self.handle_error)

    def parse_lawyer_item(self, response):
        item = {}
        zhiye = response.xpath(
            "//dl[@class='information_practice information_practice_new']")
        # print ''.join(zhiye.extract())
        item["UILawNumber"] = ''.join(
            zhiye.xpath(u'dd/span[contains(text(),"律师证编号:")]/text()').extract(
            )).replace(u'执业律师 (律师证编号:', '').replace(u')', '').replace(' ', '')
        uiphone = ''.join(
            response.css('.right_consult_phone a::text').extract())

        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item["UIPhone"] = None if match_count == 0 else uiphone
        # Only proceed when the licence number is not already in the database
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None and item["UIPhone"] != None:
            item["UIName"] = ''.join(
                response.xpath('//h1[@class="lvshi_info_name"]/text()').
                extract()).replace(' ', '').replace(u"律师", '')
            item["LawOrg"] = response.xpath(
                '//p[@class="lvshi_info_add"]/text()').extract_first()
            item["Address"] = ''.join(
                response.css(
                    '.information_practice_dd::text').extract()).replace(
                        ' ', '')
            item["UIEmail"] = None
            desc = ''.join(
                response.xpath("//p[@class='information_info']/span/text()").
                extract()).replace(u"\xa0", '')
            desc = re.sub(
                r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)',
                '', desc).replace("\r", '').replace("\n", '').replace(' ', '')
            item["UISignature"] = None if desc == '' else desc.replace(
                u"\xa0", '').replace("\t", '').replace("\n", '').replace(
                    ' ', '').replace(u'&amp;', '').replace('...', '')
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['province'])))
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['city'])))
            item["UIID"] = str(uuid.uuid1()).replace('-', '')
            item["UIPic"] = ''.join(
                http_util.downloadImage([
                    "http:" + ''.join(
                        response.css(
                            '.lvshi_info_pic a img::attr(src)').extract())
                ], '/AppFile/' + item["UIID"] + '/head'))
            item["url"] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #9
class ShanDongLawyerSpider(scrapy.Spider):
    name = "shandong_law_spider"
    start_urls = ["http://www.sd12348.gov.cn/channels/ch00630/"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    pagesize = 20
    provincode = '37'
    baseurl = "http://www.sd12348.gov.cn/sftIDC/select/search.do"

    def parse(self, response):
        isflag = 0
        for item in response.css("#cityDiv ul li a::attr(href)").extract():
            if isflag == 0:
                isflag = 1
                continue
            prostr = item.split(u',')
            citycode = prostr[0].replace(u'javascript:changeCitya(',
                                         '').replace(u"'", '')
            cityname = prostr[1].replace(u"'", '').replace(u");", '')
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxPageList,
                errback=self.handle_error,
                meta={
                    'pageSize': str(self.pagesize),
                    'areacode': citycode,
                    'cityname': cityname,
                    'type': 'lawyer',
                    'flag': '0',
                    'status': '0'
                },
                formdata={
                    "page": '1',
                    'pageSize': str(self.pagesize),
                    'areacode': citycode,
                    'type': 'lawyer',
                    'flag': '0',
                    'status': '0'
                })

    def parseAjaxPageList(self, response):
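        # Scrapy 1.x idiom below; current Scrapy exposes the same payload as
        # response.text (and response.json() from Scrapy 2.2 on).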
        data = json.loads(response.body_as_unicode())
        pagecount = (int(data['totalCount']) - 1) / (self.pagesize + 1)
        for page in range(1, pagecount):
            response.meta['page'] = str(page)
            yield scrapy.FormRequest(
                url=self.baseurl,
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parseAjaxList,
                errback=self.handle_error,
                meta=response.meta,
                formdata={
                    "page": str(page),
                    'pageSize': str(self.pagesize),
                    'areacode': response.meta['areacode'],
                    'type': 'lawyer',
                    'flag': '0',
                    'status': '0'
                })

    def parseAjaxList(self, response):
        data = json.loads(response.body_as_unicode())
        detail_url = 'http://sd.12348.gov.cn/sftIDC/lawworkmanage/findPersonnelListByid.do?type=lawyer&id={0}'
        for i in data['list']:
            yield scrapy.FormRequest(
                url=detail_url.format(i['id']),
                method="POST",
                headers={'X-Requested-With': 'XMLHttpRequest'},
                dont_filter=True,
                callback=self.parse_detail,
                errback=self.handle_error,
                meta={'cityname': response.meta['cityname']},
            )

    # Detail page
    def parse_detail(self, response):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        data = json.loads(response.body_as_unicode())
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = data['telnum']
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = data['licenseno']
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = data['name']
            item["ProvinceCode"] = self.provincode
            item['LawOrg'] = data['lawfirmname']
            item['UIEmail'] = None
            item["UISignature"] = data['lawyerinfo']
            item['fiil_str'] = field_info_dic.find_field_by_name(
                data['zhuangchang'])
            item["Address"] = data['lawfirmaddress']
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['cityname'])))
            # Avatar image path
            dirname = self.name
            item["UIPic"] = ''.join(
                http_util.downloadImage(
                    ["http://sd.12348.gov.cn" + data['logourl']],
                    '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #10
class LiaoNingLawyerSpider(scrapy.Spider):
    name = "liaoning_law_spider"
    start_urls = ["http://218.60.145.124:8080/lnlxoa/govhall/lawyerResult.jsp"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()

    def parse(self, response):
        baseurl = "http://218.60.145.124:8080/lnlxoa/govhall/lawyerResultOne.jsp?pn={0}"
        # 1145 list pages in total
        for i in range(1, 1145):
            yield scrapy.Request(
                url=baseurl.format(str(i)),
                method="GET",
                callback=self.parse_list,
                meta={"dont_redirect": True},
                errback=self.handle_error,
            )

    def parse_list(self, response):
        url = 'http://218.60.145.124:8080/lnlxoa/govhall/{0}'
        for i in response.css('.zi11 a::attr(href)'):
            yield scrapy.Request(
                url=url.format(i.extract()),
                method="GET",
                callback=self.parse_detail,
                meta={"dont_redirect": True},
                errback=self.handle_error,
            )

    # Detail page
    def parse_detail(self, response):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        table = response.xpath('//div[@class="zi35"]/table')
        uiphone = "".join(
            table.xpath('tr[7]/td/text()').re('[^\s]')).split(u':')[1]
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = "".join(
            table.xpath('tr[11]/td[1]/text()').re('[^\s]')).split(u':')[1]
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = "".join(
                table.xpath('tr[1]/td[1]/text()').re('[^\s]')).split(u':')[1]
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code((u'辽宁')))
            item['LawOrg'] = "".join(
                table.xpath('tr[2]/td/text()').re('[^\s]')).split(u':')[1]
            item['UIEmail'] = "".join(
                table.xpath('tr[14]/td/text()').re('[^\s]')).split(':')[1]
            item["UISignature"] = None
            item['FIID'] = None
            item["Address"] = None
            item["CityCode"] = None
            # Avatar image path
            dirname = 'liaoning'
            item["UIPic"] = ''.join(
                http_util.downloadImage([
                    "http://218.60.145.124:8080/lnlxoa/govhall" +
                    "".join(table.xpath('tr[1]/td[2]/img/@src').re('[^\s]'))
                ], '/AppFile/' + dirname + "/" + item["UIID"] + '/head'))
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
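# Example #10 reads each table cell as a "label:value" pair: Selector.re with
# [^\s] drops all whitespace one character at a time, and the split on the
# full-width colon keeps the value. The same idiom with plain re, on a sample
# row:
import re

cell = u'电 话: 138 0000 0000 '
value = ''.join(re.findall(r'[^\s]', cell)).split(u':')[1]  # -> u'13800000000'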
Example #11
class ZhaoFaQuestionSpider(scrapy.Spider):
    name = "zhaofa_q"
    start_urls = ["http://china.findlaw.cn"]
    areaData = AreaData()
    questionData = QuestionData()

    def parse(self, response):
        list_url = 'http://china.findlaw.cn/ask/browse_t01_page{0}/'
        for p in range(1, 7747):
            yield scrapy.Request(url=list_url.format(str(p)),
                                 callback=self.parse_question_next_page,
                                 errback=self.handle_error)

    def parse_question_next_page(self, response):
        for li in response.css(".result-list li"):
            fieldname = ''.join(
                li.css('.rli-item.item-classify::text').extract())
            detail_url = li.css("a::attr(href)").extract_first()
            yield scrapy.Request(url=detail_url,
                                 meta={'fieldname': fieldname},
                                 callback=self.parse_detail,
                                 errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        item["Content"] = ''.join(
            response.css('.q-title::text').extract()).replace(' ', '')
        if item["Content"] != '':
            item["ID"] = str(uuid.uuid1()).replace('-', '')
            item["UserName"] = ''.join(
                response.xpath('//p[@class="q-about"]/span[1]/text()').
                extract()).replace(' ', '').replace(u"提问者:", '')
            item["CreateTime"] = ''.join(
                response.xpath('//p[@class="q-about"]/span[2]/text()').
                extract()).replace(u"时间:", '')
            item["UserHeadUrl"] = '/APPFile/userhead.jpg'
            province = ''.join(
                response.xpath("//div[@class='site-location']/a[3]/text()").
                extract()).replace(' ', '').replace(u"法律咨询", '')
            city = ''.join(
                response.xpath("//div[@class='site-location']/a[4]/text()").
                extract()).replace(' ', '').replace(u"法律咨询", '')
            item["ProvinceCode"] = None
            item["CityCode"] = None
            if province != None:
                prodata = self.areaData.find_area_by_name_return_code(
                    province)
                if prodata != None:
                    item["ProvinceCode"] = ''.join(prodata)
            if city != None:
                citydata = self.areaData.find_area_by_name_return_code(city)
                if citydata != None:
                    item["CityCode"] = ''.join(citydata)
            fiildstr = field_info_dic.find_field_by_name(
                [response.meta['fieldname']])
            item['FIID'] = None
            if len(fiildstr) > 0:
                item['FIID'] = fiildstr[0]
            item["url"] = response.url
            #ID,UserName,UserHeadUrl,Content,CreateTime,ProvinceCode,CityCode,FIID
            self.questionData.insert_free_question((
                item["ID"],
                item["UserName"],
                item["UserHeadUrl"],
                item["Content"],
                item["CreateTime"],
                item["ProvinceCode"],
                item["CityCode"],
                item["FIID"],
            ))

            item['Replys'] = []
            for r in response.xpath('//div[@class="answer"]'):
                reply = {}
                reply["ID"] = str(uuid.uuid1()).replace('-', '')
                reply['Content'] = r.css('.about-text::text').extract_first()
                reply['CreateTime'] = r.css('.an-time::text').extract_first()
                # UIID is left empty for now; after the import finishes, the
                # database assigns the 律小脉 account:
                # e40b0d4f5bdc4732a3bdc8c66d4269c3
                #ID,UIID,Content,QID,RID,CreateTime,IsDel
                reply['UIID'] = None
                reply['RID'] = None
                reply['QID'] = item["ID"]
                reply['IsDel'] = 0
                item['Replys'].append(reply)
                self.questionData.insert_free_reply(
                    (reply["ID"], reply['UIID'], reply['Content'],
                     reply['QID'], reply['RID'], reply['CreateTime'],
                     reply['IsDel']))
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
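# field_info_dic.find_field_by_name maps practice-area names to FIID codes
# for several spiders here; the inferred contract is a list of names in and
# a list of resolved codes out. A hypothetical in-memory stand-in
# (placeholder names and codes):
FIELD_CODES = {u'婚姻家庭': 'F01', u'劳动工伤': 'F02'}

def find_field_by_name(names):
    # Keep only the names that resolve, preserving input order.
    return [FIELD_CODES[n] for n in names if n in FIELD_CODES]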
Example #12
class FaBangQuestionSpider(scrapy.Spider):
    name = "fabang_q"
    start_urls = ["http://www.fabang.com"]
    areaData = AreaData()
    questionData = QuestionData()

    def parse(self, response):
        list_url = 'http://www.fabang.com/ask/browser_0_0_0_0_3_{0}.html'
        for p in range(1, 4682):
            yield scrapy.Request(url=list_url.format(str(p)),
                                 callback=self.parse_question_next_page,
                                 errback=self.handle_error)

    def parse_question_next_page(self, response):
        for li in response.css(".list ul li"):
            province_city = ''.join(
                li.css("span:nth-child(3)::text").extract())
            fieldname = ''.join(li.css('.tit::text').extract()).replace(
                u'\u3000', '').replace('[', '').replace(']', '').replace(
                    ' ', '').replace('\n', '').replace('\t',
                                                       '').replace('\r', '')
            detail_url = 'http://www.fabang.com/ask/' + li.css(
                ".tit a::attr(href)").extract_first()
            yield scrapy.Request(url=detail_url,
                                 meta={
                                     'province_city': province_city,
                                     'fieldname': fieldname
                                 },
                                 callback=self.parse_detail,
                                 errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        province_city = response.meta['province_city'].split('-')
        item["Content"] = response.xpath(
            '//div[@class="tbrig"][1]/p[1]/text()').extract()[0].replace(
                ' ', '').replace('\n', '').replace('\t', '')
        if item["Content"] != '':
            item["ID"] = str(uuid.uuid1()).replace('-', '')
            item["UserName"] = response.xpath(
                '//div[@class="tblef"][1]/span[@class="username color06b"]/a/text()'
            ).extract()[0].replace(u'\u3000', '').replace(' ', '')
            item["CreateTime"] = ''.join(
                response.css('.fenxiang.margintop10 b::text').extract())
            item["UserHeadUrl"] = '/APPFile/userhead.jpg'
            province = province_city[0]
            city = province_city[1]
            item["ProvinceCode"] = None
            item["CityCode"] = None
            if province != None:
                prodata = self.areaData.find_area_by_name_return_code(
                    province)
                if prodata != None:
                    item["ProvinceCode"] = ''.join(prodata)
            if city != None:
                citydata = self.areaData.find_area_by_name_return_code(city)
                if citydata != None:
                    item["CityCode"] = ''.join(citydata)

            fiildstr = field_info_dic.find_field_by_name(
                [response.meta['fieldname']])
            item['FIID'] = None
            if len(fiildstr) > 0:
                item['FIID'] = fiildstr[0]
            item["url"] = response.url
            #ID,UserName,UserHeadUrl,Content,CreateTime,ProvinceCode,CityCode,FIID
            self.questionData.insert_free_question((
                item["ID"],
                item["UserName"],
                item["UserHeadUrl"],
                item["Content"],
                item["CreateTime"],
                item["ProvinceCode"],
                item["CityCode"],
                item["FIID"],
            ))
            item['Replys'] = []
            isfilter = 0
            for r in response.xpath('//div[@class="tbrig"]'):
                if isfilter == 0:
                    isfilter = 1
                    continue
                reply = {}
                reply["ID"] = str(uuid.uuid1()).replace('-', '')
                reply['Content'] = ''.join(
                    r.xpath('p[1]/text()').extract()).replace(
                        ' ', '').replace('\n', '').replace('\t', '')
                ftime = ''.join(
                    r.xpath('span[1]/label[1]/b[1]/text()').extract()).replace(
                        '\n', '').replace('\t', '')
                stime = ''.join(
                    r.xpath('span[1]/b[1]/text()').extract()).replace(
                        '\n', '').replace('\t', '')
                reply['CreateTime'] = ftime if ftime != '' else stime
                # UIID is left empty for now; after the import finishes, the
                # database assigns the 律小脉 account:
                # e40b0d4f5bdc4732a3bdc8c66d4269c3
                #ID,UIID,Content,QID,RID,CreateTime,IsDel
                reply['UIID'] = None
                reply['RID'] = None
                reply['QID'] = item["ID"]
                reply['IsDel'] = 0
                item['Replys'].append(reply)
                self.questionData.insert_free_reply(
                    (reply["ID"], reply['UIID'], reply['Content'],
                     reply['QID'], reply['RID'], reply['CreateTime'],
                     reply['IsDel']))
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
Example #13
class ChinaLineLawyerSpider(scrapy.Spider):
    name = "chinaline_law_spider"
    start_urls = ["http://www.fl900.com"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    baseurl = "http://www.fl900.com/lawyer/0-0-{0}.html"

    def parse(self, response):
        for p in range(1, 1551):
            yield scrapy.FormRequest(
                url=self.baseurl.format(str(p)),
                method="GET",
                dont_filter=True,
                callback=self.parseList,
                errback=self.handle_error,
            )

    def parseList(self, response):
        for i in response.css(".lawyerlist li a:nth-child(1)"):
            detail_url = 'http://www.fl900.com' + i.xpath(
                "@href").extract_first()
            uname = i.xpath("img/@alt").extract_first()
            yield scrapy.FormRequest(url=detail_url,
                                     method="GET",
                                     dont_filter=True,
                                     callback=self.parse_detail,
                                     errback=self.handle_error,
                                     meta={'uname': uname})

    # Detail page
    def parse_detail(self, response):
        item = {}
        #  #[UIID],[UIPhone] ,[UIName] ,[UIEmail] ,[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = ''.join(
            response.xpath('/html/body/div[3]/div[2]/div[2]/li[3]/text()').
            extract()).replace(u'手机:', '').replace(u"\xa0", '')
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.xpath(
                '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[2]/label[1]/text()'
            ).extract()).replace(u'执业证号:', '').replace(u"\xa0", '')
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None:
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = response.meta['uname']
            item['LawOrg'] = ''.join(
                response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[2]/label[2]/text()'
                ).extract()).replace(u'执业机构:', '').replace(u"\xa0", '')
            item['UIEmail'] = ''.join(
                response.xpath(
                    '/html/body/div[3]/table/tbody/tr[6]/td[4]/text()').
                extract()).replace('\t', '').replace('\n', '')
            item["UISignature"] = ''.join(
                response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[1]/text()').
                extract()).replace(u"\xa0", '')
            item["Address"] = ''.join(
                response.xpath(
                    '/html/body/div[3]/div[2]/div[1]/ul[1]/li[2]/p[2]/label[3]/text()'
                ).extract()).replace(u'联系地址:', '').replace(u"\xa0", '')
            pro_city_str = ''.join(
                response.xpath('/html/body/div[3]/div[2]/div[2]/li[1]/text()').
                extract()).replace(u'地区:', '').split(' ')
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code((pro_city_str[0])))
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code((pro_city_str[1])))
            # Avatar image path
            dirname = 'fl900'
            item["UIPic"] = '/APPFile/head.jpg'
            fiil_str = response.css('.goodat span::text').extract()
            if fiil_str:  # extract() returns a list, never None
                item['fiil_str'] = field_info_dic.find_field_by_name(fiil_str)
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
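A quick aside on the mobile-number check shared by these spiders: the original character class [3,4,5,6,7,8] also matched a literal comma, which the range [3-8] used above avoids. Below is a minimal, self-contained sketch of the corrected check; the helper name looks_like_mobile is ours, not the spider's.

import re

# Corrected mainland-China mobile pattern: '1', a digit 3-8, then nine digits.
MOBILE_RE = re.compile(r'1[3-8][0-9]{9}')

def looks_like_mobile(text):
    # Hypothetical helper: True if `text` contains a plausible mobile number.
    return MOBILE_RE.search(text) is not None

print(looks_like_mobile(u'手机:13912345678'))  # True
print(looks_like_mobile(u'手机:暂无'))          # False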
Example #14
class FaWeiShiLawyerSpider(scrapy.Spider):
    name = "faweishi_law_spider"
    start_urls = ["http://m.faweishi.com/lawyer/china/"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()
    baseurl = "http://m.faweishi.com/ajax.php"

    def parse(self, response):
        for item in response.xpath("//div[@class='fenl'][2]/ul/li/a"):
            yield scrapy.Request(url="http://m.faweishi.com" +
                                 item.xpath('@href').extract_first(),
                                 method="GET",
                                 dont_filter=True,
                                 callback=self.parse_province,
                                 errback=self.handle_error,
                                 meta={
                                     'province':
                                     item.xpath('text()').extract_first(),
                                     'start':
                                     str(1)
                                 })

    def parse_province(self, response):
        requests_arr = []
        where = response.css('.lawList::attr("w")').extract_first()
        if where is not None:
            requests_arr.extend(self.parse_province_list(response))
            start = str(int(response.meta['start']) + 1)
            requests_arr.append(
                scrapy.FormRequest(url=self.baseurl,
                                   method="POST",
                                   headers={
                                       'X-Requested-With':
                                       'XMLHttpRequest',
                                       'Content-Type':
                                       'application/x-www-form-urlencoded'
                                   },
                                   dont_filter=True,
                                   callback=self.parse_province,
                                   errback=self.handle_error,
                                   formdata={
                                       'action': 'get_law',
                                       'start': start,
                                       'where': where
                                   },
                                   meta={
                                       'province': response.meta['province'],
                                       'start': start
                                   }))
            return requests_arr

    def parse_province_list(self, response):
        for item in response.css('.lawList li a::attr(href)').extract():
            yield scrapy.Request(url=item,
                                 method="GET",
                                 dont_filter=True,
                                 callback=self.parse_detail,
                                 errback=self.handle_error,
                                 meta={'province': response.meta['province']})

    # Detail page
    def parse_detail(self, response):
        item = {}
        # Columns: [UIID],[UIPhone],[UIName],[UIEmail],[UIPic],[UILawNumber],[LawOrg],[ProvinceCode],[CityCode],[Address],[UISignature]
        item["UIID"] = str(uuid.uuid1()).replace('-', '')
        uiphone = ''.join(
            response.xpath(
                '/html/body/div[1]/div[1]/div/div/div[2]/div[2]/text()').re(
                    r'[^\s+]'))
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item['UILawNumber'] = ''.join(
            response.xpath(
                "/html/body/div[1]/div[1]/div/div/div[2]/div[3]/text()").re(
                    r'[^\s+]'))
        if (len(item["UILawNumber"]) == 17
                and self.userInfoInfoData.find_lawyer_by_lawlumber(
                    (item["UILawNumber"], )) is None):
            item["UIPhone"] = None if match_count == 0 else uiphone
            item['UIName'] = ''.join(
                response.xpath(
                    "/html/body/div[1]/div[1]/div/div/div[2]/div[1]/div[1]/text()"
                ).re(r'[^\s+]')).replace(u'律师', '')
            item['LawOrg'] = ''.join(
                response.xpath(
                    '/html/body/div[1]/div[1]/div/div/div[2]/div[4]/text()').
                re(r'[^\s+]'))
            item['UIEmail'] = None
            item["UISignature"] = ''.join(
                response.css('#about::text').re(r'[^\s+]')).replace("\t", '')
            item["Address"] = ''.join(
                response.xpath(
                    '/html/body/div[1]/div[1]/div/div/div[2]/div[5]/text()').
                re(r'[^\s+]'))
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (response.meta['province'])))
            item["CityCode"] = None
            fiil_str = ''.join(
                response.xpath(
                    '/html/body/div[1]/div[1]/div/div/div[3]/span/text()').
                extract()).replace('\r', '').replace('\t',
                                                     '').replace('\n', '')
            item['fiil_str'] = field_info_dic.find_field_by_name(
                fiil_str.split(" "))
            # Avatar path
            dirname = 'fws'
            head_url = ''.join(
                response.css('.lshil3-1-1 img::attr(src)').extract())
            item["UIPic"] = ''.join(
                http_util.downloadImage([head_url], '/AppFile/' + dirname +
                                        "/" + item["UIID"] + '/head'))
            if item["UIPic"] == '' or item["UIPic"] == None:
                item["UIPic"] = '/APPFile/head.jpg'
            item['url'] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
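The parse_province method above shows a common AJAX pagination pattern: each response both yields detail-page requests and re-posts to ajax.php with an incremented start parameter, stopping when the .lawList element no longer carries a w attribute. A stripped-down sketch of that loop follows; it reuses the spider's formdata keys, while the class and method names are assumptions for illustration.

import scrapy

class PagedAjaxSketchSpider(scrapy.Spider):
    # Hypothetical minimal spider illustrating the same stop condition.
    name = "paged_ajax_sketch"
    baseurl = "http://m.faweishi.com/ajax.php"

    def parse_page(self, response):
        where = response.css('.lawList::attr("w")').extract_first()
        if where is None:
            return  # last page: the marker attribute disappears
        for href in response.css('.lawList li a::attr(href)').extract():
            yield scrapy.Request(url=href, callback=self.parse_detail)
        start = str(int(response.meta.get('start', '1')) + 1)
        yield scrapy.FormRequest(
            url=self.baseurl,
            formdata={'action': 'get_law', 'start': start, 'where': where},
            dont_filter=True,  # same ajax.php URL repeats; bypass the dupe filter
            callback=self.parse_page,
            meta={'start': start})

    def parse_detail(self, response):
        pass  # placeholder for the real detail parser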
Example #15
class FabangLawyerSpider(scrapy.Spider):
    name = "fabang_lawyer"
    start_urls = ["http://lawyer.fabang.com"]
    areaData = AreaData()
    userInfoInfoData = UserInfoInfoData()

    def parse(self, response):
        start_url = "http://lawyer.fabang.com/list/0-0-0-key-1-{0}.html"
        for page in range(1, 1075):
            yield scrapy.Request(url=start_url.format(str(page)),
                                 method="get",
                                 callback=self.parse_lawyer_list,
                                 errback=self.handle_error)

    def parse_lawyer_list(self, response):
        for detail_html in response.css(".lawyerlist"):
            detail_url = detail_html.css(".uname::attr(href)").extract_first()
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_lawyer_item,
                                 errback=self.handle_error)

    def parse_lawyer_item(self, response):
        item = {}
        item["UILawNumber"] = ''.join(
            response.xpath(
                u'//p[contains(text(),"执业证号:")]/text()').extract()).replace(
                    ' ', '').replace(u'执业证号:', '')
        uiphone = ''.join(
            response.xpath(
                '//strong[@class="mobile"]/text()').extract()).replace(
                    ' ', '')
        match_count = len(re.findall(r'1[3-8][0-9]{9}', uiphone))
        item["UIPhone"] = None if match_count == 0 else uiphone
        # Only proceed if the license number is not already in the database
        if item["UILawNumber"] != None and len(
                item["UILawNumber"]
        ) == 17 and self.userInfoInfoData.find_lawyer_by_lawlumber(
            (item["UILawNumber"], )) == None and item["UIPhone"] != no:
            item["UIName"] = ''.join(
                response.xpath(
                    '//strong[@class="lawyername"]/text()').extract()).replace(
                        ' ', '').replace(u"律师", '')

            item["LawOrg"] = response.xpath(
                '//p[@class="jigou"][1]/a/text()').extract_first()
            item["Address"] = ''.join(
                response.xpath(
                    u'//p[contains(text(),"地\xa0\xa0\xa0\xa0址:")]/text()').
                extract()).replace(' ', '').replace(u'地\xa0\xa0\xa0\xa0址:', '')
            item["UIEmail"] = ''.join(
                response.xpath(
                    u'//p[contains(text(),"邮\xa0\xa0\xa0\xa0箱:")]/text()').
                extract()).replace(' ', '').replace(u'邮\xa0\xa0\xa0\xa0箱:', '')
            fiil_str = ''.join(
                response.xpath(u'//p[contains(text(),"专长领域:")]/text()').
                extract()).replace(' ', '').replace(u'专长领域:', '')
            desc = ''.join(
                response.xpath(
                    "//div[@class='content'][last()]/*").extract()).replace(
                        u"\xa0", '')
            desc = re.sub(
                r'(<a.*?>.*?</a>)|((class|style|color|href)="[^"]*?")|(<.*?>)|(<[/].*?>)',
                '', desc).replace("\r", '').replace("\n", '').replace(' ', '')
            # str.find returns -1 when missing; str.index would raise ValueError
            s_start_index = max(desc.find(u'分享到:'), 0)
            item["UISignature"] = None if desc == '' else desc[
                s_start_index:].replace(u'分享到:', '').replace(
                    u"\xa0", '').replace("\t", '').replace("\n", '').replace(
                        ' ', '').replace(u'&amp;', '').replace('...', '')
            province_city = response.xpath(
                '//div[@class="info_nm SG_txtc "]/text()').extract_first(
                ).replace("\r", '').replace("\n", '').split(" ")
            item["ProvinceCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (province_city[0])))
            item["CityCode"] = ''.join(
                self.areaData.find_area_by_name_return_code(
                    (province_city[1])))
            item['fiil_str'] = field_info_dic.find_field_by_name(
                fiil_str.split(u"\xa0"))
            item["UIID"] = str(uuid.uuid1()).replace('-', '')
            item["UIPic"] = ''.join(
                http_util.downloadImage([
                    "http://lawyer.fabang.com" + ''.join(
                        response.css(
                            '.info_img_area img::attr(src)').extract())
                ], '/AppFile/' + item["UIID"] + '/head'))
            item["url"] = response.url
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)
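One last pitfall worth noting from parse_lawyer_item above: the original guard compared str.index against -1, but index raises ValueError when the substring is missing; only str.find returns -1. A short demonstration:

desc = u'some biography text'   # no '分享到:' marker present
print(desc.find(u'分享到:'))     # -1: safe to compare
try:
    desc.index(u'分享到:')
except ValueError:
    print('index() raises instead of returning -1')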