Example #1
0
 def parse(self, response):
     """Yield hospital items and the follow-up department requests.

     The response text is decoded as JSON and handled as a list of hospital
     dicts. For each processed entry one ``HospitalInfoItem`` is yielded;
     when the entry has a ``hospitalid`` a ``FormRequest`` to
     ``self.dept_link`` is yielded as well, handled by
     ``self.parse_dept_info``.

     :param response: response whose text is the JSON hospital list.
     """
     self.logger.info('>>>>>>正在抓取医院相关信息……')
     hospital_info = json.loads(response.text)
     # NOTE(review): the [3:4] slice restricts processing to a single entry;
     # it looks like debugging residue - confirm before widening it.
     for each_hospital in hospital_info[3:4]:
         # The source flags medicare support with the value 1.
         is_medicare = '是' if str(each_hospital.get('Ismedicalcard', '')) == '1' else '否'
         loader = CommonLoader(item=HospitalInfoItem(), response=response)
         loader.add_value('hospital_name', each_hospital.get('hospitalname', ''))
         loader.add_value('hospital_level', each_hospital.get('levelName', ''))
         loader.add_value('hospital_addr', each_hospital.get('address', ''))
         loader.add_value('hospital_pro', '四川')
         loader.add_value('hospital_city', each_hospital.get('areaName', ''))
         loader.add_value('is_medicare', is_medicare)
         loader.add_value('dataSource_from', self.source_from)
         loader.add_value('update_time', now_day())
         hospital_item = loader.load_item()
         yield hospital_item
         hospital_id = each_hospital.get('hospitalid')
         if hospital_id:
             # The Referer has to be in place before the request is built:
             # the request takes its own copy of the headers on construction,
             # so assigning it afterwards never reached this request.
             self.headers['Referer'] = 'http://www.scgh114.com/web/register/gh'
             dept_request = FormRequest(self.dept_link,
                                        headers=self.headers,
                                        callback=self.parse_dept_info,
                                        formdata={'hospitalId': str(hospital_id)},
                                        dont_filter=True)
             yield dept_request
Example #2
0
 def parse(self, response):
     """Yield the hospital item for ``self.hospital_name``.

     Every field except ``hospital_intro`` is a fixed value; the
     introduction is extracted from the response via XPath and passed
     through ``remove_tags``, ``custom_remove_tags`` and ``clean_info``.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour',
          '急诊和临床住院科室_24小时值班;'
          '行政及其它_上午8:00~12:00,下午14:00~17:00'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '中医医院'),
         ('hospital_addr', '四川省彭州市天彭镇南大街396号'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '彭州市'),
         ('hospital_county', ''),
         ('hospital_phone', '028-83701908'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     intro_cleaner = MapCompose(remove_tags, custom_remove_tags, clean_info)
     info_loader.add_xpath('hospital_intro',
                           '//div[@id="about-right-b"]',
                           intro_cleaner)

     # is_medicare / medicare_type are not populated here (the calls were
     # left commented out in the original).
     info_loader.add_value('registered_channel', '官网')
     info_loader.add_value('dataSource_from', '医院官网')
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()
Example #3
0
 def parse(self, response):
     """Yield the hospital item for ``self.hospital_name``.

     All fields but ``hospital_intro`` are fixed values; the introduction
     comes from the response via XPath and is cleaned with ``remove_tags``
     and ``custom_remove_tags``.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '门诊上午_8:00-12:00;门诊下午_14:00-17:30'),
         ('hospital_level', '二级甲等'),
         ('hospital_type', '公立'),
         ('hospital_category', '综合医院'),
         ('hospital_addr', '成都市东三环龙泉驿区十陵街道江华路8号'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', '龙泉驿区'),
         ('hospital_phone',
          '急救电话_028-84615120;电话咨询_028-84604546转科室;'
          '24小时医护热线_028-84615789'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath('hospital_intro',
                           '//article[@class="content"]/div',
                           MapCompose(remove_tags, custom_remove_tags))

     # Fixed values added after the introduction.
     trailing_fields = (
         ('is_medicare', '是'),
         ('medicare_type', '成都市医保、工伤保险定点医院'),
         ('registered_channel', '官网或官方微信公众号(工作日),法定节假日电话预约'),
         ('dataSource_from', '医院官网'),
     )
     for field_name, field_value in trailing_fields:
         info_loader.add_value(field_name, field_value)
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()
Example #4
0
 def parse(self, response):
     """Yield the hospital item for ``self.hospital_name``.

     All fields but ``hospital_intro`` are fixed values; the introduction
     comes from the response via XPath and is cleaned with ``remove_tags``
     and ``custom_remove_tags``.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '上午8:00—12:00;下午2:00—5:30'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '妇幼保健院'),
         ('hospital_addr', '四川省成都市双流区东升街道涧槽中街396号'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', ''),
         ('hospital_phone',
          '母婴咨询热线_028-85884888(工作日);'
          '总值班电话_028-85808438;'
          '预约挂号电话_028-85801029(7:30-19:30)'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@class="describe htmledit"]',
                           MapCompose(remove_tags, custom_remove_tags))

     # medicare_type is not populated (the call was left commented out in
     # the original).
     trailing_fields = (
         ('is_medicare', '是'),
         ('registered_channel',
          '电话预约;自助挂号机;诊室预约;'
          '医院微信公众号;健康双流;现场'),
         ('dataSource_from', '医院官网'),
     )
     for field_name, field_value in trailing_fields:
         info_loader.add_value(field_name, field_value)
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()
Example #5
0
 def parse(self, response):
     """Yield the hospital item, then a request for ``self.dep_link``.

     All fields but ``hospital_intro`` are fixed values; the introduction
     is read from the response via XPath. The follow-up request is handled
     by ``self.parse_hospital_dep`` and carries the current URL under the
     ``Referer`` meta key.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))
     info_loader = MedicalMapLoader(item=HospitalInfoItem(), response=response)

     # hospital_id and hospital_addr are not populated (both calls were left
     # commented out in the original).
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '普通门诊上午_8:00-12:00;普通门诊下午13:00-16:30'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '综合医院'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', '金堂县'),
         ('hospital_phone',
          '医院服务电话_028-84902884;服务质量监督投诉电话_028-84932532;'
          '急诊急救电话_18181938532;产科急救电话_18181938532;'
          '医保结算电话_028-84932721;'
          '预约挂号电话_028-84931443;预约挂号电话_028-84902884;预约挂号电话_028-61568616'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath(
         'hospital_intro',
         '//div[@class="baseRight-intro"]/p[position()<7]/span/text()')

     # These fields are explicitly submitted as empty strings.
     for blank_field in ('is_medicare', 'medicare_type', 'vaccine_name',
                         'is_cpc', 'is_bdc', 'cooperative_business',
                         'hospital_district'):
         info_loader.add_value(blank_field, '')

     info_loader.add_value('registered_channel', '微信公众号_' + self.hospital_name)
     info_loader.add_value('dataSource_from', '官网:http://www.jintangyy.com/index.aspx')
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()

     # Follow-up request for the department page.
     yield Request(self.dep_link,
                   callback=self.parse_hospital_dep,
                   meta={'Referer': response.url})
Example #6
0
 def parse(self, response):
     """Yield the hospital item for ``self.hospital_name``.

     All fields but ``hospital_intro`` are fixed values; the introduction
     comes from the response via XPath and is cleaned with ``remove_tags``
     and ``custom_remove_tags``.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', ''),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '妇幼保健院'),
         ('hospital_addr', '成都市温江区万春路140'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', ''),
         ('hospital_phone',
          '24小时急救电话_028-82723131;咨询电话_围产期保健_028-82715727;'
          '咨询电话_妇科门诊_028-82711383;咨询电话_儿童保健_028-82711527;'
          '咨询电话_婚检科_028-82720337;'
          '投诉电话_028-82724901(上班时间);投诉电话_13688488598(下班时间)'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@id="info_txt"]',
                           MapCompose(remove_tags, custom_remove_tags))

     # is_medicare / medicare_type are not populated (the calls were left
     # commented out in the original).
     info_loader.add_value('registered_channel', '电话预约;挂号窗口;医院微信公众号')
     info_loader.add_value('dataSource_from', '医院官网')
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()
Example #7
0
 def parse(self, response):
     """Schedule the detail and department requests for every listed hospital.

     For each hospital list entry a partially filled loader (name and level)
     is created. When the entry has a link, two requests are yielded:

     * the detail page (``/sc/`` replaced by ``/detail/``), handled by
       ``self.parse_hospital_detail``; the loader and the contact-page link
       travel along in ``meta``;
     * the original link, handled by ``self.parse_hospital_dep``.

     :param response: response containing the hospital list.
     """
     all_hospital_links = response.xpath('//div[@class="c-hidden disen-list-hos c-f12"]/ul/li')
     self.logger.info('该地区共{}家医院'.format(str(len(all_hospital_links))))
     for each_hospital_link in all_hospital_links:
         loader = YiHuLoader(item=HospitalInfoItem(), selector=each_hospital_link)
         loader.add_xpath('hospital_name', 'a/text()')
         loader.add_xpath('hospital_level', 'span/text()', MapCompose(remove_number2))
         hospital_link = each_hospital_link.xpath('a/@href').extract_first('')
         if not hospital_link:
             continue
         # Hospital information.
         hospital_detail_link = re.sub(r'/sc/', '/detail/', hospital_link)
         contact_hos_link = re.sub(r'/sc/', '/contact/', hospital_link)
         # The Referer must be set BEFORE the request is built: a request
         # copies the headers it is given on construction, so the previous
         # "assign afterwards" order left the first request without it.
         self.headers['Referer'] = response.url
         yield Request(hospital_detail_link,
                       headers=self.headers,
                       callback=self.parse_hospital_detail,
                       meta={'loader': loader,
                             'contact_hos_link': contact_hos_link})
         # Department information. The header is re-assigned because this
         # generator is resumed later and self.headers is shared, mutable
         # state that may have been changed in the meantime.
         self.headers['Referer'] = response.url
         yield Request(hospital_link,
                       headers=self.headers,
                       callback=self.parse_hospital_dep)
Example #8
0
    def parse_hospital_info(self, response):
        """Yield the hospital item and one request per department link.

        Any exception raised while building the item or the requests is
        caught and logged through ``self.logger.error``.

        :param response: hospital page; ``response.meta`` may carry
            ``hospital_level``.
        """
        self.logger.info('>>>>>>正在抓取:医院信息>>>>>>')

        info_block = '//div[@class="yy_js clearfix"]/div/dl'
        address_xpath = info_block + '/dd[1]/text()'
        phone_xpath = info_block + '/dd[2]/text()'

        try:
            # District / county is derived from the address text, when present.
            raw_address = response.xpath(address_xpath).extract_first('')
            county = (get_county2('中国|江苏省|江苏|南京市|南京', raw_address)
                      if raw_address else None)

            # Hospital information.
            strip_tags = MapCompose(custom_remove_tags)
            info_loader = CommonLoader2(item=HospitalInfoItem(),
                                        response=response)
            info_loader.add_xpath('hospital_name',
                                  '//div[@class="yy_til"]/h2/text()',
                                  strip_tags)
            info_loader.add_value('hospital_level',
                                  response.meta.get('hospital_level'),
                                  MapCompose(custom_remove_tags, clean_info))
            info_loader.add_xpath('hospital_addr', address_xpath,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_pro', '江苏省')
            info_loader.add_value('hospital_city', '南京市')
            info_loader.add_value('hospital_county', county)
            info_loader.add_xpath('hospital_phone', phone_xpath,
                                  MapCompose(custom_remove_tags))
            info_loader.add_xpath('hospital_intro', '//dd[@id="wrap"]',
                                  MapCompose(remove_tags, custom_remove_tags))
            info_loader.add_value('registered_channel', self.data_source_from)
            info_loader.add_value('dataSource_from', self.data_source_from)
            info_loader.add_value('hospital_url', response.url)
            info_loader.add_value('update_time', now_day())
            yield info_loader.load_item()

            # Department information.
            dept_hrefs = response.xpath(
                '//dl[@class="kfyy clearfix"]/dd/span/a/@href').extract()
            for href in dept_hrefs:
                # Drop the ";jsessionid=..." segment that precedes the query.
                cleaned_href = re.sub(r';jsessionid=(.*?)\?', '?', href)
                target_url = urljoin(self.host, cleaned_href)
                self.headers['Referer'] = response.url
                yield Request(target_url,
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail)
        except Exception as e:
            self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
Example #9
0
 def parse(self, response):
     """Yield the hospital item, then department and doctor requests.

     The hospital item is built from fixed values plus an XPath-extracted
     introduction. Department links are read from the 5th and 6th entries of
     the ``navul`` menu, doctor links from the 8th; each link that has both
     an href and a text yields a request carrying ``dept_name`` and
     ``Referer`` in ``meta``.

     :param response: response containing the introduction and the menu.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '门诊时间_8:30-17:30;急诊时间_7*24小时'),
         ('hospital_level', '二级甲等'),
         ('hospital_type', '公立'),
         ('hospital_category', '中医医院'),
         ('hospital_addr',
          '川化病区:青白江区化工北路41号(原川化医院);'
          '城厢病区:青白江区城厢镇大南街51号(中医血防医院);'
          '中医名医馆地址:中医医院川化病区一号楼1楼'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', '青白江区'),
         ('hospital_phone', '24小时急诊电话_028-83632835'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@class="right-about clearfix"]',
                           MapCompose(remove_tags))
     info_loader.add_value('registered_channel', '医院挂号室')
     info_loader.add_value('dataSource_from', '医院官网')
     info_loader.add_value('update_time', now_day())

     # Hospital information.
     yield info_loader.load_item()

     # Department information.
     dept_nodes = response.xpath(
         '//ul[@id="navul"]/li[5]/ul/li|//ul[@id="navul"]/li[6]/ul/li')
     for node in dept_nodes:
         href = node.xpath('a/@href').extract_first('')
         label = node.xpath('a/text()').extract_first('')
         if not (href and label):
             continue
         yield Request(urljoin(self.host, href),
                       headers=self.headers,
                       callback=self.parse_hospital_dep_detail,
                       meta={'dept_name': label, 'Referer': response.url})

     # Doctor information.
     doctor_nodes = response.xpath('//ul[@id="navul"]/li[8]/ul/li')
     for node in doctor_nodes:
         href = node.xpath('a/@href').extract_first('')
         label = node.xpath('a/text()').extract_first('')
         if not (href and label):
             continue
         yield Request(urljoin(self.host, href),
                       headers=self.headers,
                       callback=self.parse_doctor_info,
                       meta={'dept_name': label, 'Referer': response.url})
Example #10
0
 def parse(self, response):
     """Yield the hospital item, the department request and doctor requests.

     The hospital item is built from fixed values plus an XPath-extracted
     introduction. One request goes to ``self.dept_link`` and one to every
     URL in ``self.doctor_links``; all of them bypass the duplicate filter
     and carry a ``Referer`` entry in ``meta``.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '门诊上午_8:00-12:00;门诊下午_14:00-17:30'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '中医医院'),
         ('hospital_addr', '四川省成都市郫都区南大街342号'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', '郫都区'),
         ('hospital_phone',
          '咨询电话_028-87920858;急诊电话_028-87925131;预约挂号_028-87925042;'
          '投诉电话_028-87920858;人事招聘_028-87925158;医保出入院处_028-87925172;'
          '体验科_028-87922056'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@class="rightPanel"]/p[position()>2]',
                           MapCompose(remove_tags))

     trailing_fields = (
         ('is_medicare', '是'),
         ('registered_channel', '官方微信公众号或华医通APP'),
         ('dataSource_from', '医院官网'),
     )
     for field_name, field_value in trailing_fields:
         info_loader.add_value(field_name, field_value)
     info_loader.add_value('update_time', now_day())

     # Hospital information.
     yield info_loader.load_item()

     # Department information.
     yield Request(self.dept_link,
                   headers=self.headers,
                   callback=self.parse_hospital_dep,
                   dont_filter=True,
                   meta={'Referer': self.entry_url})

     # Doctor information: each page is recorded as its own referer.
     for doctor_page in self.doctor_links:
         yield Request(doctor_page,
                       headers=self.headers,
                       callback=self.parse_doctor_info,
                       dont_filter=True,
                       meta={'Referer': doctor_page})
Example #11
0
 def parse(self, response):
     """Yield the hospital item, then a request for ``self.index_link``.

     All fields but ``hospital_intro`` are fixed values; the introduction
     is read from the response via XPath. The follow-up request is handled
     by ``self.parse_hospital_dep`` and carries the current URL under the
     ``Referer`` meta key.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))
     info_loader = PxfybjyLoader(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '门诊上午_8:00-12:00;门诊下午14:00-17:30'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '妇幼保健院'),
         ('hospital_addr', '郫都区郫筒街道新南街283号'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', '郫都区'),
         ('hospital_phone',
          '产科急救_028-87922244;儿科急救_028-87931629;产检门诊_028-87924116;'
          '婚检_028-87885339;儿保科_028-87931911'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath(
         'hospital_intro',
         '//div[@class="FrontComContent_detail01-1468317290474_htmlbreak"]/'
         'p[position()<9]')

     # is_medicare, medicare_type, vaccine_name, is_cpc, is_bdc,
     # cooperative_business, hospital_district and registered_channel are
     # not populated (the calls were left commented out in the original).
     info_loader.add_value('dataSource_from',
                           '官网:http://www.pxfybjy.cn/index.html')
     info_loader.add_value('update_time', now_day())

     # Hospital information.
     yield info_loader.load_item()

     # Department information.
     yield Request(self.index_link,
                   headers=self.headers,
                   callback=self.parse_hospital_dep,
                   meta={'Referer': response.url})
Example #12
0
 def parse_hospital_info(self, response):
     """Yield a hospital item built from a JSON detail response.

     The keys read from the decoded JSON object are ``HIS_LVL``, ``HIS_AD``,
     ``HIS_NM``, ``TEL_NO`` and ``HIS_RM``. Any exception raised while
     building the item is caught and logged through ``self.logger.error``.

     :param response: response whose text is the JSON hospital detail.
     """
     self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
     # Level codes recognised by this handler; anything else yields no level.
     level_names = {'3': '三级', '2': '二级', '1': '一级'}
     try:
         detail = json.loads(response.text)

         # Hospital level: only the string codes above are mapped.
         level_code = detail.get('HIS_LVL')
         level = level_names.get(level_code) if isinstance(level_code, str) else None

         # District / county is derived from the address, when present.
         address = detail.get('HIS_AD')
         county = (get_county2('中国|广东省|广东|广州市|广州', address)
                   if address else None)

         info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
         info_loader.add_value('hospital_name', detail.get('HIS_NM'))
         info_loader.add_value('hospital_level', level)
         info_loader.add_value('hospital_category', '')
         info_loader.add_value('hospital_addr', address)
         info_loader.add_value('hospital_pro', '广东省')
         info_loader.add_value('hospital_city', '广州市')
         info_loader.add_value('hospital_county', county)
         info_loader.add_value('hospital_phone', detail.get('TEL_NO'))
         info_loader.add_value('hospital_intro', detail.get('HIS_RM'))
         info_loader.add_value('registered_channel', self.data_source_from)
         info_loader.add_value('dataSource_from', self.data_source_from)
         info_loader.add_value('hospital_url', response.url)
         info_loader.add_value('update_time', now_day())
         yield info_loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example #13
0
 def parse(self, response):
     """Yield the hospital item for ``self.hospital_name``.

     All fields but ``hospital_intro`` are fixed values; the introduction
     comes from the response via XPath and is cleaned with ``remove_tags``
     and ``custom_remove_tags``.

     :param response: response the introduction XPath is evaluated against.
     """
     self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '8:00-17:00'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '中医医院'),
         ('hospital_addr', '成都市双流区东升街道花园路二段(新院),淳化街205号(老院)'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '成都市'),
         ('hospital_county', ''),
         ('hospital_phone', '预约电话_028-69803716;体检咨询_028-85808932'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@class="text"]',
                           MapCompose(remove_tags, custom_remove_tags))
     info_loader.add_value('registered_channel', '实名制电话预约、现场预约、自助机预约、诊间预约')
     info_loader.add_value('dataSource_from', '医院官网')
     info_loader.add_value('update_time', now_day())

     # Hospital information.
     yield info_loader.load_item()
Example #14
0
 def parse_hospital_info(self, response):
     """Yield a hospital item scraped from a labelled table page.

     Each value is located through the bold label inside its table cell.
     Any exception raised while building the item is caught and logged
     through ``self.logger.error``.

     :param response: hospital detail page.
     """
     self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')

     def cell_text(label):
         # XPath for the text of the table cell whose <b> contains `label`.
         return '//b[contains(text(),"{}")]/ancestor::td[1]/text()'.format(label)

     try:
         raw_address = response.xpath(
             '//b[contains(text(),"医院地址")]/'
             'ancestor::td[1]/text()').extract_first('')
         county = get_county2('中国|广东省|广东|珠海市|珠海', raw_address)

         info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
         info_loader.add_xpath('hospital_name', cell_text('医院全称'),
                               MapCompose(custom_remove_tags))
         info_loader.add_xpath('hospital_level', cell_text('医院级别'),
                               MapCompose(custom_remove_tags))
         info_loader.add_value('hospital_addr', raw_address,
                               MapCompose(custom_remove_tags))
         info_loader.add_value('hospital_pro', '广东省')
         info_loader.add_value('hospital_city', '珠海市')
         info_loader.add_value('hospital_county', county,
                               MapCompose(custom_remove_tags))
         info_loader.add_xpath('hospital_phone', cell_text('联系电话'),
                               MapCompose(custom_remove_tags))
         # The introduction uses the whole cell, not just its text nodes.
         info_loader.add_xpath(
             'hospital_intro',
             '//b[contains(text(),"简介")]/ancestor::td[1]',
             MapCompose(remove_tags, custom_remove_tags, match_special))
         info_loader.add_value('registered_channel', self.data_source_from)
         info_loader.add_value('dataSource_from', self.data_source_from)
         info_loader.add_value('crawled_url', response.url)
         info_loader.add_value('update_time', now_day())
         yield info_loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example #15
0
 def parse(self, response):
     """Yield the hospital item, then department requests.

     The hospital item is built from fixed values plus an XPath-extracted
     introduction. Department links are read from the 4th entry of the
     ``ddsmoothmenu`` menu; a link with both an href and a text yields a
     request carrying ``dept_name`` and ``Referer`` in ``meta``.

     :param response: response containing the introduction and the menu.
     """
     self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
     info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

     # Fixed values added ahead of the introduction, in their original order.
     leading_fields = (
         ('hospital_name', self.hospital_name),
         ('consulting_hour', '门诊时间(无假日医院)8:30-17:30'),
         ('hospital_level', '三级乙等'),
         ('hospital_type', '公立'),
         ('hospital_category', '专科医院'),
         ('hospital_addr', '绵阳市经开区红塔路16号(绵阳市红星街97号)'),
         ('hospital_pro', '四川省'),
         ('hospital_city', '绵阳市'),
         ('hospital_county', ''),
         ('hospital_phone', '0816-2265553;0816-2261517'),
     )
     for field_name, field_value in leading_fields:
         info_loader.add_value(field_name, field_value)

     info_loader.add_xpath('hospital_intro',
                           '//div[@class="content-left pull-left"]',
                           MapCompose(remove_tags, custom_remove_tags))

     # medicare_type is not populated (the call was left commented out in
     # the original).
     trailing_fields = (
         ('is_medicare', '是'),
         ('registered_channel', '现场预约(一楼挂号交费处);电话预约;官网'),
         ('dataSource_from', '医院官网'),
     )
     for field_name, field_value in trailing_fields:
         info_loader.add_value(field_name, field_value)
     info_loader.add_value('update_time', now_day())
     yield info_loader.load_item()

     # Department information.
     dept_nodes = response.xpath(
         '//div[@class="ddsmoothmenu"]/ul/li[position()=4]/ul/li')
     # NOTE(review): only the first menu entry is followed ([0:1]); this
     # looks like debugging residue - confirm before widening it.
     for node in dept_nodes[0:1]:
         href = node.xpath('a/@href').extract_first('')
         label = node.xpath('a/text()').extract_first('')
         if not (href and label):
             continue
         yield Request(href,
                       headers=self.headers,
                       callback=self.parse_hospital_dep,
                       meta={'dept_name': label, 'Referer': response.url})
Example #16
0
    def parse(self, response):
        """Yield one item per hospital plus a follow-up request per hospital id.

        The response text is decoded as a JSON object whose ``list`` entry
        holds the hospital dicts. For every hospital a ``HospitalInfoItem``
        is yielded; when the entry has a ``hospitalid`` a request to
        ``self.hospital_detail_url`` (formatted with that id) is yielded,
        handled by ``self.parse_hospital_dep`` and carrying the hospital name
        in ``meta``.

        :param response: response whose text is the JSON hospital listing.
        """
        self.logger.info('>>>>>>正在抓取所有医院信息>>>>>>')
        all_hospitals = json.loads(response.text)
        # A missing or null 'list' entry means "no hospitals"; iterating the
        # bare .get() result would raise TypeError in that case.
        for each_hospital in all_hospitals.get('list') or []:
            hospital_name = each_hospital.get('hospitalname')
            hospital_address = each_hospital.get('address')
            # Only derive the county when an address is present, matching
            # the guard used by the other hospital-info handlers in this file.
            if hospital_address:
                hospital_county = get_county2('中国|广东省|广东|中山市|中山',
                                              hospital_address)
            else:
                hospital_county = None
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_value('hospital_name', hospital_name,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', '广东省')
            loader.add_value('hospital_city', '中山市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_phone',
                             each_hospital.get('telephoneno'),
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_intro',
                             each_hospital.get('information'),
                             MapCompose(custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())
            hospital_item = loader.load_item()
            yield hospital_item

            # Department and doctor information.
            hospital_id = each_hospital.get('hospitalid')
            if hospital_id:
                self.headers['Referer'] = self.entry_url
                yield Request(self.hospital_detail_url.format(hospital_id),
                              headers=self.headers,
                              callback=self.parse_hospital_dep,
                              meta={'hospital_name': hospital_name})
Example #17
0
    def parse(self, response):
        """Yield the hospital item, two department requests and doctor requests.

        The hospital item is built from fixed values plus an XPath-extracted
        introduction. ``self.dept_link`` is requested twice (both bypassing
        the duplicate filter): once for ``parse_hospital_dep_detail`` with a
        fixed ``dept_name``, once for ``parse_hospital_dep``. Every URL in
        ``self.doctor_link_list`` is requested for ``parse_doctor_info``.

        :param response: response the introduction XPath is evaluated against.
        """
        self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))
        info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

        # Fixed values added ahead of the introduction, in their original order.
        leading_fields = (
            ('hospital_name', self.hospital_name),
            ('consulting_hour', '上午_8:00-12:00;下午14:00-17:30'),
            ('hospital_level', '二级甲等'),
            ('hospital_type', '公立'),
            ('hospital_category', '综合医院'),
            ('hospital_addr', '四川都江堰市发展路89号'),
            ('hospital_pro', '四川省'),
            ('hospital_city', '都江堰市'),
            ('hospital_county', ''),
            ('hospital_phone',
             '急救电话_028-68963120;免费咨询电话_028-69219766;投诉电话_028-69263900 '),
        )
        for field_name, field_value in leading_fields:
            info_loader.add_value(field_name, field_value)

        info_loader.add_xpath('hospital_intro', '//div[@class="fleft wd740"]')
        info_loader.add_value('registered_channel', '电话')
        info_loader.add_value('dataSource_from', '医院官网')
        info_loader.add_value('update_time', now_day())

        # Hospital information.
        yield info_loader.load_item()

        # Department information.
        # A first version walked the navigation sub-menu
        # (//div[@id="head1_ksdh"]/div/div/a) but that menu is incomplete and
        # lists only 10 departments, so the department page is used instead.

        # Second version, step 1: the department shown by default on the page.
        yield Request(self.dept_link,
                      headers=self.headers,
                      callback=self.parse_hospital_dep_detail,
                      dont_filter=True,
                      meta={'dept_name': '门诊部', 'Referer': response.url})

        # Step 2: the remaining departments linked from the same page.
        yield Request(self.dept_link,
                      headers=self.headers,
                      callback=self.parse_hospital_dep,
                      dont_filter=True,
                      meta={'Referer': response.url})

        # Doctor information; the site's own pagination is awkward to use, so
        # the page URLs come from a prepared list.
        for doctor_page in self.doctor_link_list:
            yield Request(doctor_page,
                          headers=self.headers,
                          callback=self.parse_doctor_info,
                          meta={'Referer': response.url})
Example #18
0
    def parse_hospital_info(self, response):
        """Emit the hospital item, then department and doctor requests.

        Reads ``hospital_name`` from ``response.meta`` (used for logging only).

        Yields:
            One ``HospitalInfoItem`` built from the page, followed by two
            ``SplashRequest`` objects (department detail and doctor list) for
            every department anchor whose ``onclick`` payload carries a
            department id, hospital id, department name and organisation name.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院信息和科室信息>>>>>>'.format(hospital_name))

        # The 4th paragraph holds the address; the county is derived from it.
        address_parts = response.xpath(
            '//div[@class="jieshao_zi"]/p[4]/text()').extract()
        if address_parts:
            hospital_address = custom_remove_tags(''.join(address_parts))
            hospital_city2 = get_city(hospital_address)
            useless_info = '中国|湖南省|湖南|{}'.format(hospital_city2)
            hospital_county = get_county2(useless_info, hospital_address)
        else:
            hospital_county = None

        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath('hospital_name',
                         '//div[@class="jieshao_zi"]/p/font/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_level',
                         '//div[@class="jieshao_zi"]/p[2]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_type', '公立')
        loader.add_value('hospital_category', '')
        loader.add_xpath('hospital_addr',
                         '//div[@class="jieshao_zi"]/p[4]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_pro', '湖南省')
        loader.add_xpath('hospital_city',
                         '//div[@class="jieshao_zi"]/p[4]/text()',
                         MapCompose(custom_remove_tags, get_city))
        loader.add_value('hospital_county', hospital_county)
        loader.add_xpath('hospital_phone',
                         '//div[@class="jieshao_zi"]/p[3]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'hospital_intro', '//div[@id="starlist"]',
            MapCompose(remove_tags, custom_remove_tags, clean_info))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('update_time', now_day())
        yield loader.load_item()

        # Department information.
        self.logger.info('>>>>>>正在抓取[{}]科室信息>>>>>>'.format(hospital_name))
        onclick_keys = ('isSpTime', 'paymode', 'platformDeptId',
                        'platformHosId', 'tempDeptName', 'orgname')
        for each_dept_link in response.xpath('//div[@class="xuanze_kslb"]'):
            dept_type = each_dept_link.xpath(
                'div[1]/ul/li/text()').extract_first('')
            for dept_link in each_dept_link.xpath('div[2]/ul/li/a'):
                data_info = dept_link.xpath('@onclick').extract_first('')
                if not data_info:
                    continue
                # Drop all whitespace so the key:'value' pairs match reliably.
                data_info = ''.join(re.findall(r'\S+', data_info))
                fields = {}
                for key in onclick_keys:
                    found = re.search(r"{}:'(.*?)'".format(key), data_info, S)
                    fields[key] = found.group(1) if found else None

                dept_id = fields['platformDeptId']
                hos_id = fields['platformHosId']
                dept_name = fields['tempDeptName']
                org_name = fields['orgname']
                if None in (dept_id, hos_id, dept_name, org_name):
                    continue
                # These two are optional in the payload; previously a missing
                # one raised AttributeError on ``None.group(1)``.
                is_sp_time = fields['isSpTime'] or ''
                pay_mode = fields['paymode'] or ''

                data = {
                    'isSpTime': str(is_sp_time),
                    'paymode': quote(pay_mode),
                    'doctorCollectResult': '',
                    'platformDeptId': str(dept_id),
                    'orgname': quote(org_name),
                    'tempDeptName': quote(dept_name),
                    'platformHosId': str(hos_id),
                    'platformDoctorId': ''
                }
                self.headers.update({
                    'Content-Type':
                    'application/x-www-form-urlencoded',
                    'Origin': 'http://www.hnyygh.com',
                    'Referer':
                    'http://www.hnyygh.com/searchDeptmentAction.action',
                    'Pragma': 'no-cache'
                })
                # Snapshot the shared headers: the dict placed in the Splash
                # args is kept by reference, so reusing ``self.headers`` would
                # let the later Referer update leak into this request.
                dept_headers = dict(self.headers)
                yield SplashRequest(
                    self.dept_detail_url,
                    endpoint='execute',
                    args={
                        'url': self.dept_detail_url,
                        'headers': dept_headers,
                        'lua_source': self.dept_script,
                        'data': data
                    },
                    dont_filter=True,
                    headers=dept_headers,
                    callback=self.parse_hospital_dep_detail,
                    meta={
                        'dept_type': dept_type,
                        'dept_name': dept_name,
                        'hospital_name': org_name
                    })

                # Doctor information for the same department.
                data = {
                    'platformDeptId': dept_id,
                    'platformHosId': hos_id,
                    'platformDoctorId': '',
                    'nextNumInfo': '0'
                }
                self.headers.update({
                    'Content-Type':
                    'application/x-www-form-urlencoded',
                    'Origin':
                    'http://www.hnyygh.com',
                    'Referer':
                    'http://www.hnyygh.com/searchOrderNumInfoAction.action'
                })
                doctor_headers = dict(self.headers)
                yield SplashRequest(
                    self.doctor_url,
                    endpoint='execute',
                    args={
                        'url': self.doctor_url,
                        'headers': doctor_headers,
                        'lua_source': self.dept_script,
                        'data': data
                    },
                    dont_filter=True,
                    headers=doctor_headers,
                    callback=self.parse_doctor_info,
                    meta={
                        'dept_type': dept_type,
                        'dept_name': dept_name,
                        'dept_id': dept_id,
                        'hospital_name': org_name
                    })
Example #19
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page into hospital and department items.

        Yields:
            * a ``Request`` for the hospital introduction page carrying the
              partially filled loader in ``meta['loader']`` when the page
              links to one; otherwise the hospital item itself, so the
              hospital is not silently dropped;
            * one ``HospitalDepItem`` per department listed on the page.

        Any exception raised while parsing is logged and swallowed.
        """
        self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
        try:
            # Hospital level and district come from one colon-separated line.
            hospital_info = response.xpath(
                '//p[@class="yygh_box_top_p2"]').extract()
            hospital_info2 = custom_remove_tags(
                remove_tags(''.join(hospital_info)))
            info_parts = hospital_info2.split(':')
            hospital_level = info_parts[1].replace('区域', '')
            hospital_county = info_parts[2].replace('分类', '')

            # Split "<level><category>" into level (h_l) and category (h_c).
            h_l = h_c = None
            res = None
            if hospital_level:
                res = re.search(r'(.*等|.*级|.*合格|.*甲)(.*?)$', hospital_level)
            if res:
                h_l = res.group(1)
                h_c = res.group(2)
                if h_c:
                    h_c_2 = re.sub(r'合格|医院', '', h_c)
                    if h_c_2:
                        h_c = '{0}{1}'.format(h_c_2, '医院')

            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name',
                             '//p[@class="yygh_box_top_p"]/strong/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_level', h_l,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_category', h_c)
            loader.add_xpath(
                'hospital_addr',
                '//span[@class="yygh_box_con_dl_span1"]/ancestor::dl[1]/dd[1]/p/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', '')
            loader.add_value('hospital_city', '北京市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_phone',
                '//span[@class="yygh_box_con_dl_span3"]/ancestor::dl[1]/dd[1]/p/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())

            # Hospital introduction lives on a separate page.
            hospital_intro_link = response.xpath(
                '//a[contains(text(),"医院介绍")]/@href').extract_first('')
            if hospital_intro_link:
                hospital_intro_link = urljoin(self.host, hospital_intro_link)
                self.headers['Referer'] = response.url
                yield Request(hospital_intro_link,
                              headers=self.headers,
                              callback=self.parse_hospital_detail_info,
                              meta={'loader': loader})
            else:
                # No introduction page to visit: emit what was collected here
                # instead of losing the hospital record entirely.
                yield loader.load_item()

            # Department information.
            all_dept_links = response.xpath('//div[@class="kfyuks_yyksbox"]')
            for each_dept_link in all_dept_links:
                dept_type = each_dept_link.xpath(
                    'div[1]/text()').extract_first('')
                dept_info = each_dept_link.xpath(
                    'div[2]/div/ul/li/a/text()').extract()
                for each_dept_info in dept_info:
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', each_dept_info,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_xpath(
                        'hospital_name',
                        '//p[@class="yygh_box_top_p"]/strong/text()',
                        MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_info', '')
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())
                    yield dept_loader.load_item()
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example #20
0
    def parse_hospital_info(self, response):
        """Build the hospital item and one department item per listed department.

        ``response.meta['hospital_type']`` (when present) is turned into the
        hospital category. Address, phone and introduction are cut out of the
        text of the ``title_yh14`` cells with ``get_hospital_info``. Any
        exception raised while parsing is logged and swallowed.
        """
        self.logger.info('>>>>>>正在抓取:医院详细信息和科室信息>>>>>>')
        try:
            # ---- hospital ----
            type_from_meta = response.meta.get('hospital_type')
            if type_from_meta:
                category = '{0}{1}'.format(type_from_meta, '医院')
            else:
                category = None

            title_cells = response.xpath('//td[@class="title_yh14"]').extract()
            summary_text = custom_remove_tags(remove_tags(''.join(title_cells)))

            address = get_hospital_info(summary_text, '地址:', '电话:')
            if address:
                address = address.replace('查看地图', '')
            else:
                address = None

            phone = get_hospital_info(summary_text, '电话:', '官网')

            intro = get_hospital_info(summary_text, '简介:', '$')
            if intro:
                intro = intro.replace('...更多&gt;&gt;', '')
            else:
                intro = None

            strip_tags = MapCompose(custom_remove_tags)
            city_xpath = ('//td[contains(text(),"山西")]'
                          '/ancestor::tr[1]/td[1]/a[1]/text()')
            county_xpath = ('//td[contains(text(),"山西")]'
                            '/ancestor::tr[1]/td[1]/a[2]/text()')

            hospital_loader = CommonLoader2(item=HospitalInfoItem(),
                                            response=response)
            hospital_loader.add_xpath(
                'hospital_name', '//span[@class="title"]/text()', strip_tags)
            hospital_loader.add_xpath(
                'hospital_level', '//span[@class="dj"]/text()', strip_tags)
            hospital_loader.add_value('hospital_category', category)
            hospital_loader.add_value('hospital_addr', address, strip_tags)
            hospital_loader.add_value('hospital_pro', '山西省')
            hospital_loader.add_xpath('hospital_city', city_xpath, strip_tags)
            hospital_loader.add_xpath('hospital_county', county_xpath,
                                      strip_tags)
            hospital_loader.add_value('hospital_phone', phone, strip_tags)
            hospital_loader.add_value('hospital_intro', intro, strip_tags)
            hospital_loader.add_value('registered_channel',
                                      self.data_source_from)
            hospital_loader.add_value('dataSource_from', self.data_source_from)
            hospital_loader.add_value('update_time', now_day())
            yield hospital_loader.load_item()

            # ---- departments ----
            self.logger.info('>>>>>>正在抓取科室详细信息>>>>>>')
            strip_and_match = MapCompose(custom_remove_tags, match_special2)
            for row in response.xpath('//tr[@class="h_bottom"]'):
                group_label = row.xpath('td[1]/text()').extract_first('')
                names = row.xpath('td[2]/table/tr/td/a/text()').extract()
                for name in names:
                    dep = CommonLoader2(item=HospitalDepItem(),
                                        response=response)
                    dep.add_value('dept_name', name, strip_and_match)
                    dep.add_value('dept_type', group_label, strip_and_match)
                    dep.add_xpath('hospital_name',
                                  '//span[@class="title"]/text()', strip_tags)
                    dep.add_value('dept_info', '')
                    dep.add_value('dataSource_from', self.data_source_from)
                    dep.add_value('update_time', now_day())
                    yield dep.load_item()
        except Exception as e:
            self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
Example #21
0
    def parse_hospital_info(self, response):
        """Parse a hospital page in one of two layouts selected by ``data_type``.

        Reads ``hospital_name``, ``hospital_id``, ``data_type`` and
        ``province_name`` from ``response.meta``.

        * ``data_type == '1'``: yields the hospital item, one request per
          first-level department (handled by ``parse_hospital_dep``) and one
          doctor-list request.
        * ``data_type == '2'``: yields the hospital item, then for every
          department a ``HospitalDepItem`` plus a doctor-list request.

        Any exception raised while parsing is logged and swallowed.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院详细信息>>>>>>'.format(hospital_name))
        try:
            hospital_id = response.meta.get('hospital_id')
            data_type = response.meta.get('data_type')
            hospital_pro = response.meta.get('province_name')
            if data_type == '1':
                # Was '///div[...]', which is not a valid XPath location path
                # and made this whole branch fail into the except below.
                hospital_address = response.xpath('//div[@class="search-result-hospital-text"]/'
                                                  'p[4]/text()').extract_first('')
                hospital_phone = response.xpath('//div[@class="search-result-hospital-text"]/'
                                                'p[3]/text()').extract_first('')
                # If the "phone" paragraph holds no long digit run and there is
                # no address, treat that paragraph as the address instead.
                check_phone = re.search(r'(\d{6,})', hospital_phone)
                if not check_phone and not hospital_address:
                    hospital_address = hospital_phone
                    hospital_phone = ''
                df = transform([hospital_address])
                hospital_city = df.head()['市'][0]
                hospital_county = df.head()['区'][0]
                if hospital_pro in MUNICIPALITY2:
                    hospital_city = '{0}{1}'.format(hospital_pro, '市')
                    hospital_pro = ''
                else:
                    hospital_pro = '{0}{1}'.format(hospital_pro, '省')
                loader = CommonLoader2(item=HospitalInfoItem(), response=response)
                loader.add_xpath('hospital_name',
                                 '//span[@class="search-result-hospital-name"]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_level',
                                 '//div[@class="search-result-hospital-text"]/p[2]/text()',
                                 MapCompose(custom_remove_tags, clean_info2))
                loader.add_value('hospital_addr', hospital_address, MapCompose(custom_remove_tags))
                loader.add_value('hospital_pro', hospital_pro)
                loader.add_value('hospital_city', hospital_city)
                loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
                loader.add_value('hospital_phone', hospital_phone, MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_intro',
                                 '//li[@id="info"]/p',
                                 MapCompose(remove_tags, custom_remove_tags))
                loader.add_value('registered_channel', self.data_source_from)
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_xpath('hospital_route',
                                 '//div[@class="search-result-hospital-text"]/p[5]/text()',
                                 MapCompose(custom_remove_tags, match_special2))
                # Needs the leading '//': a bare 'div[...]' is evaluated
                # relative to the document root and never matches.
                loader.add_xpath('hospital_img_url', '//div[@class="search-result-hospital-img"]/img/@src')
                loader.add_value('hospital_tags', '1')
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                loader.add_value('hospital_id', hospital_id)
                hospital_item = loader.load_item()
                yield hospital_item

                # Department information: follow every first-level department
                # to collect its second-level departments.
                all_dept = response.xpath('//ul[@id="parent-list"]/li[@id]')
                for each_dept in all_dept:
                    each_dept_id = each_dept.xpath('@id').extract_first('')
                    each_dept_type = each_dept.xpath('div/span/text()').extract_first('')
                    self.headers['Referer'] = response.url
                    dept_link = self.dept_url.format(hospital_id, each_dept_id)
                    yield Request(dept_link,
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'hospital_id': hospital_id,
                                      'dept_type': each_dept_type
                                  })

                # Doctor information (first page of the doctor list).
                self.headers['Referer'] = response.url
                doctor_info_link = self.doctor_url.format(hospital_id, '1')
                yield Request(doctor_info_link,
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id
                              })
            elif data_type == '2':
                hospital_address = response.xpath('//p[@class="hospital-private-address-line fc-6"]'
                                                  '[contains(text(),"地址")]/text()').extract_first('')
                df = transform([hospital_address])
                hospital_city = df.head()['市'][0]
                hospital_county = df.head()['区'][0]
                if hospital_pro in MUNICIPALITY2:
                    hospital_city = '{0}{1}'.format(hospital_pro, '市')
                    hospital_pro = ''
                else:
                    hospital_pro = '{0}{1}'.format(hospital_pro, '省')
                loader = CommonLoader2(item=HospitalInfoItem(), response=response)
                loader.add_xpath('hospital_name',
                                 '//p[@class="hospital-private-content-tit"]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_value('hospital_addr', hospital_address, MapCompose(custom_remove_tags, match_special2))
                loader.add_value('hospital_pro', hospital_pro)
                loader.add_value('hospital_city', hospital_city)
                loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_phone',
                                 '//div[@class="search-result-hospital-text"]/p[3]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_intro',
                                 '//li[@id="info"]/p',
                                 MapCompose(remove_tags, custom_remove_tags))
                loader.add_value('registered_channel', self.data_source_from)
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_xpath('hospital_route',
                                 '//li[@id="address"]/p[3]/text()',
                                 MapCompose(custom_remove_tags, match_special2))
                loader.add_value('hospital_tags', '2')
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                loader.add_value('hospital_id', hospital_id)
                hospital_item = loader.load_item()
                yield hospital_item

                # Department information: every list entry after the first is
                # stored directly as a department item.
                all_dept = response.xpath('//ul[@id="parent-list"]/li[position()>1]')
                for each_dept in all_dept:
                    dept_id = each_dept.xpath('div/@id').extract_first('')
                    dept_name = each_dept.xpath('div/span/text()').extract_first('')
                    dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
                    dept_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from', self.data_source_from)
                    dept_loader.add_value('crawled_url', response.url)
                    dept_loader.add_value('update_time', now_day())
                    dept_loader.add_value('dept_id', dept_id.replace('subDepLi-', ''))
                    dept_loader.add_value('dept_url', response.url)
                    dept_loader.add_value('gmt_created', now_time())
                    dept_loader.add_value('gmt_modified', now_time())
                    dept_item = dept_loader.load_item()
                    yield dept_item

                    # Doctor information.
                    # NOTE(review): the URL does not depend on the department,
                    # so with the default duplicate filter presumably only the
                    # first department's request is crawled - confirm intent.
                    self.headers['Referer'] = response.url
                    doctor_info_link = self.doctor_url.format(hospital_id, '1')
                    yield Request(doctor_info_link,
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'hospital_id': hospital_id,
                                      'dept_name': dept_name
                                  })
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example #22
0
    def parse_hospital_info(self, response):
        """Yield the hospital item and one alias item per alternative name.

        Level, category and medical-insurance type are picked out of the tag
        texts next to the page title. Province comes from
        ``response.meta['hospital_pro']``; city and county are derived from
        the first ``;``-separated part of the address. Any exception raised
        while parsing is logged and swallowed.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取:[{}]医院详细信息>>>>>>'.format(hospital_name))
        try:
            def last_tag(pattern, text):
                """Return the last '|'-separated piece of the match, or None."""
                hit = re.search(pattern, text)
                if not hit:
                    return None
                return hit.group(1).split('|')[-1]

            # Level / category / insurance type from the title tags.
            tag_texts = response.xpath(
                '//div[@class="l"]/h2/span/i/text()').extract()
            tags = custom_remove_tags(remove_tags('|'.join(tag_texts)))
            level_name = category_name = insurance_name = None
            if tags:
                level_name = last_tag(r'(.*等|.*级|.*甲)', tags)
                category_name = last_tag(r'(.*?医院)',
                                         tags.replace('医保定点医院', ''))
                insurance_name = last_tag(r'(.*定点)', tags)

            # Province / city / county.
            province = response.meta.get('hospital_pro')
            city = county = None
            address_nodes = response.xpath(
                '//dt[contains(text(),"地址")]/ancestor::dl[1]/dd').extract()
            address_html = ''.join(address_nodes).replace('查看地图', '')
            hospital_address = custom_remove_tags(remove_tags(address_html))
            if province and hospital_address:
                first_address = match_special2(hospital_address.split(';')[0])
                if province in MUNICIPALITY2:
                    # Municipalities are stored as the city, with no province.
                    city, province = province, ''
                    bare_city = city.replace('市', '')
                    noise = '{}{}|{}'.format(bare_city, '市', bare_city)
                    county = get_county2(noise, first_address)
                else:
                    bare_province = province
                    province = '{0}{1}'.format(province, '省')
                    city = get_city(province, first_address)
                    if city:
                        bare_city = city.replace('市', '')
                        noise = '{}|{}|{}|{}'.format(
                            province, bare_province, city, bare_city)
                        county = get_county2(noise, first_address)

            # Public / private.
            ownership = custom_remove_tags(
                response.xpath(
                    '//li/b[contains(text(),"国营")]/text()').extract_first(''))
            if ownership == '国营':
                hospital_type = '公立'
            else:
                hospital_type = ''

            # Hospital item.
            strip_tags = MapCompose(custom_remove_tags)
            strip_html = MapCompose(remove_tags, custom_remove_tags)
            info = CommonLoader2(item=HospitalInfoItem(), response=response)
            info.add_xpath('hospital_name', '//div[@class="l"]/h2/text()',
                           strip_tags)
            info.add_value('hospital_level', level_name, strip_tags)
            info.add_value('hospital_type', hospital_type)
            info.add_value('hospital_category', category_name, strip_tags)
            info.add_value('hospital_addr', hospital_address, strip_tags)
            info.add_value('hospital_pro', province, strip_tags)
            info.add_value('hospital_city', city, strip_tags)
            info.add_value('hospital_county', county, strip_tags)
            info.add_xpath('hospital_phone',
                           '//dt[contains(text(),"电话")]/ancestor::dl[1]/dd',
                           strip_html)
            info.add_xpath(
                'hospital_intro',
                '//dt/strong[contains(text(),"简介")]/ancestor::dl[1]/dd',
                strip_html)
            info.add_value('medicare_type', insurance_name, strip_tags)
            info.add_value('dataSource_from', self.data_source_from)
            info.add_value('hospital_url', response.url)
            info.add_value('update_time', now_day())
            yield info.load_item()

            # Hospital aliases: comma-separated names under the title.
            alias_text = response.xpath(
                '//div[@class="l"]/p/text()').extract_first('')
            cleaned_alias = custom_remove_tags(alias_text) if alias_text else ''
            alias_names = cleaned_alias.split(',') if cleaned_alias else []
            for alias in alias_names:
                if alias == hospital_name:
                    continue
                alias_loader = CommonLoader2(item=HospitalAliasItem(),
                                             response=response)
                alias_loader.add_xpath('hospital_name',
                                       '//div[@class="l"]/h2/text()',
                                       MapCompose(custom_remove_tags))
                alias_loader.add_value(
                    'hospital_alias_name', alias,
                    MapCompose(custom_remove_tags, match_special))
                alias_loader.add_value('dataSource_from',
                                       self.data_source_from)
                alias_loader.add_value('update_time', now_day())
                yield alias_loader.load_item()
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example #23
0
 def parse_doctor_website(self, response):
     """Scrape a doctor's personal page and emit the related items.

     Yields, in order:

     * the doctor item loaded from the page;
     * a ``Request`` for the hospital introduction page (callback
       ``self.parse_hospital_detail``), only the first time the hospital id
       taken from the hospital link is seen (tracked in
       ``self.crawled_ids``);
     * a department item, only the first time the department id taken from
       the department link is seen (tracked in ``self.crawled_dept``).

     :param response: the response of the doctor's personal page.
     """
     self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')

     # The hospital and department anchors are reused for both the visible
     # text and the href, so name them once.
     hospital_anchor = '//div[@class="doctor-info"]/dl/dd[2]/a[1]'
     dept_anchor = '//div[@class="doctor-info"]/dl/dd[2]/a[2]'

     # Doctor information.
     loader = YiHuLoader(item=DoctorInfoItem(), response=response)
     loader.add_xpath('doctor_name', '//span[@class="c-f22 c-333"]/text()')
     loader.add_xpath('dept_name', dept_anchor + '/text()')
     loader.add_xpath('hospital_name', hospital_anchor + '/text()')
     loader.add_xpath('doctor_level',
                      '//div[@class="doctor-info"]/dl/dd[1]/text()')
     loader.add_xpath(
         'doctor_intro',
         '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()')
     loader.add_xpath('doctor_goodAt',
                      '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()')
     loader.add_value('update_time', now_day())
     yield loader.load_item()

     hos_link = response.xpath(hospital_anchor + '/@href').extract_first('')
     dept_link = response.xpath(dept_anchor + '/@href').extract_first('')

     # Hospital details: follow the introduction page once per hospital id.
     if hos_link:
         # The dot is escaped so that only a literal ".shtml" suffix matches.
         hos_match = re.search(r'/sc/(.*?)\.shtml', hos_link)
         hos_id = hos_match.group(1) if hos_match else None
         if hos_match and hos_id not in self.crawled_ids:
             self.crawled_ids.add(hos_id)
             hos_intro_link = hos_link.replace('/sc/', '/detail/')
             hos_con_link = hos_link.replace('/sc/', '/contact/')
             # The partially filled loader travels with the request so the
             # next callback can keep adding fields to the same item.
             hos_loader = YiHuLoader(item=HospitalInfoItem(),
                                     response=response)
             hos_loader.add_xpath('hospital_name',
                                  hospital_anchor + '/text()')
             hospital_detail_request = Request(
                 hos_intro_link,
                 headers=self.headers,
                 callback=self.parse_hospital_detail,
                 meta={
                     'loader': hos_loader,
                     'contact_hos_link': hos_con_link
                 })
             # NOTE(review): this stores the URL in the request's meta dict,
             # not as an HTTP header - confirm that is what the callback
             # expects.
             hospital_detail_request.meta['Referer'] = response.url
             yield hospital_detail_request

     # Department information: emit one item per department id.
     if dept_link:
         dept_match = re.search(r'/arrange/(.*?)\.shtml', dept_link)
         dept_id = dept_match.group(1) if dept_match else None
         if dept_match and dept_id not in self.crawled_dept:
             self.crawled_dept.add(dept_id)
             dept_loader = YiHuLoader(item=HospitalDepItem(),
                                      response=response)
             dept_loader.add_xpath('dept_name', dept_anchor + '/text()')
             dept_loader.add_xpath('hospital_name',
                                   hospital_anchor + '/text()')
             dept_loader.add_value('update_time', now_day())
             yield dept_loader.load_item()
Example #24
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page into hospital and department items.

        Yields one hospital item for the page, then, for every department
        entry found on it, one department item and - when the entry has both
        a link and a non-zero doctor count - a ``Request`` for the department
        page handled by ``self.parse_doctor_info``.

        ``hospital_name`` and ``hospital_id`` are read from ``response.meta``.
        Any exception raised while parsing is logged and swallowed.

        :param response: the response of the hospital detail page.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
        try:
            hospital_id = response.meta.get('hospital_id')

            img_src = response.xpath(
                '//div[@class="divLeft_Img"]/img/@src').extract_first('')
            hospital_img_url = urljoin(self.host, img_src) if img_src else ''

            hospital_address = response.xpath(
                '//li[contains(text(),"地址")]/text()').extract_first('')
            hospital_county = get_county2('中国|福建省|福建|厦门市|厦门',
                                          match_special2(hospital_address))

            # Hospital information.
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath(
                'hospital_name',
                '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_pro', '福建省')
            loader.add_value('hospital_city', '厦门市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_phone',
                             '//li[contains(text(),"电话")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_intro', '//div[@class="introduceSpan"]',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_official_website',
                             '//li[contains(text(),"官网")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_route',
                             '//li[contains(text(),"公交线路")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_img_url', hospital_img_url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            yield loader.load_item()

            # Department information.
            all_dept_info = response.xpath(
                '//div[@class="medicineOne"]|//div[@class="medicineTwo"]')

            # This lookup runs against the whole response and does not depend
            # on the department being processed, so evaluate it once instead
            # of once per department.
            dept_info = ''.join(
                response.xpath(
                    '//p[contains(text(),"科室简介")]/ancestor::tr[1]').
                extract())

            for each_dept_info in all_dept_info:
                dept_type = each_dept_info.xpath(
                    'div[1]/span/text()').extract_first('')
                for each_dept_name in each_dept_info.xpath('div[2]/div[1]'):
                    dept_name = each_dept_name.xpath(
                        'a/text()').extract_first('')
                    dept_link = each_dept_name.xpath(
                        'a/@href').extract_first('')
                    doctor_num_of_dept = each_dept_name.xpath(
                        'span/text()').extract_first('')

                    # Number of doctors in the department: first run of
                    # digits in the span text, or None when there is none
                    # (an empty string simply fails to match).
                    count_match = re.search(r'(\d+)', doctor_num_of_dept)
                    dept_person_num = (int(count_match.group(1))
                                       if count_match else None)

                    dept_url = urljoin(self.host, dept_link)

                    # Department details.
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value(
                        'dept_info', dept_info,
                        MapCompose(remove_tags, custom_remove_tags,
                                   match_special2))
                    dept_loader.add_value('crawled_url', response.url)
                    dept_loader.add_value('update_time', now_day())
                    dept_loader.add_value('dept_id', dept_link,
                                          MapCompose(match_special2))
                    dept_loader.add_value('hospital_id', hospital_id)
                    dept_loader.add_value('dept_person_num', dept_person_num)
                    dept_loader.add_value('dept_url', dept_url)
                    dept_loader.add_value('gmt_created', now_time())
                    dept_loader.add_value('gmt_modified', now_time())
                    yield dept_loader.load_item()

                    # Doctor information: only follow departments that have a
                    # link and report at least one doctor.
                    if dept_link and dept_person_num:
                        self.headers['Referer'] = response.url
                        yield Request(dept_url,
                                      headers=self.headers,
                                      callback=self.parse_doctor_info,
                                      dont_filter=True,
                                      meta={
                                          'hospital_name': hospital_name,
                                          'dept_name': dept_name,
                                      })
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))