def parse(self, response):
    """Parse the hospital list JSON and yield items plus department requests.

    The response text is decoded as a JSON sequence of hospital dicts. For
    every processed entry a ``HospitalInfoItem`` is yielded; when the entry
    has a ``hospitalid`` a ``FormRequest`` to ``self.dept_link`` follows,
    handled by ``self.parse_dept_info``.

    :param response: response whose body is the JSON hospital list.
    :return: generator of hospital items and department form requests.
    """
    self.logger.info('>>>>>>正在抓取医院相关信息……')
    hospital_info = json.loads(response.text)
    # NOTE(review): the [3:4] slice limits processing to the entry at index 3;
    # this looks like a debugging leftover - confirm before widening it.
    for each_hospital in hospital_info[3:4]:
        medicare_flag = str(each_hospital.get('Ismedicalcard', ''))
        is_medicare = '是' if medicare_flag == '1' else '否'

        loader = CommonLoader(item=HospitalInfoItem(), response=response)
        loader.add_value('hospital_name', each_hospital.get('hospitalname', ''))
        loader.add_value('hospital_level', each_hospital.get('levelName', ''))
        loader.add_value('hospital_addr', each_hospital.get('address', ''))
        loader.add_value('hospital_pro', '四川')
        loader.add_value('hospital_city', each_hospital.get('areaName', ''))
        loader.add_value('is_medicare', is_medicare)
        loader.add_value('dataSource_from', self.source_from)
        loader.add_value('update_time', now_day())
        yield loader.load_item()

        hospital_id = each_hospital.get('hospitalid')
        if hospital_id:
            # The headers mapping is copied when the request is constructed,
            # so the Referer must be set first; assigning it afterwards (as
            # the previous code did) left this request without the header.
            self.headers['Referer'] = 'http://www.scgh114.com/web/register/gh'
            yield FormRequest(self.dept_link,
                              headers=self.headers,
                              callback=self.parse_dept_info,
                              formdata={'hospitalId': str(hospital_id)},
                              dont_filter=True)
def parse(self, response):
    """Yield the hospital info item for this spider's hospital.

    All fields except ``hospital_intro`` are fixed values; the introduction
    is extracted from the page with XPath. ``is_medicare`` and
    ``medicare_type`` are not populated here.

    :param response: response of the hospital introduction page.
    :return: generator yielding a single loaded ``HospitalInfoItem``.
    """
    message = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
    self.logger.info(message)

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fixed_fields = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour',
         '急诊和临床住院科室_24小时值班;行政及其它_上午8:00~12:00,下午14:00~17:00'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '中医医院'),
        ('hospital_addr', '四川省彭州市天彭镇南大街396号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '彭州市'),
        ('hospital_county', ''),
        ('hospital_phone', '028-83701908'),
    )
    for field_name, field_value in fixed_fields:
        info_loader.add_value(field_name, field_value)

    intro_cleaner = MapCompose(remove_tags, custom_remove_tags, clean_info)
    info_loader.add_xpath('hospital_intro',
                          '//div[@id="about-right-b"]',
                          intro_cleaner)

    info_loader.add_value('registered_channel', '官网')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()
def parse(self, response):
    """Yield the hospital info item for this spider's hospital.

    Every field other than ``hospital_intro`` is a fixed value; the
    introduction is taken from the page via XPath.

    :param response: response of the hospital introduction page.
    :return: generator yielding a single loaded ``HospitalInfoItem``.
    """
    message = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
    self.logger.info(message)

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fields_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '门诊上午_8:00-12:00;门诊下午_14:00-17:30'),
        ('hospital_level', '二级甲等'),
        ('hospital_type', '公立'),
        ('hospital_category', '综合医院'),
        ('hospital_addr', '成都市东三环龙泉驿区十陵街道江华路8号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', '龙泉驿区'),
        ('hospital_phone',
         '急救电话_028-84615120;电话咨询_028-84604546转科室;'
         '24小时医护热线_028-84615789'),
    )
    for field_name, field_value in fields_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath('hospital_intro',
                          '//article[@class="content"]/div',
                          MapCompose(remove_tags, custom_remove_tags))

    fields_after_intro = (
        ('is_medicare', '是'),
        ('medicare_type', '成都市医保、工伤保险定点医院'),
        ('registered_channel', '官网或官方微信公众号(工作日),法定节假日电话预约'),
        ('dataSource_from', '医院官网'),
    )
    for field_name, field_value in fields_after_intro:
        info_loader.add_value(field_name, field_value)
    info_loader.add_value('update_time', now_day())

    yield info_loader.load_item()
def parse(self, response):
    """Yield the hospital info item for this spider's hospital.

    Fixed values are loaded for all fields except ``hospital_intro``, which
    is extracted from the page via XPath. ``medicare_type`` is not populated.

    :param response: response of the hospital introduction page.
    :return: generator yielding a single loaded ``HospitalInfoItem``.
    """
    message = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
    self.logger.info(message)

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fields_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '上午8:00—12:00;下午2:00—5:30'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '妇幼保健院'),
        ('hospital_addr', '四川省成都市双流区东升街道涧槽中街396号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', ''),
        ('hospital_phone',
         '母婴咨询热线_028-85884888(工作日);'
         '总值班电话_028-85808438;'
         '预约挂号电话_028-85801029(7:30-19:30)'),
    )
    for field_name, field_value in fields_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath('hospital_intro',
                          '//div[@class="describe htmledit"]',
                          MapCompose(remove_tags, custom_remove_tags))

    fields_after_intro = (
        ('is_medicare', '是'),
        ('registered_channel',
         '电话预约;自助挂号机;诊室预约;医院微信公众号;健康双流;现场'),
        ('dataSource_from', '医院官网'),
    )
    for field_name, field_value in fields_after_intro:
        info_loader.add_value(field_name, field_value)
    info_loader.add_value('update_time', now_day())

    yield info_loader.load_item()
def parse(self, response):
    """Yield the hospital info item, then request the department page.

    All fields but ``hospital_intro`` are fixed values (several deliberately
    passed as empty strings); ``hospital_id`` and ``hospital_addr`` are not
    populated. Afterwards a request to ``self.dep_link`` is yielded with the
    current URL stored under the ``Referer`` key of its ``meta``.

    :param response: response of the hospital introduction page.
    :return: generator of one ``HospitalInfoItem`` and one ``Request``.
    """
    self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))

    info_loader = MedicalMapLoader(item=HospitalInfoItem(), response=response)
    fields_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '普通门诊上午_8:00-12:00;普通门诊下午13:00-16:30'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '综合医院'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', '金堂县'),
        ('hospital_phone',
         '医院服务电话_028-84902884;服务质量监督投诉电话_028-84932532;'
         '急诊急救电话_18181938532;产科急救电话_18181938532;'
         '医保结算电话_028-84932721;'
         '预约挂号电话_028-84931443;预约挂号电话_028-84902884;预约挂号电话_028-61568616'),
    )
    for field_name, field_value in fields_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath(
        'hospital_intro',
        '//div[@class="baseRight-intro"]/p[position()<7]/span/text()')

    fields_after_intro = (
        ('is_medicare', ''),
        ('medicare_type', ''),
        ('vaccine_name', ''),
        ('is_cpc', ''),
        ('is_bdc', ''),
        ('cooperative_business', ''),
        ('hospital_district', ''),
        ('registered_channel', '微信公众号_' + self.hospital_name),
        ('dataSource_from', '官网:http://www.jintangyy.com/index.aspx'),
    )
    for field_name, field_value in fields_after_intro:
        info_loader.add_value(field_name, field_value)
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()

    # The referring URL is carried in the request meta, not in its headers.
    yield Request(self.dep_link,
                  callback=self.parse_hospital_dep,
                  meta={'Referer': response.url})
def parse(self, response):
    """Yield the hospital info item for this spider's hospital.

    Fixed values are used for everything except ``hospital_intro``, which is
    extracted via XPath. ``is_medicare`` and ``medicare_type`` are not
    populated.

    :param response: response of the hospital introduction page.
    :return: generator yielding a single loaded ``HospitalInfoItem``.
    """
    message = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
    self.logger.info(message)

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fields_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', ''),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '妇幼保健院'),
        ('hospital_addr', '成都市温江区万春路140'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', ''),
        ('hospital_phone',
         '24小时急救电话_028-82723131;咨询电话_围产期保健_028-82715727;'
         '咨询电话_妇科门诊_028-82711383;咨询电话_儿童保健_028-82711527;'
         '咨询电话_婚检科_028-82720337;'
         '投诉电话_028-82724901(上班时间);投诉电话_13688488598(下班时间)'),
    )
    for field_name, field_value in fields_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath('hospital_intro',
                          '//div[@id="info_txt"]',
                          MapCompose(remove_tags, custom_remove_tags))

    info_loader.add_value('registered_channel', '电话预约;挂号窗口;医院微信公众号')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()
def parse(self, response):
    """Walk the regional hospital list and schedule detail/department requests.

    For each list entry a ``YiHuLoader`` is primed with the hospital name and
    level. When the entry has a link, two requests are yielded:

    * the hospital detail page (``/sc/`` replaced by ``/detail/``), carrying
      the partially filled loader and the contact page URL (``/contact/``)
      in ``meta``, handled by ``self.parse_hospital_detail``;
    * the original link, handled by ``self.parse_hospital_dep``.

    :param response: response of the regional hospital list page.
    :return: generator of ``Request`` objects.
    """
    all_hospital_links = response.xpath(
        '//div[@class="c-hidden disen-list-hos c-f12"]/ul/li')
    self.logger.info('该地区共{}家医院'.format(len(all_hospital_links)))

    for each_hospital_link in all_hospital_links:
        loader = YiHuLoader(item=HospitalInfoItem(),
                            selector=each_hospital_link)
        loader.add_xpath('hospital_name', 'a/text()')
        loader.add_xpath('hospital_level', 'span/text()',
                         MapCompose(remove_number2))

        hospital_link = each_hospital_link.xpath('a/@href').extract_first('')
        if not hospital_link:
            continue

        # Headers are copied when a Request is constructed, so the Referer
        # has to be set before building the requests; the previous code set
        # it afterwards, which left the first requests without it.
        self.headers['Referer'] = response.url

        # Hospital information
        hospital_detail_link = hospital_link.replace('/sc/', '/detail/')
        contact_hos_link = hospital_link.replace('/sc/', '/contact/')
        yield Request(hospital_detail_link,
                      headers=self.headers,
                      callback=self.parse_hospital_detail,
                      meta={'loader': loader,
                            'contact_hos_link': contact_hos_link})

        # Hospital department information
        yield Request(hospital_link,
                      headers=self.headers,
                      callback=self.parse_hospital_dep)
def parse_hospital_info(self, response):
    """Yield the hospital item of a detail page, then its department requests.

    The county is derived from the address text through ``get_county2``;
    the hospital level comes from ``response.meta``. Any exception raised
    while extracting is logged and ends the generator.

    :param response: response of a hospital detail page.
    :return: generator of one ``HospitalInfoItem`` followed by ``Request``
        objects handled by ``self.parse_hospital_dep_detail``.
    """
    self.logger.info('>>>>>>正在抓取:医院信息>>>>>>')
    info_box = '//div[@class="yy_js clearfix"]/div/dl'
    address_xpath = info_box + '/dd[1]/text()'
    phone_xpath = info_box + '/dd[2]/text()'
    try:
        # Work out the district/county from the address line.
        hospital_address = response.xpath(address_xpath).extract_first('')
        hospital_county = (
            get_county2('中国|江苏省|江苏|南京市|南京', hospital_address)
            if hospital_address else None
        )

        # Hospital information
        info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        info_loader.add_xpath('hospital_name',
                              '//div[@class="yy_til"]/h2/text()',
                              MapCompose(custom_remove_tags))
        info_loader.add_value('hospital_level',
                              response.meta.get('hospital_level'),
                              MapCompose(custom_remove_tags, clean_info))
        info_loader.add_xpath('hospital_addr', address_xpath,
                              MapCompose(custom_remove_tags))
        info_loader.add_value('hospital_pro', '江苏省')
        info_loader.add_value('hospital_city', '南京市')
        info_loader.add_value('hospital_county', hospital_county)
        info_loader.add_xpath('hospital_phone', phone_xpath,
                              MapCompose(custom_remove_tags))
        info_loader.add_xpath('hospital_intro',
                              '//dd[@id="wrap"]',
                              MapCompose(remove_tags, custom_remove_tags))
        info_loader.add_value('registered_channel', self.data_source_from)
        info_loader.add_value('dataSource_from', self.data_source_from)
        info_loader.add_value('hospital_url', response.url)
        info_loader.add_value('update_time', now_day())
        yield info_loader.load_item()

        # Department information
        dept_hrefs = response.xpath(
            '//dl[@class="kfyy clearfix"]/dd/span/a/@href').extract()
        for href in dept_hrefs:
            # Drop the ";jsessionid=..." segment that precedes the query.
            clean_href = re.sub(r';jsessionid=(.*?)\?', '?', href)
            dept_link = urljoin(self.host, clean_href)
            self.headers['Referer'] = response.url
            yield Request(dept_link,
                          headers=self.headers,
                          callback=self.parse_hospital_dep_detail)
    except Exception as e:
        self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
def parse(self, response):
    """Yield the hospital item, then department and doctor page requests.

    The hospital fields are fixed values except ``hospital_intro``. The
    navigation menu is then scanned: entries under items 5 and 6 are sent to
    ``self.parse_hospital_dep_detail`` and entries under item 8 to
    ``self.parse_doctor_info``. Each request carries the link text as
    ``dept_name`` and the current URL as ``Referer`` in its ``meta``.

    :param response: response of the hospital introduction page.
    :return: generator of one ``HospitalInfoItem`` and ``Request`` objects.
    """
    message = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
    self.logger.info(message)

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fields_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '门诊时间_8:30-17:30;急诊时间_7*24小时'),
        ('hospital_level', '二级甲等'),
        ('hospital_type', '公立'),
        ('hospital_category', '中医医院'),
        ('hospital_addr',
         '川化病区:青白江区化工北路41号(原川化医院);'
         '城厢病区:青白江区城厢镇大南街51号(中医血防医院);'
         '中医名医馆地址:中医医院川化病区一号楼1楼'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', '青白江区'),
        ('hospital_phone', '24小时急诊电话_028-83632835'),
    )
    for field_name, field_value in fields_before_intro:
        info_loader.add_value(field_name, field_value)
    info_loader.add_xpath('hospital_intro',
                          '//div[@class="right-about clearfix"]',
                          MapCompose(remove_tags))
    info_loader.add_value('registered_channel', '医院挂号室')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())

    # Hospital information
    yield info_loader.load_item()

    # Department pages first, doctor pages second.
    nav_sections = (
        ('//ul[@id="navul"]/li[5]/ul/li|//ul[@id="navul"]/li[6]/ul/li',
         self.parse_hospital_dep_detail),
        ('//ul[@id="navul"]/li[8]/ul/li',
         self.parse_doctor_info),
    )
    for nav_xpath, handler in nav_sections:
        for nav_entry in response.xpath(nav_xpath):
            href = nav_entry.xpath('a/@href').extract_first('')
            label = nav_entry.xpath('a/text()').extract_first('')
            if not (href and label):
                continue
            yield Request(urljoin(self.host, href),
                          headers=self.headers,
                          callback=handler,
                          meta={'dept_name': label,
                                'Referer': response.url})
def parse(self, response):
    """Yield the hospital item, then the department and doctor requests.

    Hospital fields are fixed values except ``hospital_intro``. One request
    goes to ``self.dept_link`` and one to each URL in ``self.doctor_links``;
    all are created with ``dont_filter=True`` and store a ``Referer`` value
    in their ``meta``.

    :param response: response of the hospital introduction page.
    :return: generator of one ``HospitalInfoItem`` and ``Request`` objects.
    """
    message = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
    self.logger.info(message)

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fields_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '门诊上午_8:00-12:00;门诊下午_14:00-17:30'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '中医医院'),
        ('hospital_addr', '四川省成都市郫都区南大街342号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', '郫都区'),
        ('hospital_phone',
         '咨询电话_028-87920858;急诊电话_028-87925131;预约挂号_028-87925042;'
         '投诉电话_028-87920858;人事招聘_028-87925158;医保出入院处_028-87925172;'
         '体验科_028-87922056'),
    )
    for field_name, field_value in fields_before_intro:
        info_loader.add_value(field_name, field_value)
    info_loader.add_xpath('hospital_intro',
                          '//div[@class="rightPanel"]/p[position()>2]',
                          MapCompose(remove_tags))
    info_loader.add_value('is_medicare', '是')
    info_loader.add_value('registered_channel', '官方微信公众号或华医通APP')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())

    # Hospital information
    yield info_loader.load_item()

    # Department information
    yield Request(self.dept_link,
                  headers=self.headers,
                  callback=self.parse_hospital_dep,
                  dont_filter=True,
                  meta={'Referer': self.entry_url})

    # Doctor information: each listing page is recorded as its own referer.
    for doctor_page in self.doctor_links:
        yield Request(doctor_page,
                      headers=self.headers,
                      callback=self.parse_doctor_info,
                      dont_filter=True,
                      meta={'Referer': doctor_page})
def parse(self, response):
    """Yield the hospital item, then request the department index page.

    Hospital fields are fixed values except ``hospital_intro``; the
    medicare, vaccine, cpc/bdc, cooperation, district and registration
    channel fields are not populated. A request to ``self.index_link``
    follows, handled by ``self.parse_hospital_dep``.

    :param response: response of the hospital introduction page.
    :return: generator of one ``HospitalInfoItem`` and one ``Request``.
    """
    self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))

    info_loader = PxfybjyLoader(item=HospitalInfoItem(), response=response)
    fixed_fields = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '门诊上午_8:00-12:00;门诊下午14:00-17:30'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '妇幼保健院'),
        ('hospital_addr', '郫都区郫筒街道新南街283号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', '郫都区'),
        ('hospital_phone',
         '产科急救_028-87922244;儿科急救_028-87931629;产检门诊_028-87924116;'
         '婚检_028-87885339;儿保科_028-87931911'),
    )
    for field_name, field_value in fixed_fields:
        info_loader.add_value(field_name, field_value)

    intro_xpath = (
        '//div[@class="FrontComContent_detail01-1468317290474_htmlbreak"]/'
        'p[position()<9]'
    )
    info_loader.add_xpath('hospital_intro', intro_xpath)
    info_loader.add_value('dataSource_from',
                          '官网:http://www.pxfybjy.cn/index.html')
    info_loader.add_value('update_time', now_day())

    # Hospital information
    yield info_loader.load_item()

    # Department information
    yield Request(self.index_link,
                  headers=self.headers,
                  callback=self.parse_hospital_dep,
                  meta={'Referer': response.url})
def parse_hospital_info(self, response):
    """Build and yield a hospital item from a JSON hospital detail response.

    ``HIS_LVL`` values ``'3'``, ``'2'`` and ``'1'`` are translated to level
    labels; any other value yields no level. The county is derived from
    ``HIS_AD`` through ``get_county2`` when an address is present. Any
    exception is logged and ends the generator.

    :param response: response whose text is a JSON object for one hospital.
    :return: generator yielding a single loaded ``HospitalInfoItem``.
    """
    self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
    try:
        # Hospital information
        detail = json.loads(response.text)

        # Hospital level: first code equal to the reported value wins.
        level_code = detail.get('HIS_LVL')
        level_labels = (('3', '三级'), ('2', '二级'), ('1', '一级'))
        hospital_level = next(
            (label for code, label in level_labels if level_code == code),
            None)

        # District or county the hospital is located in.
        hospital_address = detail.get('HIS_AD')
        hospital_county = (
            get_county2('中国|广东省|广东|广州市|广州', hospital_address)
            if hospital_address else None
        )

        info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        field_values = (
            ('hospital_name', detail.get('HIS_NM')),
            ('hospital_level', hospital_level),
            ('hospital_category', ''),
            ('hospital_addr', hospital_address),
            ('hospital_pro', '广东省'),
            ('hospital_city', '广州市'),
            ('hospital_county', hospital_county),
            ('hospital_phone', detail.get('TEL_NO')),
            ('hospital_intro', detail.get('HIS_RM')),
            ('registered_channel', self.data_source_from),
            ('dataSource_from', self.data_source_from),
            ('hospital_url', response.url),
        )
        for field_name, field_value in field_values:
            info_loader.add_value(field_name, field_value)
        info_loader.add_value('update_time', now_day())
        yield info_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse(self, response):
    """Yield the hospital info item for this spider's hospital.

    All fields are fixed values apart from ``hospital_intro``, which is
    extracted from the page via XPath.

    :param response: response of the hospital introduction page.
    :return: generator yielding a single loaded ``HospitalInfoItem``.
    """
    self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fixed_fields = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '8:00-17:00'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '中医医院'),
        ('hospital_addr', '成都市双流区东升街道花园路二段(新院),淳化街205号(老院)'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', ''),
        ('hospital_phone', '预约电话_028-69803716;体检咨询_028-85808932'),
    )
    for field_name, field_value in fixed_fields:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath('hospital_intro',
                          '//div[@class="text"]',
                          MapCompose(remove_tags, custom_remove_tags))
    info_loader.add_value('registered_channel', '实名制电话预约、现场预约、自助机预约、诊间预约')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())

    # Hospital information
    yield info_loader.load_item()
def parse_hospital_info(self, response):
    """Build and yield a hospital item from a labelled detail table.

    Each value is read from the table cell that contains a bold label
    (full name, level, address, phone, introduction). The county is derived
    from the address text via ``get_county2``. Any exception is logged and
    ends the generator.

    :param response: response of a hospital detail page.
    :return: generator yielding a single loaded ``HospitalInfoItem``.
    """
    self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
    try:
        address_nodes = response.xpath(
            '//b[contains(text(),"医院地址")]/ancestor::td[1]/text()')
        hospital_address = address_nodes.extract_first('')
        hospital_county = get_county2('中国|广东省|广东|珠海市|珠海',
                                      hospital_address)

        strip_tags = MapCompose(custom_remove_tags)
        info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        info_loader.add_xpath(
            'hospital_name',
            '//b[contains(text(),"医院全称")]/ancestor::td[1]/text()',
            strip_tags)
        info_loader.add_xpath(
            'hospital_level',
            '//b[contains(text(),"医院级别")]/ancestor::td[1]/text()',
            strip_tags)
        info_loader.add_value('hospital_addr', hospital_address, strip_tags)
        info_loader.add_value('hospital_pro', '广东省')
        info_loader.add_value('hospital_city', '珠海市')
        info_loader.add_value('hospital_county', hospital_county, strip_tags)
        info_loader.add_xpath(
            'hospital_phone',
            '//b[contains(text(),"联系电话")]/ancestor::td[1]/text()',
            strip_tags)
        info_loader.add_xpath(
            'hospital_intro',
            '//b[contains(text(),"简介")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        info_loader.add_value('registered_channel', self.data_source_from)
        info_loader.add_value('dataSource_from', self.data_source_from)
        info_loader.add_value('crawled_url', response.url)
        info_loader.add_value('update_time', now_day())
        yield info_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse(self, response):
    """Yield the hospital item, then a request for the first department link.

    Hospital fields are fixed values except ``hospital_intro``;
    ``medicare_type`` is not populated. Department links are read from the
    fourth entry of the drop-down menu, but only the first link is followed.

    :param response: response of the hospital introduction page.
    :return: generator of one ``HospitalInfoItem`` and at most one
        ``Request`` handled by ``self.parse_hospital_dep``.
    """
    message = '>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name)
    self.logger.info(message)

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fields_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '门诊时间(无假日医院)8:30-17:30'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '专科医院'),
        ('hospital_addr', '绵阳市经开区红塔路16号(绵阳市红星街97号)'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '绵阳市'),
        ('hospital_county', ''),
        ('hospital_phone', '0816-2265553;0816-2261517'),
    )
    for field_name, field_value in fields_before_intro:
        info_loader.add_value(field_name, field_value)
    info_loader.add_xpath('hospital_intro',
                          '//div[@class="content-left pull-left"]',
                          MapCompose(remove_tags, custom_remove_tags))
    info_loader.add_value('is_medicare', '是')
    info_loader.add_value('registered_channel', '现场预约(一楼挂号交费处);电话预约;官网')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()

    # Department information
    menu_entries = response.xpath(
        '//div[@class="ddsmoothmenu"]/ul/li[position()=4]/ul/li')
    # NOTE(review): the [0:1] slice follows only the first department link;
    # possibly a debugging leftover - confirm with the spider's owner.
    for menu_entry in menu_entries[0:1]:
        href = menu_entry.xpath('a/@href').extract_first('')
        label = menu_entry.xpath('a/text()').extract_first('')
        if not (href and label):
            continue
        yield Request(href,
                      headers=self.headers,
                      callback=self.parse_hospital_dep,
                      meta={'dept_name': label, 'Referer': response.url})
def parse(self, response):
    """Yield one hospital item per JSON entry plus its detail request.

    The response text is decoded as JSON and the entries under its ``list``
    key are iterated. Each entry produces a ``HospitalInfoItem``; entries
    with a ``hospitalid`` also produce a request to
    ``self.hospital_detail_url`` handled by ``self.parse_hospital_dep``.

    :param response: response whose text is the JSON hospital listing.
    :return: generator of hospital items and ``Request`` objects.
    """
    self.logger.info('>>>>>>正在抓取所有医院信息>>>>>>')
    payload = json.loads(response.text)
    strip_tags = MapCompose(custom_remove_tags)

    for record in payload.get('list'):
        hospital_name = record.get('hospitalname')
        hospital_address = record.get('address')
        hospital_county = get_county2('中国|广东省|广东|中山市|中山',
                                      hospital_address)

        info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        info_loader.add_value('hospital_name', hospital_name, strip_tags)
        info_loader.add_value('hospital_addr', hospital_address, strip_tags)
        info_loader.add_value('hospital_pro', '广东省')
        info_loader.add_value('hospital_city', '中山市')
        info_loader.add_value('hospital_county', hospital_county, strip_tags)
        info_loader.add_value('hospital_phone',
                              record.get('telephoneno'),
                              strip_tags)
        info_loader.add_value('hospital_intro',
                              record.get('information'),
                              strip_tags)
        info_loader.add_value('registered_channel', self.data_source_from)
        info_loader.add_value('dataSource_from', self.data_source_from)
        info_loader.add_value('hospital_url', response.url)
        info_loader.add_value('update_time', now_day())
        yield info_loader.load_item()

        # Department and doctor information
        hospital_id = record.get('hospitalid')
        if not hospital_id:
            continue
        self.headers['Referer'] = self.entry_url
        detail_url = self.hospital_detail_url.format(hospital_id)
        yield Request(detail_url,
                      headers=self.headers,
                      callback=self.parse_hospital_dep,
                      meta={'hospital_name': hospital_name})
def parse(self, response):
    """Yield the hospital item, then department and doctor requests.

    Hospital fields are fixed values except ``hospital_intro``.
    ``self.dept_link`` is requested twice with ``dont_filter=True``: once
    for ``self.parse_hospital_dep_detail`` with a fixed ``dept_name`` and
    once for ``self.parse_hospital_dep``. Every URL in
    ``self.doctor_link_list`` is then requested for
    ``self.parse_doctor_info``.

    :param response: response of the hospital introduction page.
    :return: generator of one ``HospitalInfoItem`` and ``Request`` objects.
    """
    self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))

    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    fixed_fields = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '上午_8:00-12:00;下午14:00-17:30'),
        ('hospital_level', '二级甲等'),
        ('hospital_type', '公立'),
        ('hospital_category', '综合医院'),
        ('hospital_addr', '四川都江堰市发展路89号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '都江堰市'),
        ('hospital_county', ''),
        ('hospital_phone',
         '急救电话_028-68963120;免费咨询电话_028-69219766;投诉电话_028-69263900 '),
    )
    for field_name, field_value in fixed_fields:
        info_loader.add_value(field_name, field_value)
    info_loader.add_xpath('hospital_intro', '//div[@class="fleft wd740"]')
    info_loader.add_value('registered_channel', '电话')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())

    # Hospital information
    yield info_loader.load_item()

    # Department information. A first version read the navigation sub-menu
    # (//div[@id="head1_ksdh"]/div/div/a), which lists only ten departments,
    # so this second version uses the department page instead.

    # Department shown on the default department page.
    yield Request(self.dept_link,
                  headers=self.headers,
                  callback=self.parse_hospital_dep_detail,
                  meta={'dept_name': '门诊部', 'Referer': response.url},
                  dont_filter=True)

    # Remaining departments linked from the default page.
    yield Request(self.dept_link,
                  headers=self.headers,
                  callback=self.parse_hospital_dep,
                  dont_filter=True,
                  meta={'Referer': response.url})

    # Doctor information; the site's own pagination is hard to follow, so
    # the listing URLs come from a prepared list.
    for doctor_page in self.doctor_link_list:
        yield Request(doctor_page,
                      headers=self.headers,
                      callback=self.parse_doctor_info,
                      meta={'Referer': response.url})
def parse_hospital_info(self, response):
    """Yield the hospital item, then Splash requests per department.

    The hospital item is assembled from the introduction box of the page.
    Each department link's ``onclick`` attribute is then parsed for the
    parameters of the department detail and doctor listing endpoints, and
    one ``SplashRequest`` is yielded for each endpoint.

    Links whose ``onclick`` lacks the department id, hospital id, department
    name or organisation name are skipped. A missing ``isSpTime`` or
    ``paymode`` is sent as an empty string.

    :param response: response of a hospital page; ``response.meta`` may
        carry ``hospital_name`` (used for logging only).
    :return: generator of one ``HospitalInfoItem`` and ``SplashRequest``
        objects.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医院信息和科室信息>>>>>>'.format(hospital_name))

    hospital_city = response.xpath(
        '//div[@class="jieshao_zi"]/p[4]/text()').extract()
    if hospital_city:
        hospital_address = custom_remove_tags(''.join(hospital_city))
        hospital_city2 = get_city(hospital_address)
        useless_info = '中国|湖南省|湖南|{}'.format(hospital_city2)
        hospital_county = get_county2(useless_info, hospital_address)
    else:
        hospital_county = None

    loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    loader.add_xpath('hospital_name',
                     '//div[@class="jieshao_zi"]/p/font/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('hospital_level',
                     '//div[@class="jieshao_zi"]/p[2]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('hospital_type', '公立')
    loader.add_value('hospital_category', '')
    loader.add_xpath('hospital_addr',
                     '//div[@class="jieshao_zi"]/p[4]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('hospital_pro', '湖南省')
    loader.add_xpath('hospital_city',
                     '//div[@class="jieshao_zi"]/p[4]/text()',
                     MapCompose(custom_remove_tags, get_city))
    loader.add_value('hospital_county', hospital_county)
    loader.add_xpath('hospital_phone',
                     '//div[@class="jieshao_zi"]/p[3]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('hospital_intro',
                     '//div[@id="starlist"]',
                     MapCompose(remove_tags, custom_remove_tags, clean_info))
    loader.add_value('registered_channel', self.data_source_from)
    loader.add_value('dataSource_from', self.data_source_from)
    loader.add_value('update_time', now_day())
    yield loader.load_item()

    # Department information
    self.logger.info('>>>>>>正在抓取[{}]科室信息>>>>>>'.format(hospital_name))
    for each_dept_link in response.xpath('//div[@class="xuanze_kslb"]'):
        dept_type = each_dept_link.xpath(
            'div[1]/ul/li/text()').extract_first('')
        for dept_link in each_dept_link.xpath('div[2]/ul/li/a'):
            data_info = dept_link.xpath('@onclick').extract_first('')
            if not data_info:
                continue
            # Remove all whitespace so the key:'value' patterns are contiguous.
            data_info = ''.join(re.findall(r'\S+', data_info))
            is_sp_time = re.search(r'isSpTime:\'(.*?)\'', data_info)
            pay_mode = re.search(r'paymode:\'(.*?)\'', data_info)
            dept_id = re.search(r'platformDeptId:\'(.*?)\'', data_info)
            hos_id = re.search(r'platformHosId:\'(.*?)\'', data_info, S)
            dept_name = re.search(r'tempDeptName:\'(.*?)\'', data_info, S)
            org_name = re.search(r'orgname:\'(.*?)\'', data_info, S)
            if not (dept_id and hos_id and dept_name and org_name):
                continue

            # isSpTime and paymode are not part of the guard above; calling
            # .group() on a failed match raised AttributeError and aborted
            # the whole callback, so fall back to an empty value instead.
            is_sp_time = is_sp_time.group(1) if is_sp_time else ''
            pay_mode = pay_mode.group(1) if pay_mode else ''
            dept_id = dept_id.group(1)
            hos_id = hos_id.group(1)
            dept_name = dept_name.group(1)
            org_name = org_name.group(1)
            request_meta = {
                'dept_type': dept_type,
                'dept_name': dept_name,
                'hospital_name': org_name
            }

            data = {
                'isSpTime': str(is_sp_time),
                'paymode': quote(pay_mode),
                'doctorCollectResult': '',
                'platformDeptId': str(dept_id),
                'orgname': quote(org_name),
                'tempDeptName': quote(dept_name),
                'platformHosId': str(hos_id),
                'platformDoctorId': ''
            }
            self.headers.update({
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'http://www.hnyygh.com',
                'Referer': 'http://www.hnyygh.com/searchDeptmentAction.action',
                'Pragma': 'no-cache'
            })
            # Pass a snapshot: the Splash args keep a reference to the dict
            # they are given, and self.headers is mutated again below for
            # the doctor request, which would otherwise rewrite the Referer
            # of this already-queued department request.
            dept_headers = dict(self.headers)
            splash_args = {
                'url': self.dept_detail_url,
                'headers': dept_headers,
                'lua_source': self.dept_script,
                'data': data
            }
            yield SplashRequest(self.dept_detail_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=dept_headers,
                                callback=self.parse_hospital_dep_detail,
                                meta=request_meta)

            # Doctor information
            data = {
                'platformDeptId': dept_id,
                'platformHosId': hos_id,
                'platformDoctorId': '',
                'nextNumInfo': '0'
            }
            self.headers.update({
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'http://www.hnyygh.com',
                'Referer': 'http://www.hnyygh.com/searchOrderNumInfoAction.action'
            })
            doctor_headers = dict(self.headers)
            splash_args = {
                'url': self.doctor_url,
                'headers': doctor_headers,
                'lua_source': self.dept_script,
                'data': data
            }
            yield SplashRequest(self.doctor_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=doctor_headers,
                                callback=self.parse_doctor_info,
                                meta={
                                    'dept_type': dept_type,
                                    'dept_name': dept_name,
                                    'dept_id': dept_id,
                                    'hospital_name': org_name
                                })
def parse_hospital_info(self, response):
    """Collect hospital fields, follow the intro page, and yield departments.

    The header paragraph is split on ``':'`` to obtain the level text and
    the county. The level text is further split by regex into a level
    (``h_l``) and a category (``h_c``). The partially filled hospital loader
    is handed to ``self.parse_hospital_detail_info`` through the request
    ``meta`` when an introduction link exists; otherwise the hospital item
    is yielded directly so the record is not lost. One ``HospitalDepItem``
    is yielded per department link. Any exception is logged and ends the
    generator.

    :param response: response of a hospital page.
    :return: generator of requests, hospital items and department items.
    """
    self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
    try:
        # Hospital information: level and region come from one paragraph.
        hospital_info = response.xpath(
            '//p[@class="yygh_box_top_p2"]').extract()
        hospital_info2 = custom_remove_tags(
            remove_tags(''.join(hospital_info)))
        info_parts = hospital_info2.split(':')
        hospital_level = info_parts[1].replace('区域', '')
        hospital_county = info_parts[2].replace('分类', '')

        h_l = h_c = None
        if hospital_level:
            res = re.search(r'(.*等|.*级|.*合格|.*甲)(.*?)$', hospital_level)
            if res:
                h_l = res.group(1)
                h_c = res.group(2)
                if h_c:
                    # Normalise the category so it ends with a single "医院";
                    # when nothing remains the raw text is kept unchanged.
                    h_c_2 = re.sub(r'合格|医院', '', h_c)
                    if h_c_2:
                        h_c = '{0}{1}'.format(h_c_2, '医院')

        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath('hospital_name',
                         '//p[@class="yygh_box_top_p"]/strong/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_level', h_l,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_category', h_c)
        loader.add_xpath(
            'hospital_addr',
            '//span[@class="yygh_box_con_dl_span1"]/ancestor::dl[1]/dd[1]/p/text()',
            MapCompose(custom_remove_tags))
        loader.add_value('hospital_pro', '')
        loader.add_value('hospital_city', '北京市')
        loader.add_value('hospital_county', hospital_county,
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'hospital_phone',
            '//span[@class="yygh_box_con_dl_span3"]/ancestor::dl[1]/dd[1]/p/text()',
            MapCompose(custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('hospital_url', response.url)
        loader.add_value('update_time', now_day())

        # Hospital introduction lives on a separate page.
        hospital_intro_link = response.xpath(
            '//a[contains(text(),"医院介绍")]/@href').extract_first('')
        if hospital_intro_link:
            hospital_intro_link = urljoin(self.host, hospital_intro_link)
            self.headers['Referer'] = response.url
            yield Request(hospital_intro_link,
                          headers=self.headers,
                          callback=self.parse_hospital_detail_info,
                          meta={'loader': loader})
        else:
            # Without an introduction link nothing else would ever load the
            # item, so emit what has been collected instead of dropping it.
            yield loader.load_item()

        # Department information
        all_dept_links = response.xpath('//div[@class="kfyuks_yyksbox"]')
        for each_dept_link in all_dept_links:
            dept_type = each_dept_link.xpath(
                'div[1]/text()').extract_first('')
            dept_info = each_dept_link.xpath(
                'div[2]/div/ul/li/a/text()').extract()
            for each_dept_info in dept_info:
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', each_dept_info,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_type', dept_type,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_xpath(
                    'hospital_name',
                    '//p[@class="yygh_box_top_p"]/strong/text()',
                    MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_info', '')
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value('update_time', now_day())
                yield dept_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Extract the hospital record and its departments from a detail page.

    Yields one ``HospitalInfoItem`` followed by one ``HospitalDepItem`` per
    department name listed on the page. Exceptions are logged, not raised.
    """
    self.logger.info('>>>>>>正在抓取:医院详细信息和科室信息>>>>>>')
    try:
        # ---- hospital record ----
        hospital_type = response.meta.get('hospital_type')
        if hospital_type:
            hospital_category = '{0}{1}'.format(hospital_type, '医院')
        else:
            hospital_category = None

        # Flatten the summary cell to plain text, then slice the labelled
        # fields out of it with get_hospital_info.
        summary_cells = response.xpath('//td[@class="title_yh14"]').extract()
        summary_text = custom_remove_tags(remove_tags(''.join(summary_cells)))

        address = get_hospital_info(summary_text, '地址:', '电话:')
        if address:
            address = address.replace('查看地图', '')
        else:
            address = None

        phone = get_hospital_info(summary_text, '电话:', '官网')

        intro = get_hospital_info(summary_text, '简介:', '$')
        if intro:
            intro = intro.replace('...更多>>', '')
        else:
            intro = None

        clean = MapCompose(custom_remove_tags)
        info_loader = CommonLoader2(item=HospitalInfoItem(),
                                    response=response)
        info_loader.add_xpath('hospital_name',
                              '//span[@class="title"]/text()', clean)
        info_loader.add_xpath('hospital_level',
                              '//span[@class="dj"]/text()', clean)
        info_loader.add_value('hospital_category', hospital_category)
        info_loader.add_value('hospital_addr', address, clean)
        info_loader.add_value('hospital_pro', '山西省')
        info_loader.add_xpath(
            'hospital_city',
            '//td[contains(text(),"山西")]/ancestor::tr[1]/td[1]/a[1]/text()',
            clean)
        info_loader.add_xpath(
            'hospital_county',
            '//td[contains(text(),"山西")]/ancestor::tr[1]/td[1]/a[2]/text()',
            clean)
        info_loader.add_value('hospital_phone', phone, clean)
        info_loader.add_value('hospital_intro', intro, clean)
        info_loader.add_value('registered_channel', self.data_source_from)
        info_loader.add_value('dataSource_from', self.data_source_from)
        info_loader.add_value('update_time', now_day())
        yield info_loader.load_item()

        # ---- departments ----
        self.logger.info('>>>>>>正在抓取科室详细信息>>>>>>')
        for dept_row in response.xpath('//tr[@class="h_bottom"]'):
            # First cell: department type; second cell: the department names.
            row_type = dept_row.xpath('td[1]/text()').extract_first('')
            row_names = dept_row.xpath('td[2]/table/tr/td/a/text()').extract()
            for name in row_names:
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value(
                    'dept_name', name,
                    MapCompose(custom_remove_tags, match_special2))
                dept_loader.add_value(
                    'dept_type', row_type,
                    MapCompose(custom_remove_tags, match_special2))
                dept_loader.add_xpath('hospital_name',
                                      '//span[@class="title"]/text()',
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_info', '')
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value('update_time', now_day())
                yield dept_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Parse a hospital detail page; the layout is chosen by ``meta['data_type']``.

    Reads ``hospital_name``, ``hospital_id``, ``data_type`` and
    ``province_name`` from ``response.meta``.

    * ``data_type == '1'``: yields the hospital item, one ``Request`` per
      first-level department (handled by ``parse_hospital_dep``) and one
      ``Request`` for the first doctor-list page.
    * ``data_type == '2'``: yields the hospital item, one ``HospitalDepItem``
      per listed department and one ``Request`` for the first doctor-list
      page.
    * any other value: yields nothing.

    Exceptions raised while parsing are logged and swallowed.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医院详细信息>>>>>>'.format(hospital_name))

    def _resolve_region(address, province):
        """Return ``(province, city, county)`` for an address string.

        City and county are read from the first row of ``transform``'s
        result (assumed to expose '市' and '区' columns — TODO confirm).
        For a municipality the province name becomes the city and the
        province is left empty; otherwise '省' is appended to the province.
        """
        first_rows = transform([address]).head()
        city = first_rows['市'][0]
        county = first_rows['区'][0]
        if province in MUNICIPALITY2:
            return '', '{0}{1}'.format(province, '市'), county
        return '{0}{1}'.format(province, '省'), city, county

    try:
        hospital_id = response.meta.get('hospital_id')
        data_type = response.meta.get('data_type')
        hospital_pro = response.meta.get('province_name')
        if data_type == '1':
            # Was '///div[...]': a malformed XPath (three slashes).
            hospital_address = response.xpath(
                '//div[@class="search-result-hospital-text"]/'
                'p[4]/text()').extract_first('')
            hospital_phone = response.xpath(
                '//div[@class="search-result-hospital-text"]/'
                'p[3]/text()').extract_first('')
            # Raw string: '\d' in a plain literal is an invalid escape.
            check_phone = re.search(r'(\d{6,})', hospital_phone)
            if not check_phone and not hospital_address:
                # The "phone" paragraph holds no 6+ digit run and no address
                # was found, so treat that paragraph as the address.
                hospital_address = hospital_phone
                hospital_phone = ''
            hospital_pro, hospital_city, hospital_county = _resolve_region(
                hospital_address, hospital_pro)

            loader = CommonLoader2(item=HospitalInfoItem(),
                                   response=response)
            loader.add_xpath(
                'hospital_name',
                '//span[@class="search-result-hospital-name"]/text()',
                MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_level',
                '//div[@class="search-result-hospital-text"]/p[2]/text()',
                MapCompose(custom_remove_tags, clean_info2))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', hospital_pro)
            loader.add_value('hospital_city', hospital_city)
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_phone', hospital_phone,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_intro', '//li[@id="info"]/p',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath(
                'hospital_route',
                '//div[@class="search-result-hospital-text"]/p[5]/text()',
                MapCompose(custom_remove_tags, match_special2))
            # Was a relative path ('div[...]'), which is evaluated against
            # the document root and therefore never matched the image.
            loader.add_xpath(
                'hospital_img_url',
                '//div[@class="search-result-hospital-img"]/img/@src')
            loader.add_value('hospital_tags', '1')
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            hospital_item = loader.load_item()
            yield hospital_item

            # Departments: follow each first-level department to collect
            # its second-level departments.
            all_dept = response.xpath('//ul[@id="parent-list"]/li[@id]')
            for each_dept in all_dept:
                each_dept_id = each_dept.xpath('@id').extract_first('')
                each_dept_type = each_dept.xpath(
                    'div/span/text()').extract_first('')
                self.headers['Referer'] = response.url
                dept_link = self.dept_url.format(hospital_id, each_dept_id)
                yield Request(dept_link,
                              headers=self.headers,
                              callback=self.parse_hospital_dep,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id,
                                  'dept_type': each_dept_type
                              })

            # Doctors: start from page '1' of the hospital's doctor list.
            self.headers['Referer'] = response.url
            doctor_info_link = self.doctor_url.format(hospital_id, '1')
            yield Request(doctor_info_link,
                          headers=self.headers,
                          callback=self.parse_doctor_info,
                          meta={
                              'hospital_name': hospital_name,
                              'hospital_id': hospital_id
                          })
        elif data_type == '2':
            hospital_address = response.xpath(
                '//p[@class="hospital-private-address-line fc-6"]'
                '[contains(text(),"地址")]/text()').extract_first('')
            hospital_pro, hospital_city, hospital_county = _resolve_region(
                hospital_address, hospital_pro)

            loader = CommonLoader2(item=HospitalInfoItem(),
                                   response=response)
            loader.add_xpath(
                'hospital_name',
                '//p[@class="hospital-private-content-tit"]/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_pro', hospital_pro)
            loader.add_value('hospital_city', hospital_city)
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_phone',
                '//div[@class="search-result-hospital-text"]/p[3]/text()',
                MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_intro', '//li[@id="info"]/p',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_route',
                             '//li[@id="address"]/p[3]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_tags', '2')
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            hospital_item = loader.load_item()
            yield hospital_item

            # Departments are listed on this page; the first <li> is skipped.
            # dept_name is pre-initialised so the doctor request below is
            # still issued when the page lists no departments (previously
            # an UnboundLocalError aborted it).
            dept_name = ''
            all_dept = response.xpath(
                '//ul[@id="parent-list"]/li[position()>1]')
            for each_dept in all_dept:
                dept_id = each_dept.xpath('div/@id').extract_first('')
                dept_name = each_dept.xpath(
                    'div/span/text()').extract_first('')
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', dept_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('hospital_name', hospital_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value('crawled_url', response.url)
                dept_loader.add_value('update_time', now_day())
                dept_loader.add_value('dept_id',
                                      dept_id.replace('subDepLi-', ''))
                dept_loader.add_value('dept_url', response.url)
                dept_loader.add_value('gmt_created', now_time())
                dept_loader.add_value('gmt_modified', now_time())
                dept_item = dept_loader.load_item()
                yield dept_item

            # Doctors: start from page '1' of the hospital's doctor list.
            # NOTE(review): dept_name here is the last department seen in
            # the loop above — confirm that is what parse_doctor_info wants.
            self.headers['Referer'] = response.url
            doctor_info_link = self.doctor_url.format(hospital_id, '1')
            yield Request(doctor_info_link,
                          headers=self.headers,
                          callback=self.parse_doctor_info,
                          meta={
                              'hospital_name': hospital_name,
                              'hospital_id': hospital_id,
                              'dept_name': dept_name
                          })
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Build the hospital item and any alias items from a detail page.

    Reads ``hospital_name`` and ``hospital_pro`` from ``response.meta``.
    Yields one ``HospitalInfoItem`` and then one ``HospitalAliasItem`` for
    every alias on the page that differs from ``hospital_name``.
    Exceptions are logged, not raised.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取:[{}]医院详细信息>>>>>>'.format(hospital_name))
    try:
        # Level, category and medical-insurance tags share one list of
        # <i> labels; join them with '|' and pick each out by pattern.
        tag_texts = response.xpath(
            '//div[@class="l"]/h2/span/i/text()').extract()
        tags = custom_remove_tags(remove_tags('|'.join(tag_texts)))
        h_l = None
        h_c = None
        m_t = None
        if tags:
            # Level
            level_match = re.search(r'(.*等|.*级|.*甲)', tags)
            if level_match:
                h_l = level_match.group(1).split('|')[-1]
            # Category (the insurance tag also ends in '医院', so drop it first)
            category_match = re.search(r'(.*?医院)',
                                       tags.replace('医保定点医院', ''))
            if category_match:
                h_c = category_match.group(1).split('|')[-1]
            # Medical-insurance type
            insurance_match = re.search(r'(.*定点)', tags)
            if insurance_match:
                m_t = insurance_match.group(1).split('|')[-1]

        # Province / city / county
        hospital_pro = response.meta.get('hospital_pro')
        hospital_city = None
        hospital_county = None
        address_nodes = response.xpath(
            '//dt[contains(text(),"地址")]/ancestor::dl[1]/dd').extract()
        address_html = ''.join(address_nodes).replace('查看地图', '')
        hospital_address = custom_remove_tags(remove_tags(address_html))
        if hospital_pro and hospital_address:
            # Only the first ';'-separated address is used for matching.
            single_address = match_special2(hospital_address.split(';')[0])
            if hospital_pro in MUNICIPALITY2:
                # Municipality: the province value is the city.
                hospital_city = hospital_pro
                hospital_pro = ''
                hos_c = hospital_city.replace('市', '')
                useless_info = '{}{}|{}'.format(hos_c, '市', hos_c)
                hospital_county = get_county2(useless_info, single_address)
            else:
                hos_p = hospital_pro
                hospital_pro = '{0}{1}'.format(hospital_pro, '省')
                hospital_city = get_city(hospital_pro, single_address)
                if hospital_city:
                    hos_c = hospital_city.replace('市', '')
                    useless_info = '{}|{}|{}|{}'.format(
                        hospital_pro, hos_p, hospital_city, hos_c)
                    hospital_county = get_county2(useless_info,
                                                  single_address)

        # Public / private
        ownership = custom_remove_tags(
            response.xpath(
                '//li/b[contains(text(),"国营")]/text()').extract_first(''))
        if ownership == '国营':
            hospital_type = '公立'
        else:
            hospital_type = ''

        # Hospital item
        clean = MapCompose(custom_remove_tags)
        strip_and_clean = MapCompose(remove_tags, custom_remove_tags)
        info_loader = CommonLoader2(item=HospitalInfoItem(),
                                    response=response)
        info_loader.add_xpath('hospital_name',
                              '//div[@class="l"]/h2/text()', clean)
        info_loader.add_value('hospital_level', h_l, clean)
        info_loader.add_value('hospital_type', hospital_type)
        info_loader.add_value('hospital_category', h_c, clean)
        info_loader.add_value('hospital_addr', hospital_address, clean)
        info_loader.add_value('hospital_pro', hospital_pro, clean)
        info_loader.add_value('hospital_city', hospital_city, clean)
        info_loader.add_value('hospital_county', hospital_county, clean)
        info_loader.add_xpath(
            'hospital_phone',
            '//dt[contains(text(),"电话")]/ancestor::dl[1]/dd',
            strip_and_clean)
        info_loader.add_xpath(
            'hospital_intro',
            '//dt/strong[contains(text(),"简介")]/ancestor::dl[1]/dd',
            strip_and_clean)
        info_loader.add_value('medicare_type', m_t, clean)
        info_loader.add_value('dataSource_from', self.data_source_from)
        info_loader.add_value('hospital_url', response.url)
        info_loader.add_value('update_time', now_day())
        yield info_loader.load_item()

        # Hospital aliases: a comma-separated list in the header paragraph.
        raw_alias = response.xpath(
            '//div[@class="l"]/p/text()').extract_first('')
        alias_text = custom_remove_tags(raw_alias) if raw_alias else None
        if alias_text:
            for alias in alias_text.split(','):
                if alias == hospital_name:
                    continue
                alias_loader = CommonLoader2(item=HospitalAliasItem(),
                                             response=response)
                alias_loader.add_xpath('hospital_name',
                                       '//div[@class="l"]/h2/text()',
                                       MapCompose(custom_remove_tags))
                alias_loader.add_value(
                    'hospital_alias_name', alias,
                    MapCompose(custom_remove_tags, match_special))
                alias_loader.add_value('dataSource_from',
                                       self.data_source_from)
                alias_loader.add_value('update_time', now_day())
                yield alias_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_website(self, response):
    """Parse a doctor's personal page.

    Yields, in order:
      * a ``DoctorInfoItem`` for the doctor;
      * for a hospital id not yet in ``self.crawled_ids``, a ``Request`` to
        the hospital's detail page (handled by ``parse_hospital_detail``)
        whose meta carries a hospital loader, the contact-page link and the
        referring URL;
      * for a department id not yet in ``self.crawled_dept``, a
        ``HospitalDepItem``.
    """
    self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')

    # Doctor information
    loader = YiHuLoader(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name', '//span[@class="c-f22 c-333"]/text()')
    loader.add_xpath('dept_name',
                     '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()')
    loader.add_xpath('hospital_name',
                     '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
    loader.add_xpath('doctor_level',
                     '//div[@class="doctor-info"]/dl/dd[1]/text()')
    loader.add_xpath(
        'doctor_intro',
        '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()')
    loader.add_xpath('doctor_goodAt',
                     '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()')
    loader.add_value('update_time', now_day())
    doctor_item = loader.load_item()
    yield doctor_item

    # Links to the doctor's hospital and department
    hos_link = response.xpath(
        '//div[@class="doctor-info"]/dl/dd[2]/a[1]/@href').extract_first('')
    dept_link = response.xpath(
        '//div[@class="doctor-info"]/dl/dd[2]/a[2]/@href').extract_first('')

    # Hospital details: fetched once per hospital id.
    if hos_link:
        # '.' is escaped so only a literal '.shtml' suffix ends the id.
        hos_id = re.search(r'/sc/(.*?)\.shtml', hos_link)
        if hos_id and hos_id.group(1) not in self.crawled_ids:
            self.crawled_ids.add(hos_id.group(1))
            hos_intro_link = re.sub(r'/sc/', '/detail/', hos_link)
            hos_con_link = re.sub(r'/sc/', '/contact/', hos_link)
            hos_loader = YiHuLoader(item=HospitalInfoItem(),
                                    response=response)
            hos_loader.add_xpath(
                'hospital_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
            hospital_detail_request = Request(
                hos_intro_link,
                headers=self.headers,
                callback=self.parse_hospital_detail,
                meta={
                    'loader': hos_loader,
                    'contact_hos_link': hos_con_link
                })
            # The Referer was previously stored only in meta, so it was
            # never sent with the request. Set the real header; the meta
            # key is kept in case a later callback reads it.
            hospital_detail_request.headers['Referer'] = response.url
            hospital_detail_request.meta['Referer'] = response.url
            yield hospital_detail_request

    # Department record: stored once per department id.
    if dept_link:
        dept_link_id = re.search(r'/arrange/(.*?)\.shtml', dept_link)
        if dept_link_id and dept_link_id.group(1) not in self.crawled_dept:
            self.crawled_dept.add(dept_link_id.group(1))
            dept_loader = YiHuLoader(item=HospitalDepItem(),
                                     response=response)
            dept_loader.add_xpath(
                'dept_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()')
            dept_loader.add_xpath(
                'hospital_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
            dept_loader.add_value('update_time', now_day())
            dept_item = dept_loader.load_item()
            yield dept_item
def parse_hospital_info(self, response):
    """Parse a hospital page into hospital, department and doctor requests.

    Reads ``hospital_name`` and ``hospital_id`` from ``response.meta``.
    Yields one ``HospitalInfoItem``; then, for each department entry, one
    ``HospitalDepItem`` and — when the entry has both a link and a non-zero
    doctor count — a ``Request`` for its doctor list (handled by
    ``parse_doctor_info``). Exceptions are logged, not raised.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
    try:
        hospital_id = response.meta.get('hospital_id')
        hospital_img_url = response.xpath(
            '//div[@class="divLeft_Img"]/img/@src').extract_first('')
        hospital_img_url = urljoin(
            self.host, hospital_img_url) if hospital_img_url else ''
        hospital_address = response.xpath(
            '//li[contains(text(),"地址")]/text()').extract_first('')
        hospital_county = get_county2('中国|福建省|福建|厦门市|厦门',
                                      match_special2(hospital_address))
        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath(
            'hospital_name',
            '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
            MapCompose(custom_remove_tags))
        loader.add_value('hospital_addr', hospital_address,
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_pro', '福建省')
        loader.add_value('hospital_city', '厦门市')
        loader.add_value('hospital_county', hospital_county,
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_phone',
                         '//li[contains(text(),"电话")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_intro',
                         '//div[@class="introduceSpan"]',
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        loader.add_xpath('hospital_official_website',
                         '//li[contains(text(),"官网")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_route',
                         '//li[contains(text(),"公交线路")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_img_url', hospital_img_url)
        loader.add_value('gmt_created', now_time())
        loader.add_value('gmt_modified', now_time())
        loader.add_value('hospital_id', hospital_id)
        hospital_item = loader.load_item()
        yield hospital_item

        # Department information.
        # This XPath is page-wide and does not depend on the department,
        # so evaluate it once instead of once per department.
        dept_info = ''.join(
            response.xpath(
                '//p[contains(text(),"科室简介")]/ancestor::tr[1]').extract())

        all_dept_info = response.xpath(
            '//div[@class="medicineOne"]|//div[@class="medicineTwo"]')
        for each_dept_info in all_dept_info:
            dept_type = each_dept_info.xpath(
                'div[1]/span/text()').extract_first('')
            dept_names = each_dept_info.xpath('div[2]/div[1]')
            for each_dept_name in dept_names:
                dept_name = each_dept_name.xpath(
                    'a/text()').extract_first('')
                dept_link = each_dept_name.xpath(
                    'a/@href').extract_first('')
                doctor_num_of_dept = each_dept_name.xpath(
                    'span/text()').extract_first('')
                dept_url = urljoin(self.host, dept_link)

                # Number of doctors in the department: the first run of
                # digits in the counter text, or None when there is none.
                count_match = re.search(
                    r'(\d+)',
                    doctor_num_of_dept) if doctor_num_of_dept else None
                dept_person_num = int(
                    count_match.group(1)) if count_match else None

                # Department item
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', dept_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('hospital_name', hospital_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_type', dept_type,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value(
                    'dept_info', dept_info,
                    MapCompose(remove_tags, custom_remove_tags,
                               match_special2))
                dept_loader.add_value('crawled_url', response.url)
                dept_loader.add_value('update_time', now_day())
                dept_loader.add_value('dept_id', dept_link,
                                      MapCompose(match_special2))
                dept_loader.add_value('hospital_id', hospital_id)
                dept_loader.add_value('dept_person_num', dept_person_num)
                dept_loader.add_value('dept_url', dept_url)
                dept_loader.add_value('gmt_created', now_time())
                dept_loader.add_value('gmt_modified', now_time())
                dept_item = dept_loader.load_item()
                yield dept_item

                # Doctor information: only for departments that have a
                # link and report at least one doctor.
                if dept_link and dept_person_num:
                    self.headers['Referer'] = response.url
                    yield Request(dept_url,
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  dont_filter=True,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'dept_name': dept_name,
                                  })
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))