def parse_hospital_info(self, response):
    """Extract one hospital's details and follow its department links.

    Yields a single ``HospitalInfoItem`` populated from the page, then one
    ``Request`` per department link found on it; those requests are handled
    by ``self.parse_hospital_dep_detail``.  Province and city are hard-coded
    (Jiangsu / Nanjing); the county is derived from the address text through
    ``get_county2``.

    Any exception raised while parsing is logged via ``self.logger.error``
    and swallowed, so the callback never propagates errors.
    """
    self.logger.info('>>>>>>正在抓取:医院信息>>>>>>')
    try:
        address_xpath = (
            '//div[@class="yy_js clearfix"]/div/dl/dd[1]/text()')

        # District / county, resolved only when an address is present
        address_text = response.xpath(address_xpath).extract_first('')
        county = (get_county2('中国|江苏省|江苏|南京市|南京', address_text)
                  if address_text else None)

        # Hospital item
        item_loader = CommonLoader2(item=HospitalInfoItem(),
                                    response=response)
        item_loader.add_xpath('hospital_name',
                              '//div[@class="yy_til"]/h2/text()',
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_level',
                              response.meta.get('hospital_level'),
                              MapCompose(custom_remove_tags, clean_info))
        item_loader.add_xpath('hospital_addr',
                              address_xpath,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_pro', '江苏省')
        item_loader.add_value('hospital_city', '南京市')
        item_loader.add_value('hospital_county', county)
        item_loader.add_xpath(
            'hospital_phone',
            '//div[@class="yy_js clearfix"]/div/dl/dd[2]/text()',
            MapCompose(custom_remove_tags))
        item_loader.add_xpath('hospital_intro',
                              '//dd[@id="wrap"]',
                              MapCompose(remove_tags, custom_remove_tags))
        item_loader.add_value('registered_channel', self.data_source_from)
        item_loader.add_value('dataSource_from', self.data_source_from)
        item_loader.add_value('hospital_url', response.url)
        item_loader.add_value('update_time', now_day())
        yield item_loader.load_item()

        # Department pages: drop the ";jsessionid=..." part of each link
        # before resolving it against the site host.
        raw_links = response.xpath(
            '//dl[@class="kfyy clearfix"]/dd/span/a/@href').extract()
        for raw_link in raw_links:
            cleaned_link = re.sub(r';jsessionid=(.*?)\?', '?', raw_link)
            target_url = urljoin(self.host, cleaned_link)
            self.headers['Referer'] = response.url
            yield Request(target_url,
                          headers=self.headers,
                          callback=self.parse_hospital_dep_detail)
    except Exception as e:
        self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Build a hospital item from a JSON hospital-detail response.

    The response body is decoded with ``json.loads``.  The ``HIS_LVL``
    code ('3' / '2' / '1') is translated into its grade label, the county
    is derived from ``HIS_AD`` through ``get_county2`` when an address is
    present, and a single ``HospitalInfoItem`` is yielded.  Province and
    city are hard-coded (Guangdong / Guangzhou).

    Any exception raised while parsing is logged via ``self.logger.error``
    and swallowed.
    """
    self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
    try:
        detail = json.loads(response.text)

        # Grade code -> label; unknown codes map to None
        level_labels = (('3', '三级'), ('2', '二级'), ('1', '一级'))
        level_code = detail.get('HIS_LVL')
        level = next(
            (label for code, label in level_labels if level_code == code),
            None)

        # District / county, resolved only when an address is present
        address = detail.get('HIS_AD')
        county = (get_county2('中国|广东省|广东|广州市|广州', address)
                  if address else None)

        item_loader = CommonLoader2(item=HospitalInfoItem(),
                                    response=response)
        field_values = (
            ('hospital_name', detail.get('HIS_NM')),
            ('hospital_level', level),
            ('hospital_category', ''),
            ('hospital_addr', address),
            ('hospital_pro', '广东省'),
            ('hospital_city', '广州市'),
            ('hospital_county', county),
            ('hospital_phone', detail.get('TEL_NO')),
            ('hospital_intro', detail.get('HIS_RM')),
            ('registered_channel', self.data_source_from),
            ('dataSource_from', self.data_source_from),
            ('hospital_url', response.url),
            ('update_time', now_day()),
        )
        for field_name, field_value in field_values:
            item_loader.add_value(field_name, field_value)
        yield item_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Build a hospital item from a label/value style detail page.

    Each value is read from the ``td`` that encloses a ``<b>`` label
    (full name, level, address, phone, introduction).  Province and city
    are hard-coded (Guangdong / Zhuhai); the county is derived from the
    address text through ``get_county2``.  Yields one ``HospitalInfoItem``.

    Any exception raised while parsing is logged via ``self.logger.error``
    and swallowed.
    """
    self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
    try:
        address_text = response.xpath(
            '//b[contains(text(),"医院地址")]/ancestor::td[1]/text()'
        ).extract_first('')
        county = get_county2('中国|广东省|广东|珠海市|珠海', address_text)

        item_loader = CommonLoader2(item=HospitalInfoItem(),
                                    response=response)
        item_loader.add_xpath(
            'hospital_name',
            '//b[contains(text(),"医院全称")]/ancestor::td[1]/text()',
            MapCompose(custom_remove_tags))
        item_loader.add_xpath(
            'hospital_level',
            '//b[contains(text(),"医院级别")]/ancestor::td[1]/text()',
            MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_addr',
                              address_text,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_pro', '广东省')
        item_loader.add_value('hospital_city', '珠海市')
        item_loader.add_value('hospital_county',
                              county,
                              MapCompose(custom_remove_tags))
        item_loader.add_xpath(
            'hospital_phone',
            '//b[contains(text(),"联系电话")]/ancestor::td[1]/text()',
            MapCompose(custom_remove_tags))
        item_loader.add_xpath(
            'hospital_intro',
            '//b[contains(text(),"简介")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        item_loader.add_value('registered_channel', self.data_source_from)
        item_loader.add_value('dataSource_from', self.data_source_from)
        item_loader.add_value('crawled_url', response.url)
        item_loader.add_value('update_time', now_day())
        yield item_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse(self, response):
    """Parse the hospital-list JSON and emit items plus follow-up requests.

    For every entry under the ``list`` key of the decoded response this
    yields one ``HospitalInfoItem`` and, when the entry carries a
    ``hospitalid``, a ``Request`` to ``self.hospital_detail_url`` handled
    by ``self.parse_hospital_dep`` (the hospital name travels in ``meta``).
    Province and city are hard-coded (Guangdong / Zhongshan).

    If the decoded payload is not a mapping, or its ``list`` entry is
    missing, null or empty, a warning is logged and nothing is yielded
    instead of failing on iteration over ``None``.
    """
    self.logger.info('>>>>>>正在抓取所有医院信息>>>>>>')
    all_hospitals = json.loads(response.text)

    # ``.get('list')`` returns None when the key is absent or null;
    # iterating that directly would raise TypeError.
    hospital_list = (all_hospitals.get('list')
                     if isinstance(all_hospitals, dict) else None)
    if not hospital_list:
        self.logger.warning('响应中没有医院列表,跳过本次解析')
        return

    for each_hospital in hospital_list:
        hospital_name = each_hospital.get('hospitalname')
        hospital_address = each_hospital.get('address')
        # Only resolve the county when an address is actually present,
        # rather than handing None to get_county2.
        if hospital_address:
            hospital_county = get_county2('中国|广东省|广东|中山市|中山',
                                          hospital_address)
        else:
            hospital_county = None

        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_value('hospital_name',
                         hospital_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_addr',
                         hospital_address,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_pro', '广东省')
        loader.add_value('hospital_city', '中山市')
        loader.add_value('hospital_county',
                         hospital_county,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_phone',
                         each_hospital.get('telephoneno'),
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_intro',
                         each_hospital.get('information'),
                         MapCompose(custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('hospital_url', response.url)
        loader.add_value('update_time', now_day())
        yield loader.load_item()

        # Follow up for department and doctor information
        hospital_id = each_hospital.get('hospitalid')
        if hospital_id:
            self.headers['Referer'] = self.entry_url
            yield Request(self.hospital_detail_url.format(hospital_id),
                          headers=self.headers,
                          callback=self.parse_hospital_dep,
                          meta={'hospital_name': hospital_name})
def parse_hospital_info(self, response):
    """Build a hospital item and its alias items from a detail page.

    Level, category and medicare type are picked out of the '|'-joined
    tag texts next to the page title.  The province comes from
    ``response.meta['hospital_pro']``: when it is listed in
    ``MUNICIPALITY2`` it is used as the city and the province is blanked,
    otherwise '省' is appended and the city is looked up with
    ``get_city``.  The county is resolved with ``get_county2`` from the
    part of the address before the first ';'.

    Yields one ``HospitalInfoItem`` and then one ``HospitalAliasItem`` for
    every alias that differs from ``response.meta['hospital_name']``.
    Any exception raised while parsing is logged and swallowed.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取:[{}]医院详细信息>>>>>>'.format(hospital_name))
    try:
        def last_segment(match):
            # Last '|'-separated piece of the captured group, or None.
            return match.group(1).split('|')[-1] if match else None

        # Level / category / medicare type from the title tags
        tag_texts = response.xpath(
            '//div[@class="l"]/h2/span/i/text()').extract()
        tags = custom_remove_tags(remove_tags('|'.join(tag_texts)))
        level = category = medicare = None
        if tags:
            level = last_segment(re.search(r'(.*等|.*级|.*甲)', tags))
            category = last_segment(
                re.search(r'(.*?医院)', tags.replace('医保定点医院', '')))
            medicare = last_segment(re.search(r'(.*定点)', tags))

        # Province / city / county
        province = response.meta.get('hospital_pro')
        city = county = None
        address_nodes = response.xpath(
            '//dt[contains(text(),"地址")]/ancestor::dl[1]/dd').extract()
        address = custom_remove_tags(
            remove_tags(''.join(address_nodes).replace('查看地图', '')))
        if province and address:
            first_address = match_special2(address.split(';')[0])
            if province in MUNICIPALITY2:
                # Municipality: it acts as the city, province left blank
                city, province = province, ''
                bare_city = city.replace('市', '')
                prefixes = '{}{}|{}'.format(bare_city, '市', bare_city)
                county = get_county2(prefixes, first_address)
            else:
                bare_province = province
                province = '{0}{1}'.format(province, '省')
                city = get_city(province, first_address)
                if city:
                    bare_city = city.replace('市', '')
                    prefixes = '{}|{}|{}|{}'.format(
                        province, bare_province, city, bare_city)
                    county = get_county2(prefixes, first_address)

        # Public / private
        ownership = custom_remove_tags(
            response.xpath(
                '//li/b[contains(text(),"国营")]/text()').extract_first(''))
        if ownership == '国营':
            hospital_type = '公立'
        else:
            hospital_type = ''

        # Hospital item
        item_loader = CommonLoader2(item=HospitalInfoItem(),
                                    response=response)
        item_loader.add_xpath('hospital_name',
                              '//div[@class="l"]/h2/text()',
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_level',
                              level,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_type', hospital_type)
        item_loader.add_value('hospital_category',
                              category,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_addr',
                              address,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_pro',
                              province,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_city',
                              city,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('hospital_county',
                              county,
                              MapCompose(custom_remove_tags))
        item_loader.add_xpath(
            'hospital_phone',
            '//dt[contains(text(),"电话")]/ancestor::dl[1]/dd',
            MapCompose(remove_tags, custom_remove_tags))
        item_loader.add_xpath(
            'hospital_intro',
            '//dt/strong[contains(text(),"简介")]/ancestor::dl[1]/dd',
            MapCompose(remove_tags, custom_remove_tags))
        item_loader.add_value('medicare_type',
                              medicare,
                              MapCompose(custom_remove_tags))
        item_loader.add_value('dataSource_from', self.data_source_from)
        item_loader.add_value('hospital_url', response.url)
        item_loader.add_value('update_time', now_day())
        yield item_loader.load_item()

        # Hospital aliases
        raw_alias = response.xpath(
            '//div[@class="l"]/p/text()').extract_first('')
        alias_text = custom_remove_tags(raw_alias) if raw_alias else ''
        if alias_text:
            for alias in alias_text.split(','):
                if alias == hospital_name:
                    continue
                alias_loader = CommonLoader2(item=HospitalAliasItem(),
                                             response=response)
                alias_loader.add_xpath('hospital_name',
                                       '//div[@class="l"]/h2/text()',
                                       MapCompose(custom_remove_tags))
                alias_loader.add_value(
                    'hospital_alias_name',
                    alias,
                    MapCompose(custom_remove_tags, match_special))
                alias_loader.add_value('dataSource_from',
                                       self.data_source_from)
                alias_loader.add_value('update_time', now_day())
                yield alias_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Yield a hospital item, then Splash requests for each department.

    The hospital item is built from the ``jieshao_zi`` block of the page;
    the province is hard-coded (Hunan) and city / county are derived from
    the address paragraph with ``get_city`` / ``get_county2``.

    For every department anchor, the parameters embedded in its
    ``onclick`` attribute are extracted by regex and two ``SplashRequest``
    objects are yielded: one for the department detail
    (``self.parse_hospital_dep_detail``) and one for its doctors
    (``self.parse_doctor_info``).  Anchors lacking any of
    ``platformDeptId``, ``platformHosId``, ``tempDeptName`` or ``orgname``
    are skipped.  ``isSpTime`` and ``paymode`` are treated as optional and
    default to an empty string when absent.

    Side effect: ``self.headers`` is updated in place before each request
    is built, as before; each request, however, receives its own snapshot
    of the headers.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医院信息和科室信息>>>>>>'.format(hospital_name))

    address_parts = response.xpath(
        '//div[@class="jieshao_zi"]/p[4]/text()').extract()
    if address_parts:
        hospital_address = custom_remove_tags(''.join(address_parts))
        hospital_city2 = get_city(hospital_address)
        useless_info = '中国|湖南省|湖南|{}'.format(hospital_city2)
        hospital_county = get_county2(useless_info, hospital_address)
    else:
        hospital_county = None

    loader = CommonLoader2(item=HospitalInfoItem(), response=response)
    loader.add_xpath('hospital_name',
                     '//div[@class="jieshao_zi"]/p/font/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('hospital_level',
                     '//div[@class="jieshao_zi"]/p[2]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('hospital_type', '公立')
    loader.add_value('hospital_category', '')
    loader.add_xpath('hospital_addr',
                     '//div[@class="jieshao_zi"]/p[4]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('hospital_pro', '湖南省')
    loader.add_xpath('hospital_city',
                     '//div[@class="jieshao_zi"]/p[4]/text()',
                     MapCompose(custom_remove_tags, get_city))
    loader.add_value('hospital_county', hospital_county)
    loader.add_xpath('hospital_phone',
                     '//div[@class="jieshao_zi"]/p[3]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('hospital_intro',
                     '//div[@id="starlist"]',
                     MapCompose(remove_tags, custom_remove_tags, clean_info))
    loader.add_value('registered_channel', self.data_source_from)
    loader.add_value('dataSource_from', self.data_source_from)
    loader.add_value('update_time', now_day())
    yield loader.load_item()

    # Department information
    self.logger.info('>>>>>>正在抓取[{}]科室信息>>>>>>'.format(hospital_name))
    dept_groups = response.xpath('//div[@class="xuanze_kslb"]')
    for dept_group in dept_groups:
        dept_type = dept_group.xpath(
            'div[1]/ul/li/text()').extract_first('')
        for dept_anchor in dept_group.xpath('div[2]/ul/li/a'):
            data_info = dept_anchor.xpath('@onclick').extract_first('')
            if not data_info:
                continue

            # Strip all whitespace so the key:'value' pairs are contiguous
            data_info = ''.join(re.findall(r'\S+', data_info))
            is_sp_time = re.search(r'isSpTime:\'(.*?)\'', data_info)
            pay_mode = re.search(r'paymode:\'(.*?)\'', data_info)
            dept_id = re.search(r'platformDeptId:\'(.*?)\'', data_info)
            hos_id = re.search(r'platformHosId:\'(.*?)\'', data_info, S)
            dept_name = re.search(r'tempDeptName:\'(.*?)\'', data_info, S)
            org_name = re.search(r'orgname:\'(.*?)\'', data_info, S)
            if not (dept_id and hos_id and dept_name and org_name):
                continue

            # isSpTime / paymode are not part of the guard above, so a
            # missing one must not be dereferenced with .group().
            is_sp_time = is_sp_time.group(1) if is_sp_time else ''
            pay_mode = pay_mode.group(1) if pay_mode else ''
            dept_id = dept_id.group(1)
            hos_id = hos_id.group(1)
            dept_name = dept_name.group(1)
            org_name = org_name.group(1)

            # Department detail request
            data = {
                'isSpTime': str(is_sp_time),
                'paymode': quote(pay_mode),
                'doctorCollectResult': '',
                'platformDeptId': str(dept_id),
                'orgname': quote(org_name),
                'tempDeptName': quote(dept_name),
                'platformHosId': str(hos_id),
                'platformDoctorId': ''
            }
            self.headers.update({
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'http://www.hnyygh.com',
                'Referer':
                'http://www.hnyygh.com/searchDeptmentAction.action',
                'Pragma': 'no-cache'
            })
            # Snapshot the headers: the shared dict is updated again just
            # below, and that later update must not reach the request
            # yielded here through a shared reference.
            # NOTE(review): assumes self.headers is a plain dict - confirm.
            dept_headers = dict(self.headers)
            splash_args = {
                'url': self.dept_detail_url,
                'headers': dept_headers,
                'lua_source': self.dept_script,
                'data': data
            }
            yield SplashRequest(self.dept_detail_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=dept_headers,
                                callback=self.parse_hospital_dep_detail,
                                meta={
                                    'dept_type': dept_type,
                                    'dept_name': dept_name,
                                    'hospital_name': org_name
                                })

            # Doctor information request
            data = {
                'platformDeptId': dept_id,
                'platformHosId': hos_id,
                'platformDoctorId': '',
                'nextNumInfo': '0'
            }
            self.headers.update({
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'http://www.hnyygh.com',
                'Referer':
                'http://www.hnyygh.com/searchOrderNumInfoAction.action'
            })
            doctor_headers = dict(self.headers)
            splash_args = {
                'url': self.doctor_url,
                'headers': doctor_headers,
                'lua_source': self.dept_script,
                'data': data
            }
            yield SplashRequest(self.doctor_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=doctor_headers,
                                callback=self.parse_doctor_info,
                                meta={
                                    'dept_type': dept_type,
                                    'dept_name': dept_name,
                                    'dept_id': dept_id,
                                    'hospital_name': org_name
                                })
def parse_hospital_info(self, response):
    """Yield a hospital item, its department items and doctor requests.

    The hospital item is built from the detail page; province and city
    are hard-coded (Fujian / Xiamen) and the county is derived from the
    address through ``get_county2``.  ``hospital_id`` and
    ``hospital_name`` are taken from ``response.meta``.

    For every department listed in the ``medicineOne`` / ``medicineTwo``
    blocks a ``HospitalDepItem`` is yielded.  When the department has both
    a link and a non-zero doctor count, a ``Request`` for its doctors is
    also yielded, handled by ``self.parse_doctor_info``.

    Any exception raised while parsing is logged via ``self.logger.error``
    and swallowed.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
    try:
        hospital_id = response.meta.get('hospital_id')
        hospital_img_url = response.xpath(
            '//div[@class="divLeft_Img"]/img/@src').extract_first('')
        hospital_img_url = urljoin(
            self.host, hospital_img_url) if hospital_img_url else ''
        hospital_address = response.xpath(
            '//li[contains(text(),"地址")]/text()').extract_first('')
        hospital_county = get_county2('中国|福建省|福建|厦门市|厦门',
                                      match_special2(hospital_address))

        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath(
            'hospital_name',
            '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
            MapCompose(custom_remove_tags))
        loader.add_value('hospital_addr',
                         hospital_address,
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_pro', '福建省')
        loader.add_value('hospital_city', '厦门市')
        loader.add_value('hospital_county',
                         hospital_county,
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_phone',
                         '//li[contains(text(),"电话")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_intro',
                         '//div[@class="introduceSpan"]',
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        loader.add_xpath('hospital_official_website',
                         '//li[contains(text(),"官网")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_route',
                         '//li[contains(text(),"公交线路")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_img_url', hospital_img_url)
        loader.add_value('gmt_created', now_time())
        loader.add_value('gmt_modified', now_time())
        loader.add_value('hospital_id', hospital_id)
        yield loader.load_item()

        # The introduction row is selected from the whole page, not from
        # the individual department node, so it is the same for every
        # department: evaluate the XPath once instead of per department.
        dept_info = ''.join(
            response.xpath(
                '//p[contains(text(),"科室简介")]/ancestor::tr[1]'
            ).extract())

        # Department information
        all_dept_info = response.xpath(
            '//div[@class="medicineOne"]|//div[@class="medicineTwo"]')
        for each_dept_info in all_dept_info:
            dept_type = each_dept_info.xpath(
                'div[1]/span/text()').extract_first('')
            for each_dept_name in each_dept_info.xpath('div[2]/div[1]'):
                dept_name = each_dept_name.xpath(
                    'a/text()').extract_first('')
                dept_link = each_dept_name.xpath(
                    'a/@href').extract_first('')
                doctor_num_of_dept = each_dept_name.xpath(
                    'span/text()').extract_first('')

                # Number of doctors: first run of digits, if any
                count_match = (re.search(r'(\d+)', doctor_num_of_dept)
                               if doctor_num_of_dept else None)
                dept_person_num = (int(count_match.group(1))
                                   if count_match else None)

                # Department item
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name',
                                      dept_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('hospital_name',
                                      hospital_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_type',
                                      dept_type,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value(
                    'dept_info',
                    dept_info,
                    MapCompose(remove_tags, custom_remove_tags,
                               match_special2))
                dept_loader.add_value('crawled_url', response.url)
                dept_loader.add_value('update_time', now_day())
                dept_loader.add_value('dept_id',
                                      dept_link,
                                      MapCompose(match_special2))
                dept_loader.add_value('hospital_id', hospital_id)
                dept_loader.add_value('dept_person_num', dept_person_num)
                dept_loader.add_value('dept_url',
                                      urljoin(self.host, dept_link))
                dept_loader.add_value('gmt_created', now_time())
                dept_loader.add_value('gmt_modified', now_time())
                yield dept_loader.load_item()

                # Doctor information: only for departments that have a
                # link and a non-zero doctor count
                if dept_link and dept_person_num:
                    self.headers['Referer'] = response.url
                    yield Request(urljoin(self.host, dept_link),
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  dont_filter=True,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'dept_name': dept_name,
                                  })
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))