def parse_hospital_dep(self, response):
    """Parse a hospital page's department list.

    Yields one HospitalDepItem per department, then a Request to each
    department page for its doctor information.
    """
    self.logger.info('>>>>>>正在抓取科室信息……')
    hospital_name = response.xpath(
        '//div[@class="hos-info"]/h1/text()').extract_first('')
    all_dept_links = response.xpath('//dd[@class="ks-2"]/ul/li')
    self.logger.info('{}:共有{}个科室'.format(hospital_name,
                                         str(len(all_dept_links))))
    for each_dept_link in all_dept_links:
        # Department item.
        dep_loader = YiHuLoader(item=HospitalDepItem(),
                                selector=each_dept_link)
        dep_loader.add_xpath('dept_type', 'a/text()')
        dep_loader.add_xpath('dept_name', 'a/text()')
        dep_loader.add_value('hospital_name', hospital_name)
        dep_loader.add_value('update_time', now_day())
        yield dep_loader.load_item()
        # Doctors of this department.
        dept_link = each_dept_link.xpath('a/@href').extract_first('')
        if dept_link:
            dept_link = urljoin(self.host, dept_link)
            # BUG FIX: set Referer BEFORE constructing the Request — Scrapy
            # copies the headers dict at construction time, so mutating it
            # afterwards never affected the request just created.
            self.headers['Referer'] = response.url
            yield Request(dept_link, headers=self.headers,
                          callback=self.parse_dept_link)
def parse_dept_info(self, response):
    """Parse the department JSON payload and schedule doctor-list requests.

    One HospitalDepItem per department, followed by a paginated FormRequest
    for that department's doctors.
    """
    self.logger.info('>>>>>>正在抓取医院科室相关信息……')
    dept_info = json.loads(response.text)
    for each_dept in dept_info['responseData']['data']['data']['depart']:
        loader = CommonLoader(item=HospitalDepItem(), response=response)
        loader.add_value('dept_name', each_dept['deptname'])
        loader.add_value(
            'hospital_name',
            dept_info['responseData']['data']['data']['hospital']['hospitalName'])
        loader.add_value('update_time', now_day())
        yield loader.load_item()
        dept_id = each_dept.get('deptid', '')
        if dept_id:
            data = {
                'key': '',
                'deptId': str(dept_id),
                'pageIndex': '1',
                'pageSize': '100',
            }
            # BUG FIX: the Referer must be in self.headers BEFORE the
            # FormRequest is built — headers are copied at construction,
            # so the old post-construction assignment had no effect.
            self.headers['Referer'] = 'http://www.scgh114.com/web/register/doctor'
            yield FormRequest(self.doctor_link, headers=self.headers,
                              callback=self.parse_doctor_info, formdata=data,
                              meta={'dept_id': dept_id}, dont_filter=True)
def parse_hospital_dep(self, response):
    """Parse department groups; follow detail and doctor-list links."""
    self.logger.info('正在抓取{}:科室信息'.format(self.hospital_name))
    type_blocks = response.xpath('//div[@class="part"]/div[@class="part01"]')
    for type_block in type_blocks:
        type_name = type_block.xpath('div/div[1]/text()').extract_first('')
        dept_nodes = type_block.xpath('ul/li')
        self.logger.info('总共有{}科室'.format(str(len(dept_nodes))))
        for dept_node in dept_nodes:
            loader = MedicalMapLoader(item=HospitalDepItem(),
                                      selector=dept_node)
            loader.add_value('dept_type', type_name)
            detail_href = dept_node.xpath('div/a[1]/@href').extract_first('')
            doctor_href = dept_node.xpath('div/a[2]/@href').extract_first('')
            loader.add_value('hospital_name', self.hospital_name)
            loader.add_xpath('dept_name', 'h3/text()')
            if detail_href:
                detail_href = urljoin(self.host, detail_href)
                self.headers['Referer'] = detail_href
                # The detail page lives under "classsysdetail" instead of
                # the listing's "sectionshow" path.
                detail_url = detail_href.replace('sectionshow',
                                                 'classsysdetail')
                yield Request(detail_url, headers=self.headers,
                              callback=self.parse_dep_detail,
                              meta={'loader': loader})
            if doctor_href:
                yield Request(urljoin(self.host, doctor_href),
                              headers=self.headers,
                              callback=self.parse_doctor_info)
def parse_hospital_dep_detail(self, response):
    """Yield one department item; fan out to sibling departments once."""
    self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_xpath('dept_type', '//div[@class="title"]/h3/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('dept_name', '//div[@class="title"]/h3/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('dept_info', '//div[@class="content"]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
    # Sibling departments — only followed from the first detail page seen.
    self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
    sibling_links = response.xpath(
        '//ul[@class="list2"]/li[position()>1]/a/@href').extract()
    self.dept_crawled_cnt += 1
    if sibling_links and self.dept_crawled_cnt == 1:
        for link in sibling_links:
            req = Request(urljoin(self.host, link), headers=self.headers,
                          callback=self.parse_hospital_dep_detail,
                          dont_filter=True)
            req.meta['Referer'] = response.url
            yield req
def parse_hospital_dep(self, response):
    """Parse the department table of one hospital; follow detail pages."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
    try:
        dept_anchors = response.xpath(
            '//div[@id="one_2"]/div/div/table/tbody/tr/td[@class="contentTd"]/a'
        )
        for anchor in dept_anchors:
            name = anchor.xpath('text()').extract_first('')
            detail_href = anchor.xpath('@href').extract_first('')
            dept_loader = CommonLoader2(item=HospitalDepItem(),
                                        response=response)
            dept_loader.add_value('dept_name', name,
                                  MapCompose(custom_remove_tags))
            dept_loader.add_value('hospital_name', hospital_name,
                                  MapCompose(custom_remove_tags))
            dept_loader.add_value('dataSource_from', self.data_source_from)
            dept_loader.add_value('update_time', now_day())
            # The partially filled loader travels to the detail callback.
            if name and detail_href:
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, detail_href),
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail,
                              meta={
                                  'dept_name': name,
                                  'dept_loader': dept_loader,
                                  'hospital_name': hospital_name
                              },
                              dont_filter=True)
    except Exception as e:
        self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_dept_info(self, response):
    """Parse a first-level department page; recurse into sub-departments."""
    dep_type = response.meta['dep_type']
    self.logger.info('正在抓取[{}]科室信息'.format(custom_remove_tags(dep_type)))
    sub_dept_nodes = response.xpath('//div[@class="pic"]')
    if sub_dept_nodes:
        # First-level department that has second-level sub-departments.
        for node in sub_dept_nodes:
            href = node.xpath('a/@href').extract_first('')
            if href:
                req = Request(urljoin(self.host, href),
                              headers=self.headers,
                              callback=self.parse_dept_detail,
                              meta={'dep_type': dep_type})
                req.meta['Referer'] = response.url
                yield req
    else:
        # First-level department with no sub-departments: emit directly.
        loader = PxfybjyLoader(item=HospitalDepItem(), response=response)
        loader.add_value('dept_type', dep_type)
        loader.add_value('hospital_name', self.hospital_name)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
def parse_hospital_dep(self, response):
    """Parse the sub-department JSON listing for one department type."""
    hospital_name = response.meta.get('hospital_name')
    dept_type = response.meta.get('dept_type')
    self.logger.info('>>>>>>正在抓取:[{}]医院-[{}]科室信息>>>>>>'.format(hospital_name,
                                                              dept_type))
    try:
        payload = json.loads(response.text)
        for sub_dept in payload.get('data').get('subDepList'):
            loader = CommonLoader2(item=HospitalDepItem(), response=response)
            loader.add_value('dept_name', sub_dept.get('name'),
                             MapCompose(custom_remove_tags))
            loader.add_value('dept_type', dept_type,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_name', hospital_name,
                             MapCompose(custom_remove_tags))
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_value('dept_id', sub_dept.get('id'))
            loader.add_value('dept_url', response.url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            yield loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_dep_detail(self, response):
    """Build a department item from the detail page content block."""
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_value('dept_name', response.meta['dept_name'])
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('dept_info', '//div[@class="fleft wd740"]')
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Build a department item from the indented intro paragraphs."""
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_value('dept_name', response.meta['dept_name'],
                     MapCompose(match_special2))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('dept_info', '//div[@style="text-indent: 2em"]',
                     MapCompose(remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep(self, response):
    """Parse department blocks; yield items and follow doctor-list pages."""
    self.logger.info('>>>>>>正在抓取科室信息>>>>>>')
    try:
        hospital_name = response.meta.get('hospital_name')
        blocks = response.xpath('//div[@class="deptList-block mb20 clearfix"]')
        for block in blocks:
            dept_type = block.xpath('b/text()').extract_first('')
            for anchor in block.xpath('ul/li/a'):
                dept_name = anchor.xpath('@title').extract_first('')
                onclick = anchor.xpath('@onclick').extract_first('')
                href = anchor.xpath('@href').extract_first('')
                loader = CommonLoader2(item=HospitalDepItem(),
                                       response=response)
                loader.add_value('dept_name', dept_name)
                loader.add_value('dept_type', dept_type)
                loader.add_value('hospital_name', hospital_name)
                loader.add_value('dept_info', '')
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('update_time', now_day())
                yield loader.load_item()
                # The doctor list is reached either via a goNext(...) onclick
                # handler or a plain href.
                if onclick:
                    match = re.search(r'goNext\((.*?),\'(.*)\'\);', onclick)
                    if match:
                        doctor_list_url = self.doctor_list_url.format(
                            match.group(1), match.group(2))
                    else:
                        doctor_list_url = None
                else:
                    doctor_list_url = urljoin(self.host, href)
                if doctor_list_url:
                    self.headers['Referer'] = response.url
                    yield Request(doctor_list_url, headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  meta={
                                      'dept_name': dept_name,
                                      'hospital_name': hospital_name
                                  })
    except Exception as e:
        self.logger.error('在抓取科室信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_dep(self, response):
    """Parse a department page: expand "更多" listings, follow intro pages,
    or yield an item directly when there is no second-level department.

    BUG FIX: ``self.headers['Referer']`` is now assigned BEFORE each Request
    is constructed — Scrapy copies the headers dict at construction time, so
    the original post-construction assignment never reached the request it
    was intended for.
    """
    self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
    has_more_dept = response.xpath(
        '//div[@id="current"]/span/a[contains(text(),"更多")]/@href').extract()
    if has_more_dept:
        for each_dept_link in has_more_dept:
            self.headers['Referer'] = response.url
            yield Request(urljoin(self.host, each_dept_link),
                          headers=self.headers,
                          callback=self.parse_hospital_dep)
    else:
        dept_detail_link = response.xpath(
            '//div[@class="list1"]/ul/li/a[contains(text(),"科室介绍") or '
            'contains(text(), "简介")]/@href').extract_first('')
        dept_name1 = response.xpath(
            '//div[@class="list1"]/ul/li[2]/a/text()').extract_first('')
        dept_name2 = response.xpath(
            '//div[@id="current"]/a[3]/text()').extract_first('')
        dept_detail_link2 = response.xpath(
            '//div[@class="list1"]/ul/li[2]/a/@href').extract_first('')
        if dept_detail_link:
            # The intro link is explicitly labelled "科室介绍" or "简介".
            self.headers['Referer'] = response.url
            yield Request(urljoin(self.host, dept_detail_link),
                          headers=self.headers,
                          callback=self.parse_hospital_dep_detail)
        elif dept_name1 == dept_name2 and dept_detail_link2:
            # Intro link present but not labelled with those keywords.
            self.headers['Referer'] = response.url
            yield Request(urljoin(self.host, dept_detail_link2),
                          headers=self.headers,
                          callback=self.parse_hospital_dep_detail)
        else:
            # No second-level department: emit the item right here.
            loader = CommonLoader2(item=HospitalDepItem(), response=response)
            loader.add_xpath('dept_type', '//div[@id="current"]/a[2]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_name', self.hospital_name)
            loader.add_xpath('dept_name', '//div[@id="current"]/a[2]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('update_time', now_day())
            yield loader.load_item()
    # Other department categories — only followed from the first page crawled.
    other_dept_links = response.xpath(
        '//div[@id="left1"]/span[position()>1]/a/@href').extract()
    self.dept_crawled_cnt += 1
    if self.dept_crawled_cnt <= 1 and other_dept_links:
        for each_other_dept in other_dept_links:
            self.headers['Referer'] = response.url
            yield Request(urljoin(self.host, each_other_dept),
                          headers=self.headers,
                          callback=self.parse_hospital_dep)
def parse_hospital_dep_detail(self, response):
    """Fill a department item with intro text scraped from the detail page."""
    self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_value('dept_name', response.meta['dept_name'])
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('dept_info', '//div[@class="right-about clearfix"]',
                     MapCompose(remove_tags, clean_info))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Extract department name and intro from the list-item block."""
    self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_xpath('dept_name', '//div[@class="list-item fl"]/h1/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('dept_info', '//div[@class="list-item fl"]/p',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Scrape department name, hospital name, and intro from the page."""
    self.logger.info('>>>>>>正在抓取科室详细信息>>>>>>')
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_xpath('dept_name', '//div[@class="zrys"]/p/strong/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('hospital_name', '//div[@class="yy_til"]/h2/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('dept_info', '//div[@class="zrys"]/dl/dd',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('dataSource_from', self.data_source_from)
    loader.add_value('crawled_url', response.url)
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Combine meta-carried name/type with intro scraped from the page."""
    self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_value('dept_name', response.meta['dept_name'])
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_value('dept_type', response.meta['dept_type'])
    loader.add_xpath('dept_info',
                     '//div[@class="rightPanel"]/p[position()>2]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Fill the department intro from the departmentintro content block."""
    self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_value('dept_name', response.meta['dept_name'],
                     MapCompose(custom_remove_tags))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath(
        'dept_info',
        '//div[@class="content-left pull-left departmentintro"]',
        MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_dept_detail(self, response):
    """Department detail information of the hospital."""
    self.logger.info('正在抓取{}:科室详细信息'.format(self.hospital_name))
    loader = PxfybjyLoader(item=HospitalDepItem(), response=response)
    name = response.xpath('//li[@class="name1"]/text()').extract_first('')
    info = response.xpath(
        '//div[@class="FrontProducts_detail02-1468396987105_htmlbreak"]/p'
    ).extract()
    loader.add_value('dept_type', response.meta['dep_type'])
    loader.add_value('dept_name', name)
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_value('dept_info', info)
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Scrape the department summary block, excluding nav/title children."""
    self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=HospitalDepItem(), response=response)
    loader.add_xpath('dept_name', '//div[@class="page_sum2_tit"]/text()',
                     MapCompose(custom_remove_tags, clean_info))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_value('dept_type', response.meta['dept_type'])
    loader.add_xpath(
        'dept_info',
        '//div[@class="page_sum2"]/*['
        'not(contains(@class,"listsum_block2")) and'
        'not(contains(@class,"page_tit")) and'
        'not(contains(@class,"page_sum2_tit"))]',
        MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse_hospital_dep(self, response):
    """Walk the department table; rowspan cells carry the department type."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
    try:
        for row in response.xpath('//table[@id="deptlist"]/tr'):
            row_type = row.xpath('td[1][@rowspan]/text()').extract_first('')
            if row_type:
                # Remember the type for subsequent rows that omit the cell.
                self.temp_dept_type = row_type
            for cell in row.xpath('td[not(@rowspan)]'):
                dept_name = cell.xpath('a/text()').extract_first('')
                detail_href = cell.xpath('a/@href').extract_first('')
                if dept_name and detail_href:
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', self.temp_dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())
                    # Detail page completes the loader (passed along in meta).
                    self.headers['Referer'] = response.url
                    yield Request(urljoin(self.host, detail_href),
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep_detail,
                                  meta={
                                      'dept_name': dept_name,
                                      'dept_loader': dept_loader,
                                      'hospital_name': hospital_name
                                  },
                                  dont_filter=True)
    except Exception as e:
        self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_dep(self, response):
    """Parse lab-list department blocks; follow each department detail page."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
    try:
        for type_block in response.xpath('//div[@class="lab-list"]/div'):
            dept_type = type_block.xpath('div/a/text()').extract_first('')
            for dept_node in type_block.xpath('ul/li'):
                dept_name = dept_node.xpath('a/text()').extract_first('')
                dept_doctor_cnt = dept_node.xpath(
                    'span/b[1]/text()').extract_first('')
                detail_href = dept_node.xpath('a/@href').extract_first('')
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', dept_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_type', dept_type,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_xpath('hospital_name',
                                      '//div[@class="l"]/h2/text()',
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value('update_time', now_day())
                # Detail page completes the loader (passed along in meta).
                if dept_name and detail_href:
                    self.headers['Referer'] = response.url
                    yield Request(urljoin(self.host, detail_href),
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep_detail,
                                  meta={
                                      'dept_name': dept_name,
                                      'dept_loader': dept_loader,
                                      'dept_doctor_cnt': dept_doctor_cnt,
                                      'hospital_name': hospital_name
                                  },
                                  dont_filter=True)
    except Exception as e:
        self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_dep_detail(self, response):
    """Yield a department item when both name and hospital are known."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]科室详细信息>>>>>>'.format(hospital_name))
    dept_type = response.meta.get('dept_type')
    dept_name = response.meta.get('dept_name')
    if dept_name and hospital_name:
        loader = CommonLoader2(item=HospitalDepItem(), response=response)
        loader.add_value('dept_name', dept_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', hospital_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('dept_type', dept_type,
                         MapCompose(custom_remove_tags))
        loader.add_xpath('dept_info', '//div[@id="schedule_jienr"]',
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
def parse_hospital_info(self, response):
    """Parse a hospital detail page: one hospital item plus its departments."""
    self.logger.info('>>>>>>正在抓取:医院详细信息和科室信息>>>>>>')
    try:
        # Hospital information.
        hospital_type = response.meta.get('hospital_type')
        hospital_category = ('{0}{1}'.format(hospital_type, '医院')
                             if hospital_type else None)
        hospital_info = custom_remove_tags(remove_tags(''.join(
            response.xpath('//td[@class="title_yh14"]').extract())))
        hospital_address = get_hospital_info(hospital_info, '地址:', '电话:')
        hospital_address = (hospital_address.replace('查看地图', '')
                            if hospital_address else None)
        hospital_phone = get_hospital_info(hospital_info, '电话:', '官网')
        hospital_intro = get_hospital_info(hospital_info, '简介:', '$')
        hospital_intro = (hospital_intro.replace('...更多>>', '')
                          if hospital_intro else None)
        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath('hospital_name', '//span[@class="title"]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_level', '//span[@class="dj"]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_category', hospital_category)
        loader.add_value('hospital_addr', hospital_address,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_pro', '山西省')
        loader.add_xpath(
            'hospital_city',
            '//td[contains(text(),"山西")]/ancestor::tr[1]/td[1]/a[1]/text()',
            MapCompose(custom_remove_tags))
        loader.add_xpath(
            'hospital_county',
            '//td[contains(text(),"山西")]/ancestor::tr[1]/td[1]/a[2]/text()',
            MapCompose(custom_remove_tags))
        loader.add_value('hospital_phone', hospital_phone,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_intro', hospital_intro,
                         MapCompose(custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
        # Department information.
        self.logger.info('>>>>>>正在抓取科室详细信息>>>>>>')
        for dept_row in response.xpath('//tr[@class="h_bottom"]'):
            dept_type = dept_row.xpath('td[1]/text()').extract_first('')
            names = dept_row.xpath('td[2]/table/tr/td/a/text()').extract()
            for each_name in names:
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value(
                    'dept_name', each_name,
                    MapCompose(custom_remove_tags, match_special2))
                dept_loader.add_value(
                    'dept_type', dept_type,
                    MapCompose(custom_remove_tags, match_special2))
                dept_loader.add_xpath('hospital_name',
                                      '//span[@class="title"]/text()',
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_info', '')
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value('update_time', now_day())
                yield dept_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Parse a hospital's detail page (two layouts, selected by the
    ``data_type`` meta value), yield the hospital item, then schedule
    department and doctor requests.

    BUG FIX: the phone-detection regex is now a raw string — ``'\\d'`` inside
    a plain string literal is an invalid escape sequence (SyntaxWarning on
    modern Python).
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医院详细信息>>>>>>'.format(hospital_name))
    try:
        hospital_id = response.meta.get('hospital_id')
        data_type = response.meta.get('data_type')
        hospital_pro = response.meta.get('province_name')
        if data_type == '1':
            hospital_address = response.xpath(
                '///div[@class="search-result-hospital-text"]/'
                'p[4]/text()').extract_first('')
            hospital_phone = response.xpath(
                '//div[@class="search-result-hospital-text"]/'
                'p[3]/text()').extract_first('')
            check_phone = re.search(r'(\d{6,})', hospital_phone)
            # When p[3] carries no phone digits and p[4] is empty, p[3] is
            # actually the address.
            if not check_phone and not hospital_address:
                hospital_address = hospital_phone
                hospital_phone = ''
            df = transform([hospital_address])
            hospital_city = df.head()['市'][0]
            hospital_county = df.head()['区'][0]
            if hospital_pro in MUNICIPALITY2:
                hospital_city = '{0}{1}'.format(hospital_pro, '市')
                hospital_pro = ''
            else:
                hospital_pro = '{0}{1}'.format(hospital_pro, '省')
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath(
                'hospital_name',
                '//span[@class="search-result-hospital-name"]/text()',
                MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_level',
                '//div[@class="search-result-hospital-text"]/p[2]/text()',
                MapCompose(custom_remove_tags, clean_info2))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', hospital_pro)
            loader.add_value('hospital_city', hospital_city)
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_phone', hospital_phone,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_intro', '//li[@id="info"]/p',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath(
                'hospital_route',
                '//div[@class="search-result-hospital-text"]/p[5]/text()',
                MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath(
                'hospital_img_url',
                'div[@class="search-result-hospital-img"]/img/@src')
            loader.add_value('hospital_tags', '1')
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            yield loader.load_item()
            # Departments: second-level lists are fetched per first-level id.
            all_dept = response.xpath('//ul[@id="parent-list"]/li[@id]')
            for each_dept in all_dept:
                each_dept_id = each_dept.xpath('@id').extract_first('')
                each_dept_type = each_dept.xpath(
                    'div/span/text()').extract_first('')
                self.headers['Referer'] = response.url
                dept_link = self.dept_url.format(hospital_id, each_dept_id)
                yield Request(dept_link, headers=self.headers,
                              callback=self.parse_hospital_dep,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id,
                                  'dept_type': each_dept_type
                              })
            # Doctors (page 1).
            self.headers['Referer'] = response.url
            doctor_info_link = self.doctor_url.format(hospital_id, '1')
            yield Request(doctor_info_link, headers=self.headers,
                          callback=self.parse_doctor_info,
                          meta={
                              'hospital_name': hospital_name,
                              'hospital_id': hospital_id
                          })
        elif data_type == '2':
            hospital_address = response.xpath(
                '//p[@class="hospital-private-address-line fc-6"]'
                '[contains(text(),"地址")]/text()').extract_first('')
            df = transform([hospital_address])
            hospital_city = df.head()['市'][0]
            hospital_county = df.head()['区'][0]
            if hospital_pro in MUNICIPALITY2:
                hospital_city = '{0}{1}'.format(hospital_pro, '市')
                hospital_pro = ''
            else:
                hospital_pro = '{0}{1}'.format(hospital_pro, '省')
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath(
                'hospital_name',
                '//p[@class="hospital-private-content-tit"]/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_pro', hospital_pro)
            loader.add_value('hospital_city', hospital_city)
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_phone',
                '//div[@class="search-result-hospital-text"]/p[3]/text()',
                MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_intro', '//li[@id="info"]/p',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_route', '//li[@id="address"]/p[3]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_tags', '2')
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            yield loader.load_item()
            # Departments listed inline on the page.
            all_dept = response.xpath(
                '//ul[@id="parent-list"]/li[position()>1]')
            for each_dept in all_dept:
                dept_id = each_dept.xpath('div/@id').extract_first('')
                dept_name = each_dept.xpath(
                    'div/span/text()').extract_first('')
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', dept_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('hospital_name', hospital_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value('crawled_url', response.url)
                dept_loader.add_value('update_time', now_day())
                dept_loader.add_value('dept_id',
                                      dept_id.replace('subDepLi-', ''))
                dept_loader.add_value('dept_url', response.url)
                dept_loader.add_value('gmt_created', now_time())
                dept_loader.add_value('gmt_modified', now_time())
                yield dept_loader.load_item()
                # Doctors (page 1). NOTE(review): the same doctor URL is
                # built each iteration; Scrapy's duplicate filter collapses
                # the repeats — confirm this matches the intended behavior.
                self.headers['Referer'] = response.url
                doctor_info_link = self.doctor_url.format(hospital_id, '1')
                yield Request(doctor_info_link, headers=self.headers,
                              callback=self.parse_doctor_info,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id,
                                  'dept_name': dept_name
                              })
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_info(self, response):
    """Parse a hospital detail page; yield the hospital item, its
    departments, and doctor-list requests."""
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
    try:
        hospital_id = response.meta.get('hospital_id')
        img_src = response.xpath(
            '//div[@class="divLeft_Img"]/img/@src').extract_first('')
        hospital_img_url = urljoin(self.host, img_src) if img_src else ''
        hospital_address = response.xpath(
            '//li[contains(text(),"地址")]/text()').extract_first('')
        hospital_county = get_county2('中国|福建省|福建|厦门市|厦门',
                                      match_special2(hospital_address))
        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath(
            'hospital_name',
            '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
            MapCompose(custom_remove_tags))
        loader.add_value('hospital_addr', hospital_address,
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_pro', '福建省')
        loader.add_value('hospital_city', '厦门市')
        loader.add_value('hospital_county', hospital_county,
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_phone',
                         '//li[contains(text(),"电话")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_intro', '//div[@class="introduceSpan"]',
                         MapCompose(remove_tags, custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        loader.add_xpath('hospital_official_website',
                         '//li[contains(text(),"官网")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('hospital_route',
                         '//li[contains(text(),"公交线路")]/text()',
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_value('hospital_img_url', hospital_img_url)
        loader.add_value('gmt_created', now_time())
        loader.add_value('gmt_modified', now_time())
        loader.add_value('hospital_id', hospital_id)
        yield loader.load_item()
        # Department information.
        dept_blocks = response.xpath(
            '//div[@class="medicineOne"]|//div[@class="medicineTwo"]')
        for dept_block in dept_blocks:
            dept_type = dept_block.xpath(
                'div[1]/span/text()').extract_first('')
            for dept_node in dept_block.xpath('div[2]/div[1]'):
                dept_name = dept_node.xpath('a/text()').extract_first('')
                dept_link = dept_node.xpath('a/@href').extract_first('')
                doctor_num_of_dept = dept_node.xpath(
                    'span/text()').extract_first('')
                # Number of doctors in the department, when shown.
                if doctor_num_of_dept:
                    num_match = re.search(r'(\d+)', doctor_num_of_dept)
                    dept_person_num = (int(num_match.group(1))
                                       if num_match else None)
                else:
                    dept_person_num = None
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', dept_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('hospital_name', hospital_name,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_type', dept_type,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_info = ''.join(response.xpath(
                    '//p[contains(text(),"科室简介")]/ancestor::tr[1]').extract())
                dept_loader.add_value(
                    'dept_info', dept_info,
                    MapCompose(remove_tags, custom_remove_tags,
                               match_special2))
                dept_loader.add_value('crawled_url', response.url)
                dept_loader.add_value('update_time', now_day())
                dept_loader.add_value('dept_id', dept_link,
                                      MapCompose(match_special2))
                dept_loader.add_value('hospital_id', hospital_id)
                dept_loader.add_value('dept_person_num', dept_person_num)
                dept_loader.add_value('dept_url',
                                      urljoin(self.host, dept_link))
                dept_loader.add_value('gmt_created', now_time())
                dept_loader.add_value('gmt_modified', now_time())
                yield dept_loader.load_item()
                # Doctors of the department.
                if dept_link and dept_person_num:
                    self.headers['Referer'] = response.url
                    yield Request(urljoin(self.host, dept_link),
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  dont_filter=True,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'dept_name': dept_name,
                                  })
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_area(self, response):
    """Parse one area's (province/city) hospital-list page.

    Yields:
        - Request objects to each hospital's final detail page
          (handled by ``parse_hospital_detail``), and
        - for hospitals *without* a final page, the info / department /
          alias items scraped directly from the list entry:
          HospitalInfoTestItem, HospitalDepItem, HospitalAliasItem.
    """
    hospital_city = response.meta.get('area_city', '默认城市')
    self.logger.info('>>>>>>正在抓取[{}]医院列表……>>>>>>'.format(hospital_city))
    # Derive province / city / county from the breadcrumb navigation.
    municipality = ['北京市', '上海市', '重庆市', '天津市']
    pro_or_city = response.xpath(
        '//table[@class="nav"]/tr/'
        'td/a[3]/text()').extract_first('').replace('医院列表', '')
    if pro_or_city:
        if pro_or_city.strip() in municipality:
            # Municipality: breadcrumb carries city and district only.
            hos_prov = ''
            hos_city = pro_or_city
            hos_county = response.xpath('//h1[@id="firstHeading"]/text()'
                                        ).extract_first('').replace(
                                            hos_city, '')
        else:
            # Regular province: breadcrumb carries province / city /
            # county-or-district.
            hos_prov = pro_or_city
            hos_city = response.xpath('//h1[@id="firstHeading"]'
                                      '/text()').extract_first('').replace(
                                          '医院列表', '').replace(hos_prov, '')
            hos_county = ''
    else:
        hos_prov = hos_city = hos_county = None
    # Hospitals that have a final detail page of their own.
    # all_hospital_list = response.xpath('//div[@id="bodyContent"]/ul[3]/li/b/a/@href').extract()
    all_hospital_list2 = response.xpath(
        '//h2/span[contains(text(),"医院列表")]/'
        'following::ul[1]/li/b/a[not(contains(@href,"index"))]')
    special_hospital_list = response.xpath(
        '//h2/span[contains(text(),"医院列表")]/'
        'following::ul[1]/li/b/a[(contains(@href,"index"))]/ancestor::li[1]'
    )
    area_hos_cnt = len(all_hospital_list2) + len(special_hospital_list)
    self.logger.info('>>>>>>[{}]总共有{}家医院……>>>>>>'.format(
        hospital_city, str(area_hos_cnt)))
    self.total_hospital_cnt += area_hos_cnt
    # NOTE(review): the spider_closed signal is (re)connected on every
    # area page parsed — presumably the dispatcher de-duplicates
    # identical receiver/signal pairs; confirm, or connect once in
    # from_crawler/__init__ instead.
    self.crawler.signals.connect(self.output_statistics,
                                 signals.spider_closed)
    try:
        # Hospitals with a final detail page: just follow the link.
        for each_hospital in all_hospital_list2:
            hospital_name = each_hospital.xpath('text()').extract_first('')
            hospital_link = each_hospital.xpath('@href').extract_first('')
            self.headers['Referer'] = response.url
            yield Request(urljoin(self.host, hospital_link),
                          headers=self.headers,
                          callback=self.parse_hospital_detail,
                          meta={'hospital_name': hospital_name},
                          dont_filter=True)
        # Hospitals without a final detail page: scrape the list entry
        # itself.
        for each_special_hospital in special_hospital_list:
            hospital_name = each_special_hospital.xpath(
                'b/a/text()').extract_first('')
            hospital_url = each_special_hospital.xpath(
                'b/a/@href').extract_first('')
            hospital_address = each_special_hospital.xpath(
                'ul[1]/li/b[contains(text(),'
                '"医院地址")]/ancestor::li[1]/text()').extract_first('')
            # NOTE(review): once set here, hos_county sticks for all later
            # iterations of this loop (only falsy values are re-derived
            # from the address) — confirm that is intended.
            hos_county = hos_county if hos_county else get_county(
                hos_prov, hos_city, hospital_address)
            loader = CommonLoader2(item=HospitalInfoTestItem(),
                                   selector=each_special_hospital)
            loader.add_value('hospital_name', hospital_name)
            loader.add_xpath(
                'hospital_level',
                'ul[1]/li/b[contains(text(),"医院等级")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_xpath(
                'hospital_category',
                'ul[1]/li/b[contains(text(),"医院类型")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_xpath(
                'hospital_addr',
                'ul[1]/li/b[contains(text(),"医院地址")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_value('hospital_pro', hos_prov,
                             MapCompose(custom_remove_tags, match_special))
            loader.add_value('hospital_city', hos_city,
                             MapCompose(custom_remove_tags, match_special))
            loader.add_value('hospital_county', hos_county,
                             MapCompose(custom_remove_tags, match_special))
            loader.add_xpath(
                'hospital_phone',
                'ul[1]/li/b[contains(text(),"联系电话")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            # No introduction text is available on the list page.
            loader.add_value('hospital_intro', '')
            loader.add_xpath(
                'hospital_postcode',
                'ul[1]/li/b[contains(text(),"邮政编码")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_xpath(
                'hospital_email',
                'ul[1]/li/b[contains(text(),"电子邮箱")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_xpath(
                'hospital_website',
                'ul[1]/li/b[contains(text(),"医院网站")]/ancestor::li[1]/'
                'a[not(contains(@href,"http://www.a-hospital.com"))]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_xpath(
                'hospital_fax',
                'ul[1]/li/b[contains(text(),"传真号码")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_xpath(
                'operation_mode',
                'ul[1]/li/b[contains(text(),"经营方式")]/ancestor::li[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_value('hospital_url', urljoin(self.host,
                                                     hospital_url))
            loader.add_value('dataSource_from', '医学百科')
            loader.add_value('update_time', now_day())
            hospital_info_item = loader.load_item()
            yield hospital_info_item
            # Department info: the "重点科室" (key departments) cell is a
            # '、'-separated list of names.
            dept_info = each_special_hospital.xpath(
                'ul[1]/li/b[contains(text(),"重点科室")]/ancestor::li[1]')
            all_dept_info = match_special(
                dept_info.xpath('string(.)').extract_first(''))
            if all_dept_info:
                for each_dept in all_dept_info.split('、'):
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', each_dept,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value(
                        'hospital_name', hospital_name,
                        MapCompose(custom_remove_tags, match_special2))
                    dept_loader.add_value('update_time', now_day())
                    dept_item = dept_loader.load_item()
                    yield dept_item
            # Hospital alias info: aliases appear in parentheses after the
            # name, joined with '、'.
            hospital_name2 = each_special_hospital.xpath(
                'b/text()').extract_first('')
            if hospital_name2 and '(' in hospital_name2:
                alias_name = re.search(r'((.*?))',
                                       custom_remove_tags(hospital_name2))
                if alias_name:
                    for each_alias_name in alias_name.group(1).split('、'):
                        alias_loader = CommonLoader2(
                            item=HospitalAliasItem(), response=response)
                        alias_loader.add_value(
                            'hospital_name', hospital_name,
                            MapCompose(custom_remove_tags, match_special2))
                        alias_loader.add_value('hospital_alias_name',
                                               each_alias_name)
                        alias_loader.add_value('update_time', now_day())
                        alias_item = alias_loader.load_item()
                        yield alias_item
    except Exception as e:
        self.logger.error('抓取[{}]医院列表的时候出错了,原因是:{}'.format(
            hospital_city, repr(e)))
def parse_hospital_detail(self, response):
    """Parse a hospital's final detail page.

    Yields a HospitalInfoTestItem with the hospital's profile, one
    HospitalDepItem per key department, and one HospitalAliasItem per
    alias found in parentheses after the hospital name.

    Fix: the hospital name scraped from the page is now passed through
    ``re.escape`` before being interpolated into a regex — previously a
    name containing regex metacharacters could raise or silently
    mismatch.
    """
    hospital_name = response.meta.get('hospital_name', '默认医院')
    self.logger.info('>>>>>>正在抓取[{}]详细信息……>>>>>>'.format(hospital_name))
    # Derive province / city / county from the breadcrumb navigation.
    municipality = ['北京市', '上海市', '重庆市', '天津市']
    pro_or_city = response.xpath(
        '//table[@class="nav"]/tr/'
        'td/a[3]/text()').extract_first('').replace('医院列表', '')
    if pro_or_city:
        if pro_or_city.strip() in municipality:
            # Municipality: breadcrumb carries city and district only.
            hos_prov = ''
            hos_city = pro_or_city
            hos_county = response.xpath(
                '//table[@class="nav"]/tr/'
                'td/a[4]/text()').extract_first('').replace(hos_city, '')
        else:
            # Regular province: breadcrumb carries province / city /
            # county-or-district.
            hos_prov = pro_or_city
            hos_city = response.xpath(
                '//table[@class="nav"]/tr/td/'
                'a[4]/text()').extract_first('').replace('医院列表', '').replace(
                    hos_prov, '')
            hos_county = response.xpath(
                '//table[@class="nav"]/tr/'
                'td/a[5]/text()').extract_first('').replace(hos_city, '')
    else:
        hos_prov = hos_city = hos_county = None
    # Hospital overview: collect following <p> elements up to (not
    # including) the next section heading after "概况".
    hospital_intro = response.xpath(
        '//h2/span[contains(text(),"概况")]/ancestor::h2[1]/following::p')
    i = 0
    for each_hi in hospital_intro:
        i += 1
        next_tag = each_hi.xpath(
            'preceding::h2[1]/span[not(contains(text(),"概况"))]')
        if next_tag:
            # This <p> already belongs to the next section — truncate.
            i = i - 1
            hospital_intro = hospital_intro[:i].extract()
            break
    else:
        # No later section heading found: keep every paragraph.
        hospital_intro = hospital_intro.extract()
    # Hospital info item.  Prefer the breadcrumb name; fall back to the
    # page <title>.
    hospital_name1 = response.xpath(
        '//table[@class="nav"]/tr/td/strong/text()').extract_first('')
    hospital_name2 = response.xpath('//title/text()').extract_first('')
    hospital_name = hospital_name1 if hospital_name1 else hospital_name2
    loader = CommonLoader2(item=HospitalInfoTestItem(), response=response)
    loader.add_value('hospital_name', hospital_name,
                     MapCompose(custom_remove_tags, match_special2))
    loader.add_xpath(
        'hospital_level', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"医院等级")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_value('hospital_type', '')
    loader.add_xpath(
        'hospital_category', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"医院类型")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_xpath(
        'hospital_addr', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"医院地址")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_value('hospital_pro', hos_prov,
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_city', hos_city,
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_county', hos_county,
                     MapCompose(custom_remove_tags, match_special))
    loader.add_xpath(
        'hospital_phone', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"联系电话")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_value('hospital_intro', hospital_intro,
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_xpath(
        'hospital_postcode', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"邮政编码")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_xpath(
        'hospital_email', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"电子邮箱")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_xpath(
        'hospital_website', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"医院网站")]/ancestor::li[1]/'
        'a[not(contains(@href,"http://www.a-hospital.com"))]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_xpath(
        'hospital_fax', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"传真号码")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_xpath(
        'operation_mode', '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"经营方式")]/ancestor::li[1]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    loader.add_value('hospital_url', response.url)
    loader.add_value('dataSource_from', '医学百科')
    loader.add_value('update_time', now_day())
    hospital_info_item = loader.load_item()
    yield hospital_info_item
    # Department info: "重点科室" is a '、'-separated list of names.
    dept_info = response.xpath(
        '//div[@id="bodyContent"]/ul[1]/li/'
        'b[contains(text(),"重点科室")]/ancestor::li[1]')
    all_dept_info = match_special(
        dept_info.xpath('string(.)').extract_first(''))
    if all_dept_info:
        for each_dept in all_dept_info.split('、'):
            dept_loader = CommonLoader2(item=HospitalDepItem(),
                                        response=response)
            dept_loader.add_value('dept_name', each_dept,
                                  MapCompose(custom_remove_tags))
            dept_loader.add_value(
                'hospital_name', hospital_name,
                MapCompose(custom_remove_tags, match_special2))
            dept_loader.add_value('update_time', now_day())
            dept_item = dept_loader.load_item()
            yield dept_item
    # Hospital alias info: aliases appear in parentheses after the
    # breadcrumb name, joined with '、'.
    hospital_name = response.xpath(
        '//div[@id="bodyContent"]/p[1]/b/text()').extract_first('')
    hospital_name2 = response.xpath(
        '//table[@class="nav"]/tr/td/strong/text()').extract_first('')
    if hospital_name and '(' in hospital_name:
        try:
            # Escape the scraped name before embedding it in the pattern:
            # hospital names may contain regex metacharacters.
            alias_name = re.search(
                r'^{}((.*?))$'.format(re.escape(hospital_name2)),
                hospital_name)
            if alias_name:
                for each_alias_name in alias_name.group(1).split('、'):
                    alias_loader = CommonLoader2(item=HospitalAliasItem(),
                                                 response=response)
                    alias_loader.add_value(
                        'hospital_name', hospital_name,
                        MapCompose(custom_remove_tags, match_special2))
                    alias_loader.add_value('hospital_alias_name',
                                           each_alias_name)
                    alias_loader.add_value('update_time', now_day())
                    alias_item = alias_loader.load_item()
                    yield alias_item
        except Exception as e:
            self.logger.error('抓取[{}]别名的时候出错了,原因是:{}'.format(
                hospital_name, repr(e)))
def parse_doctor_website(self, response):
    """Parse a doctor's personal homepage.

    Yields a DoctorInfoItem for the doctor, a Request to the hospital's
    detail page (deduplicated through ``self.crawled_ids``), and a
    HospitalDepItem for the department (deduplicated through
    ``self.crawled_dept``).

    Fix: the ID-extracting regexes now escape the dot in ``.shtml``
    (``r'\.shtml'``) — previously the unescaped ``.`` matched any
    character, e.g. ``/sc/x_shtml`` would also match.
    """
    self.logger.info('>>>>>>正在抓取医生个人主页相关信息……')
    # Doctor info.
    loader = YiHuLoader(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name', '//span[@class="c-f22 c-333"]/text()')
    loader.add_xpath('dept_name',
                     '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()')
    loader.add_xpath('hospital_name',
                     '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
    loader.add_xpath('doctor_level',
                     '//div[@class="doctor-info"]/dl/dd[1]/text()')
    loader.add_xpath(
        'doctor_intro',
        '//table[@class="pop-myinfo-tb"]/tr[@class="last"]/td/p/text()')
    loader.add_xpath('doctor_goodAt',
                     '//table[@class="pop-myinfo-tb"]/tr[5]/td/text()')
    loader.add_value('update_time', now_day())
    doctor_item = loader.load_item()
    yield doctor_item
    # Links to the doctor's hospital and department.
    hos_link = response.xpath(
        '//div[@class="doctor-info"]/dl/dd[2]/a[1]/@href').extract_first(
            '')
    dept_link = response.xpath(
        '//div[@class="doctor-info"]/dl/dd[2]/a[2]/@href').extract_first(
            '')
    # Crawl hospital details (once per hospital id).
    if hos_link:
        hos_id = re.search(r'/sc/(.*?)\.shtml', hos_link)
        if hos_id and hos_id.group(1) not in self.crawled_ids:
            self.crawled_ids.add(hos_id.group(1))
            # The /sc/ page has sibling /detail/ and /contact/ pages.
            hos_intro_link = re.sub(r'/sc/', '/detail/', hos_link)
            hos_con_link = re.sub(r'/sc/', '/contact/', hos_link)
            hos_loader = YiHuLoader(item=HospitalInfoItem(),
                                    response=response)
            hos_loader.add_xpath(
                'hospital_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
            hospital_detail_request = Request(
                hos_intro_link,
                headers=self.headers,
                callback=self.parse_hospital_detail,
                meta={
                    'loader': hos_loader,
                    'contact_hos_link': hos_con_link
                })
            # NOTE(review): this sets 'Referer' in request.meta, not in
            # the request headers — confirm the callback actually reads
            # it from meta; otherwise it likely belongs in self.headers.
            hospital_detail_request.meta['Referer'] = response.url
            yield hospital_detail_request
    # Store department info (once per department id).
    if dept_link:
        dept_link_id = re.search(r'/arrange/(.*?)\.shtml', dept_link)
        if dept_link_id and dept_link_id.group(1) not in self.crawled_dept:
            self.crawled_dept.add(dept_link_id.group(1))
            dept_loader = YiHuLoader(item=HospitalDepItem(),
                                     response=response)
            dept_loader.add_xpath(
                'dept_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[2]/text()')
            dept_loader.add_xpath(
                'hospital_name',
                '//div[@class="doctor-info"]/dl/dd[2]/a[1]/text()')
            dept_loader.add_value('update_time', now_day())
            dept_item = dept_loader.load_item()
            yield dept_item
def parse_hospital_info(self, response):
    """Parse a hospital's overview page.

    Builds a HospitalInfoItem loader (emitted indirectly via the
    introduction-page request) and yields one HospitalDepItem per
    department listed on the page.  Any exception is caught and logged.
    """
    self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
    try:
        # Hospital info.
        # Level and district are embedded in one text blob of the form
        # "…:<level>区域:<county>分类…", split on the fullwidth colon.
        hospital_info = response.xpath(
            '//p[@class="yygh_box_top_p2"]').extract()
        hospital_info2 = custom_remove_tags(
            remove_tags(''.join(hospital_info)))
        hospital_level = hospital_info2.split(':')[1].replace('区域', '')
        hospital_county = hospital_info2.split(':')[2].replace('分类', '')
        if hospital_level:
            # Split the level string into grade (h_l) and category (h_c),
            # e.g. "三级甲等综合医院" -> "三级甲等" + "综合医院".
            res = re.search(r'(.*等|.*级|.*合格|.*甲)(.*?)$', hospital_level)
            if res:
                h_l = res.group(1)
                h_c = res.group(2)
                if h_c:
                    # Normalize the category: strip filler words, then
                    # re-append "医院".
                    h_c_2 = re.sub(r'合格|医院', '', h_c)
                    if h_c_2:
                        h_c = '{0}{1}'.format(h_c_2, '医院')
            else:
                h_l = h_c = None
        else:
            h_l = h_c = None
        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath('hospital_name',
                         '//p[@class="yygh_box_top_p"]/strong/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_level', h_l,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_category', h_c)
        loader.add_xpath(
            'hospital_addr',
            '//span[@class="yygh_box_con_dl_span1"]/ancestor::dl[1]/dd[1]/p/text()',
            MapCompose(custom_remove_tags))
        # This spider only covers Beijing, so province is blank and the
        # city is hard-coded.
        loader.add_value('hospital_pro', '')
        loader.add_value('hospital_city', '北京市')
        loader.add_value('hospital_county', hospital_county,
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'hospital_phone',
            '//span[@class="yygh_box_con_dl_span3"]/ancestor::dl[1]/dd[1]/p/text()',
            MapCompose(custom_remove_tags))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('hospital_url', response.url)
        loader.add_value('update_time', now_day())
        # Hospital introduction: the partially-filled loader travels via
        # request.meta to parse_hospital_detail_info.
        # NOTE(review): if no "医院介绍" link exists, the loader's item is
        # never yielded at all — confirm that dropping such hospitals is
        # intended.
        hospital_intro_link = response.xpath(
            '//a[contains(text(),"医院介绍")]/@href').extract_first('')
        if hospital_intro_link:
            hospital_intro_link = urljoin(self.host, hospital_intro_link)
            self.headers['Referer'] = response.url
            yield Request(hospital_intro_link,
                          headers=self.headers,
                          callback=self.parse_hospital_detail_info,
                          meta={'loader': loader})
        # Department info: one box per department type, each holding a
        # list of department names.
        all_dept_links = response.xpath('//div[@class="kfyuks_yyksbox"]')
        for each_dept_link in all_dept_links:
            dept_type = each_dept_link.xpath(
                'div[1]/text()').extract_first('')
            dept_info = each_dept_link.xpath(
                'div[2]/div/ul/li/a/text()').extract()
            for each_dept_info in dept_info:
                dept_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                dept_loader.add_value('dept_name', each_dept_info,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_type', dept_type,
                                      MapCompose(custom_remove_tags))
                dept_loader.add_xpath(
                    'hospital_name',
                    '//p[@class="yygh_box_top_p"]/strong/text()',
                    MapCompose(custom_remove_tags))
                dept_loader.add_value('dept_info', '')
                dept_loader.add_value('dataSource_from',
                                      self.data_source_from)
                dept_loader.add_value('update_time', now_day())
                dept_item = dept_loader.load_item()
                yield dept_item
    except Exception as e:
        self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))