def parse_doctor_info_detail(self, response):
    """Yield a doctor profile item, then one schedule item per dated slot.

    Doctor name, department, level and hospital name are taken from
    ``response.meta``; the introduction, the registration fee and the
    visiting dates are parsed out of the page. Any exception raised while
    parsing is logged and ends the generator.
    """
    self.logger.info('>>>>>>正在抓取医生详细信息>>>>>>')
    try:
        doctor_name = response.meta.get('doctor_name')
        dept_name = response.meta.get('dept_name')
        doctor_level = response.meta.get('doctor_level')
        hospital_name = response.meta.get('hospital_name')

        # The fee is read from the title attribute of the first booking link.
        diagnosis_amt = None
        link_titles = response.xpath(
            '//td/span[@class="doc_yuyue_time"]/a/@title').extract()
        if link_titles:
            fee_match = re.search(r'.*挂号费:(.*?)$', link_titles[0], S)
            if fee_match:
                diagnosis_amt = fee_match.group(1)

        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_value('doctor_name', doctor_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('dept_name', dept_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', hospital_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('doctor_level', doctor_level,
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('doctor_intro', '//div[@class="zrys"]/dl/dd',
                         MapCompose(remove_tags, custom_remove_tags, clean_info2))
        loader.add_value('diagnosis_amt', diagnosis_amt)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        yield loader.load_item()

        # Schedule: one item per booking span that carries a visiting date.
        booking_spans = response.xpath(
            '//td/span[@class="doc_yuyue_time"]').extract()
        for each_reg_info in booking_spans:
            date_match = re.search(r'.*出诊时间:(.*?)\n', each_reg_info, S)
            if not date_match:
                # Without a date the record would read "<year>-None"; skip it
                # instead of emitting a meaningless schedule entry.
                continue
            reg_info = '{0}-{1}'.format(now_year(), date_match.group(1))
            reg_info = reg_info.replace('月', '-').replace('日', '')

            reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                       response=response)
            reg_loader.add_value('doctor_name', doctor_name,
                                 MapCompose(custom_remove_tags))
            reg_loader.add_value('dept_name', dept_name,
                                 MapCompose(custom_remove_tags))
            reg_loader.add_xpath('hospital_name',
                                 '//div[@class="yy_til"]/h2/text()',
                                 MapCompose(custom_remove_tags))
            reg_loader.add_value('reg_info', reg_info,
                                 MapCompose(custom_remove_tags))
            reg_loader.add_value('dataSource_from', self.data_source_from)
            reg_loader.add_value('crawled_url', response.url)
            reg_loader.add_value('update_time', now_day())
            yield reg_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Yield a doctor profile item, then one schedule item per marked slot.

    All profile fields are read from the ``viewexpert_demo`` /
    ``viewexpert_detail`` blocks of the page. The schedule table is walked
    row by row; a cell containing an ``<img>`` is treated as a bookable
    slot and produces one item whose ``reg_info`` is the weekday followed
    by the text of the row's first cell.
    """
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name',
                     '//div[@class="viewexpert_demo"]/p[1]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('dept_name',
                     '//div[@class="viewexpert_demo"]/p[3]/text()',
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath(
        'doctor_level', '//div[@class="viewexpert_demo"]/p[2]/text()',
        MapCompose(custom_remove_tags, match_special, match_special2))
    loader.add_xpath('doctor_intro', '//div[@class="viewexpert_detail"]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_xpath('doctor_goodAt',
                     '//div[@class="viewexpert_demo"]/p[4]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()

    # Doctor schedule.
    reg_tr_list = response.xpath(
        '//div[@class="viewexpert_detail"]/table/tr[position()>1]')
    is_has_reg = response.xpath(
        '//div[@class="viewexpert_detail"]/table/tr[position()>1]/td/img')
    reg_date = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日']
    if not is_has_reg:
        return
    for each_td in reg_tr_list:
        reg_time = each_td.xpath('td[1]/text()').extract_first('')
        day_cells = each_td.xpath('td[position()>1]')
        # zip pairs each cell with its weekday and simply ignores any cell
        # beyond Sunday, where a hand-kept index would raise IndexError.
        for reg_info_date, each_reg_info in zip(reg_date, day_cells):
            if not each_reg_info.xpath('img'):
                continue
            reg_info = '{0}{1}'.format(reg_info_date, reg_time)
            reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                       response=response)
            reg_loader.add_xpath(
                'doctor_name',
                '//div[@class="viewexpert_demo"]/p[1]/text()',
                MapCompose(custom_remove_tags))
            reg_loader.add_xpath(
                'dept_name',
                '//div[@class="viewexpert_demo"]/p[3]/text()',
                MapCompose(custom_remove_tags, match_special))
            reg_loader.add_value('hospital_name', self.hospital_name)
            reg_loader.add_value('reg_info', reg_info)
            reg_loader.add_value('update_time', now_day())
            yield reg_loader.load_item()
def parse_doctor_info(self, response):
    """Yield doctor items parsed from the two schedule tables on the page.

    Table 1 rows carry (department, doctor, level); a row with only two
    usable cells reuses the department of the last three-cell row. Table 2
    rows carry (doctor, level, department). Rows containing a cell equal to
    '名医堂' have no level column. Any exception is logged and ends the
    generator.
    """
    self.logger.info('>>>>>>正在抓取{}:医生信息>>>>>>'.format(self.hospital_name))
    tr_res1 = response.xpath('//div[@class="text"]/table[1]/tbody/tr[position()>9]')
    tr_res2 = response.xpath('//div[@class="text"]/table[2]/tbody/tr[position()>1]')
    dept_name = ''
    try:
        # Doctors listed in the first table.
        for each_res in tr_res1:
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('hospital_name', self.hospital_name)
            # Extract the text once: membership must be tested against the
            # strings, not against the Selector objects (which never equal
            # a str, so the check would always be False).
            cells = each_res.xpath('td/span[not(contains(text(),"全天")) '
                                   'and not(contains(text(),"上午")) '
                                   'and not(contains(text(),"下午")) '
                                   'and not(contains(text(),"\u3000"))]/text()').extract()
            length_of_list = len(cells)
            if '名医堂' in cells and length_of_list >= 2:
                loader.add_value('dept_name', cells[0])
                loader.add_value('doctor_name', cells[1])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
            elif length_of_list == 3:
                # Remember the department for following two-cell rows.
                dept_name = cells[0].replace(' ', '')
                loader.add_value('dept_name', dept_name)
                loader.add_value('doctor_name', cells[1])
                loader.add_value('doctor_level', cells[2])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
            elif length_of_list == 2:
                loader.add_value('dept_name', dept_name)
                loader.add_value('doctor_name', cells[0])
                loader.add_value('doctor_level', cells[1])
                loader.add_value('update_time', now_day())
                yield loader.load_item()

        # Doctors listed in the second table.
        for each_res in tr_res2:
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('hospital_name', self.hospital_name)
            cells = each_res.xpath('td[position()<4]/span/text()').extract()
            length_of_list = len(cells)
            if '名医堂' in cells and length_of_list >= 2:
                loader.add_value('dept_name', cells[1])
                loader.add_value('doctor_name', cells[0])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
            elif length_of_list == 3:
                loader.add_value('dept_name', cells[2])
                loader.add_value('doctor_name', cells[0])
                loader.add_value('doctor_level', cells[1])
                loader.add_value('update_time', now_day())
                yield loader.load_item()
    except Exception as e:
        self.logger.error('>>>>>>抓取过程中出错了,原因是:{}>>>>>>'.format(repr(e)))
def parse_area_detail(self, response):
    """Yield one regional-ranking item per entry of the JSON ``rows`` list.

    The ``subject_name`` passed through ``response.meta`` is stored in the
    ``hospital_pro`` field of every item.
    """
    self.logger.info('>>>>>>正在抓取地区排行详细信息……>>>>>>')
    subject_name = response.meta.get('subject_name')
    payload = json.loads(response.text)
    for row in payload.get('rows', []):
        ranking_loader = CommonLoader2(item=AreaRankingItem(),
                                       response=response)
        row_fields = (
            ('subject', row.get('GB_NAME')),
            ('hospital_pro', subject_name),
            ('ranking', row.get('SHOW_RANK')),
            ('hospital_name', row.get('HOSPNAME')),
        )
        for field_name, field_value in row_fields:
            ranking_loader.add_value(field_name, field_value)
        ranking_loader.add_value('create_time', now_day())
        ranking_loader.add_value('update_time', now_day())
        yield ranking_loader.load_item()
def parse_doctor_info_detail(self, response):
    """Yield a doctor profile item.

    Name, department, level and hospital come from ``response.meta``; the
    introduction and specialty are scraped from the ``info-main`` block.
    """
    meta = response.meta
    hospital_name = meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
    dept_name = meta.get('dept_name')
    doctor_level = meta.get('doctor_level')
    doctor_name = meta.get('doctor_name')

    doctor_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    doctor_loader.add_value('doctor_name', doctor_name,
                            MapCompose(custom_remove_tags))
    doctor_loader.add_value('dept_name', dept_name)
    doctor_loader.add_value('hospital_name', hospital_name,
                            MapCompose(custom_remove_tags))
    doctor_loader.add_value('doctor_level', doctor_level)
    # Introduction and specialty live in sibling divs and share one cleaner chain.
    scraped_fields = (
        ('doctor_intro', '//div[@class="info-main"]/div[3]/span'),
        ('doctor_goodAt', '//div[@class="info-main"]/div[4]/span'),
    )
    for field_name, query in scraped_fields:
        doctor_loader.add_xpath(
            field_name, query,
            MapCompose(remove_tags, custom_remove_tags, match_special))
    doctor_loader.add_value('dataSource_from', self.data_source_from)
    doctor_loader.add_value('update_time', now_day())
    yield doctor_loader.load_item()
def parse(self, response):
    """Yield the hospital profile item, then request the department list.

    Most fields are fixed facts about the hospital; only the introduction
    is scraped from the page.
    """
    self.logger.info('正在抓取{}:医院信息'.format(self.hospital_name))
    info_loader = MedicalMapLoader(item=HospitalInfoItem(), response=response)

    fixed_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '普通门诊上午_8:00-12:00;普通门诊下午13:00-16:30'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '综合医院'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', '金堂县'),
        ('hospital_phone',
         '医院服务电话_028-84902884;服务质量监督投诉电话_028-84932532;'
         '急诊急救电话_18181938532;产科急救电话_18181938532;'
         '医保结算电话_028-84932721;'
         '预约挂号电话_028-84931443;预约挂号电话_028-84902884;预约挂号电话_028-61568616'),
    )
    for field_name, field_value in fixed_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath(
        'hospital_intro',
        '//div[@class="baseRight-intro"]/p[position()<7]/span/text()')

    # Fields with no known value are still submitted as empty strings.
    for blank_field in ('is_medicare', 'medicare_type', 'vaccine_name',
                        'is_cpc', 'is_bdc', 'cooperative_business',
                        'hospital_district'):
        info_loader.add_value(blank_field, '')
    info_loader.add_value('registered_channel', '微信公众号_' + self.hospital_name)
    info_loader.add_value('dataSource_from', '官网:http://www.jintangyy.com/index.aspx')
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()

    # NOTE(review): the referer is stored in request.meta, not in the headers
    # — confirm that is intended.
    yield Request(self.dep_link, callback=self.parse_hospital_dep,
                  meta={'Referer': response.url})
def parse_doctor_reg_info(self, response):
    """Yield one schedule item per duty slot in the JSON response.

    The payload is expected to hold ``data.selWork`` (a list of days, each
    with ``dutydate`` and a ``selWorks`` list) and ``data.doctor`` (whose
    first entry supplies doctor, hospital and department names). ``reg_info``
    is the duty date followed by the period label.

    Raises:
        KeyError / IndexError / ValueError: if the payload lacks the expected
            keys or ``dutytime`` is not an integer-like value.
    """
    self.logger.info('>>>>>>正在抓取医生排班信息……')
    doctor_reg_info = json.loads(response.text)
    reg_info_list = doctor_reg_info['data']['selWork']
    doctor = doctor_reg_info['data']['doctor'][0]
    doctor_name = doctor.get('doctorName', '')
    hospital_name = doctor.get('hospitalName', '')
    dept_name = doctor.get('deptName', '')

    # Code 1 is morning and 3 is afternoon; the original mapped both to
    # '上午'. Every other code (e.g. 4, seen for doctorId 3329) is treated as
    # evening, as before.
    period_labels = {1: '上午', 3: '下午'}

    for each_reg_info in reg_info_list:
        duty_date = each_reg_info['dutydate']
        for each_work_info in each_reg_info['selWorks']:
            duty_time = period_labels.get(int(each_work_info['dutytime']), '晚上')
            reg_info = '{0}{1}'.format(duty_date, duty_time)
            loader = CommonLoader(item=DoctorRegInfoItem(), response=response)
            loader.add_value('doctor_name', doctor_name)
            loader.add_value('hospital_name', hospital_name)
            loader.add_value('dept_name', dept_name)
            loader.add_value('reg_info', reg_info)
            loader.add_value('update_time', now_day())
            yield loader.load_item()
def parse_doctor_info_detail(self, response):
    """Complete the loader handed over in ``response.meta`` and yield its item.

    Department and level prefer the values passed through meta and fall
    back to what the detail page shows; the introduction always comes from
    the page.
    """
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    loader = response.meta['loader']
    detail_root = '//div[@class="FrontProducts_detail02-1482202997396_htmlbreak"]'

    meta_dept = custom_remove_tags(''.join(response.meta['dept_name']))
    page_level = response.xpath(
        detail_root + '/p[1]/strong/text()').extract_first('')
    meta_level = response.meta['doctor_level']
    # Breadcrumb text is normalised so it reads as a department name.
    crumb_text = response.xpath(
        '//div[@id="FrontPublic_breadCrumb01-1482202386120"]/div/'
        'a[last()]/text()').extract_first('')
    page_dept = crumb_text.replace('专家', '').replace('类', '科')

    if meta_dept:
        dept_name = re.sub(r'中医医师|中西医医师', '中医科', meta_dept)
    else:
        dept_name = page_dept

    if meta_level:
        doctor_level = custom_remove_tags(''.join(meta_level))
    else:
        doctor_level = page_level

    doctor_intro = response.xpath(detail_root + '/p[2]').extract_first('')

    loader.add_value('dept_name', dept_name,
                     MapCompose(custom_remove_tags, filter_info3))
    loader.add_value('doctor_level', doctor_level,
                     MapCompose(filter_info4, custom_remove_tags))
    loader.add_value('doctor_intro', doctor_intro,
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
def parse(self, response):
    """Yield the hospital profile item.

    Every field except the introduction is a fixed fact about the hospital;
    the introduction is scraped from the article body of the page.
    """
    self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

    fixed_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '门诊上午_8:00-12:00;门诊下午_14:00-17:30'),
        ('hospital_level', '二级甲等'),
        ('hospital_type', '公立'),
        ('hospital_category', '综合医院'),
        ('hospital_addr', '成都市东三环龙泉驿区十陵街道江华路8号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', '龙泉驿区'),
        ('hospital_phone',
         '急救电话_028-84615120;电话咨询_028-84604546转科室;'
         '24小时医护热线_028-84615789'),
    )
    for field_name, field_value in fixed_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath('hospital_intro',
                          '//article[@class="content"]/div',
                          MapCompose(remove_tags, custom_remove_tags))

    fixed_after_intro = (
        ('is_medicare', '是'),
        ('medicare_type', '成都市医保、工伤保险定点医院'),
        ('registered_channel', '官网或官方微信公众号(工作日),法定节假日电话预约'),
        ('dataSource_from', '医院官网'),
    )
    for field_name, field_value in fixed_after_intro:
        info_loader.add_value(field_name, field_value)
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()
def parse_doctor_reg_info(self, response):
    """Yield one schedule item for every table cell that contains an image.

    Each row's first cell supplies the leading text of ``reg_info``; the
    remaining cells are labelled, in order, morning / afternoon / evening
    shift. Doctor and department names come from ``response.meta``.
    """
    self.logger.info('>>>>>>正在抓取{}:医生排班信息>>>>>>'.format(
        self.hospital_name))
    doctor_name = response.meta['doctor_name']
    dept_name = response.meta['dept_name']
    schedule_rows = response.xpath('//table/tr[position()>1]')
    any_slot_marked = response.xpath('//table/tr[position()>1]/td/img')
    shift_labels = ['上午', '下午', '晚班']

    # Nothing to do when no cell in the table is marked.
    if not any_slot_marked:
        return

    for row in schedule_rows:
        row_label = row.xpath('td[1]/text()').extract_first('')
        shift_cells = row.xpath('td[position()>1]')
        for position, cell in enumerate(shift_cells):
            shift = shift_labels[position]
            if not cell.xpath('img'):
                continue
            slot_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                        response=response)
            slot_loader.add_value('doctor_name', doctor_name)
            slot_loader.add_value(
                'dept_name', dept_name,
                MapCompose(custom_remove_tags, match_special))
            slot_loader.add_value('hospital_name', self.hospital_name)
            slot_loader.add_value('reg_info',
                                  '{0}{1}'.format(row_label, shift))
            slot_loader.add_value('update_time', now_day())
            yield slot_loader.load_item()
def parse_doctor_info_detail(self, response):
    """Yield a doctor profile item and, if possible, a schedule request.

    Name, department and level come from ``response.meta``; introduction
    and specialty are both read from the ``about-right-b`` block. When the
    page URL carries a query string, it is reused (with ``&id`` rewritten
    to ``&kid``) to request the doctor's schedule from the AJAX endpoint.

    Raises:
        KeyError: if ``doctor_name``, ``dept_name`` or ``doctor_level`` is
            missing from ``response.meta``.
    """
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    doctor_name = response.meta['doctor_name']
    dept_name = response.meta['dept_name']
    doctor_level = response.meta['doctor_level']

    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_value('doctor_name', doctor_name)
    loader.add_value('dept_name', dept_name,
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_value('doctor_level', doctor_level,
                     MapCompose(custom_remove_tags, match_special))
    loader.add_xpath('doctor_intro', '//div[@id="about-right-b"]/p',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_xpath('doctor_goodAt', '//div[@id="about-right-b"]/p',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()

    # Doctor schedule.
    params = re.search(r'.*\?(.*?)$', response.url)
    reg_url = 'http://www.scpz120.com/ajax/Doctor.aspx?'
    if params:
        reg_link = '{0}{1}'.format(reg_url,
                                   params.group(1).replace('&id', '&kid'))
        # Request copies the headers when it is constructed, so the Referer
        # has to be in place before the request is built.
        self.headers['Referer'] = response.url
        yield Request(reg_link,
                      headers=self.headers,
                      callback=self.parse_doctor_reg_info,
                      meta={
                          'doctor_name': doctor_name,
                          'dept_name': dept_name
                      })
def parse(self, response):
    """Yield the hospital profile item.

    All fields are fixed facts except the introduction, which is scraped
    from the ``info_txt`` block.
    """
    self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

    fixed_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', ''),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '妇幼保健院'),
        ('hospital_addr', '成都市温江区万春路140'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', ''),
        ('hospital_phone',
         '24小时急救电话_028-82723131;咨询电话_围产期保健_028-82715727;'
         '咨询电话_妇科门诊_028-82711383;咨询电话_儿童保健_028-82711527;'
         '咨询电话_婚检科_028-82720337;'
         '投诉电话_028-82724901(上班时间);投诉电话_13688488598(下班时间)'),
    )
    for field_name, field_value in fixed_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath('hospital_intro', '//div[@id="info_txt"]',
                          MapCompose(remove_tags, custom_remove_tags))

    info_loader.add_value('registered_channel', '电话预约;挂号窗口;医院微信公众号')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Yield the department item for this page and fan out to sibling pages.

    The sidebar links to the other departments are followed only on the
    first call (tracked with ``self.dept_crawled_cnt``), so the fan-out
    happens once.
    """
    self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>'.format(
        self.hospital_name))
    heading_query = '//div[@class="title"]/h3/text()'
    dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
    # The page heading feeds both the department type and the department name.
    dept_loader.add_xpath('dept_type', heading_query,
                          MapCompose(custom_remove_tags))
    dept_loader.add_xpath('dept_name', heading_query,
                          MapCompose(custom_remove_tags))
    dept_loader.add_value('hospital_name', self.hospital_name)
    dept_loader.add_xpath('dept_info', '//div[@class="content"]',
                          MapCompose(remove_tags, custom_remove_tags))
    dept_loader.add_value('update_time', now_day())
    yield dept_loader.load_item()

    # Other departments.
    self.logger.info('>>>>>>正在抓取{}:科室信息>>>>>>'.format(self.hospital_name))
    sibling_links = response.xpath(
        '//ul[@class="list2"]/li[position()>1]/a/@href').extract()
    self.dept_crawled_cnt += 1
    if not (sibling_links and self.dept_crawled_cnt == 1):
        return
    for link in sibling_links:
        # NOTE(review): the referer is stored in request.meta, not in the
        # headers — confirm that is intended.
        yield Request(urljoin(self.host, link),
                      headers=self.headers,
                      callback=self.parse_hospital_dep_detail,
                      dont_filter=True,
                      meta={'Referer': response.url})
def parse_doctor_info_detail(self, response):
    """Yield a doctor profile item; log and stop on any parsing error.

    Identity fields come from ``response.meta``. The specialty prefers the
    expanded ``intro_more`` block (minus its '[关闭]' marker) and falls back
    to the short ``<dd>`` text.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
    try:
        doctor_name = response.meta.get('doctor_name')
        dept_name = response.meta.get('dept_name')
        doctor_level = response.meta.get('doctor_level')

        expanded_html = ''.join(
            response.xpath('//div[@class="intro_more"]').extract())
        expanded_text = remove_tags(expanded_html)
        short_text = response.xpath(
            '//dd[contains(text(),"擅长领域")]/text()').extract_first('')
        if expanded_text:
            doctor_good_at = expanded_text.replace('[关闭]', '')
        else:
            doctor_good_at = short_text

        doctor_loader = CommonLoader2(item=DoctorInfoItem(),
                                      response=response)
        meta_fields = (
            ('doctor_name', doctor_name),
            ('dept_name', dept_name),
            ('hospital_name', hospital_name),
            ('doctor_level', doctor_level),
        )
        for field_name, field_value in meta_fields:
            doctor_loader.add_value(field_name, field_value,
                                    MapCompose(custom_remove_tags))
        doctor_loader.add_xpath('doctor_intro',
                                '//div[@class="hos-guide-box1"]',
                                MapCompose(remove_tags, custom_remove_tags))
        doctor_loader.add_value(
            'doctor_goodAt', doctor_good_at,
            MapCompose(custom_remove_tags, match_special, clean_info2))
        doctor_loader.add_value('dataSource_from', self.data_source_from)
        doctor_loader.add_value('crawled_url', response.url)
        doctor_loader.add_value('update_time', now_day())
        yield doctor_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_dep(self, response):
    """Yield one department item per entry of ``data.subDepList`` in the JSON.

    Hospital name and department type are passed through ``response.meta``.
    Any exception is logged and ends the generator.
    """
    meta = response.meta
    hospital_name = meta.get('hospital_name')
    dept_type = meta.get('dept_type')
    self.logger.info('>>>>>>正在抓取:[{}]医院-[{}]科室信息>>>>>>'.format(hospital_name, dept_type))
    try:
        payload = json.loads(response.text)
        for sub_dept in payload.get('data').get('subDepList'):
            dept_name = sub_dept.get('name')
            dept_id = sub_dept.get('id')
            item_loader = CommonLoader2(item=HospitalDepItem(),
                                        response=response)
            cleaned_fields = (
                ('dept_name', dept_name),
                ('dept_type', dept_type),
                ('hospital_name', hospital_name),
            )
            for field_name, field_value in cleaned_fields:
                item_loader.add_value(field_name, field_value,
                                      MapCompose(custom_remove_tags))
            item_loader.add_value('dataSource_from', self.data_source_from)
            item_loader.add_value('crawled_url', response.url)
            item_loader.add_value('update_time', now_day())
            item_loader.add_value('dept_id', dept_id)
            item_loader.add_value('dept_url', response.url)
            item_loader.add_value('gmt_created', now_time())
            item_loader.add_value('gmt_modified', now_time())
            yield item_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_hospital_dep(self, response):
    """Yield a department item per listed department and follow its link.

    The hospital name is read from the page heading. For every ``<li>`` in
    the department list an item is yielded (the link text is used for both
    ``dept_type`` and ``dept_name``), followed by a request to the
    department page when the entry has an ``href``.
    """
    self.logger.info('>>>>>>正在抓取科室信息……')
    hospital_name = response.xpath(
        '//div[@class="hos-info"]/h1/text()').extract_first('')
    all_dept_links = response.xpath('//dd[@class="ks-2"]/ul/li')
    self.logger.info('{}:共有{}个科室'.format(hospital_name, str(len(all_dept_links))))
    for each_dept_link in all_dept_links:
        # Department item.
        dep_loader = YiHuLoader(item=HospitalDepItem(), selector=each_dept_link)
        dep_loader.add_xpath('dept_type', 'a/text()')
        dep_loader.add_xpath('dept_name', 'a/text()')
        dep_loader.add_value('hospital_name', hospital_name)
        dep_loader.add_value('update_time', now_day())
        yield dep_loader.load_item()

        # Follow the department page to reach its doctors.
        dept_link = each_dept_link.xpath('a/@href').extract_first('')
        if dept_link:
            dept_link = urljoin(self.host, dept_link)
            # Request copies the headers when it is constructed, so the
            # Referer has to be set before the request is built.
            self.headers['Referer'] = response.url
            yield Request(dept_link,
                          headers=self.headers,
                          callback=self.parse_dept_link)
def parse_dept_info(self, response):
    """Follow second-level department links, or yield a bare department item.

    When the page lists sub-departments, each linked one is requested with
    the parent ``dep_type`` in meta. Otherwise the top-level department has
    no children and a single item carrying only its type is yielded.
    """
    dep_type = response.meta['dep_type']
    self.logger.info('正在抓取[{}]科室信息'.format(custom_remove_tags(dep_type)))
    sub_dept_blocks = response.xpath('//div[@class="pic"]')

    if not sub_dept_blocks:
        # Top-level department without second-level departments.
        dep_loader = PxfybjyLoader(item=HospitalDepItem(), response=response)
        dep_loader.add_value('dept_type', dep_type)
        dep_loader.add_value('hospital_name', self.hospital_name)
        dep_loader.add_value('update_time', now_day())
        yield dep_loader.load_item()
        return

    # Top-level department with second-level departments.
    for block in sub_dept_blocks:
        detail_href = block.xpath('a/@href').extract_first('')
        if not detail_href:
            continue
        # NOTE(review): the referer is stored in request.meta, not in the
        # headers — confirm that is intended.
        yield Request(urljoin(self.host, detail_href),
                      headers=self.headers,
                      callback=self.parse_dept_detail,
                      meta={'dep_type': dep_type,
                            'Referer': response.url})
def parse(self, response):
    """Yield the hospital profile item.

    All fields are fixed facts except the introduction, which is scraped
    from the ``about-right-b`` block.
    """
    self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

    fixed_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour',
         '急诊和临床住院科室_24小时值班;'
         '行政及其它_上午8:00~12:00,下午14:00~17:00'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '中医医院'),
        ('hospital_addr', '四川省彭州市天彭镇南大街396号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '彭州市'),
        ('hospital_county', ''),
        ('hospital_phone', '028-83701908'),
    )
    for field_name, field_value in fixed_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath(
        'hospital_intro', '//div[@id="about-right-b"]',
        MapCompose(remove_tags, custom_remove_tags, clean_info))

    info_loader.add_value('registered_channel', '官网')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()
def parse_hospital_dep(self, response):
    """Request the detail page of every department linked in the table.

    A partially filled loader (name, hospital, source, date) travels with
    each request in ``meta`` so the detail callback can finish the item.
    Entries lacking a name or a link are skipped. Any exception is logged
    and ends the generator.
    """
    hospital_name = response.meta.get('hospital_name')
    self.logger.info('>>>>>>正在抓取:[{}]科室信息>>>>>>'.format(hospital_name))
    try:
        anchors = response.xpath(
            '//div[@id="one_2"]/div/div/table/tbody/tr/td[@class="contentTd"]/a'
        )
        for anchor in anchors:
            dept_name = anchor.xpath('text()').extract_first('')
            detail_href = anchor.xpath('@href').extract_first('')

            partial_loader = CommonLoader2(item=HospitalDepItem(),
                                           response=response)
            partial_loader.add_value('dept_name', dept_name,
                                     MapCompose(custom_remove_tags))
            partial_loader.add_value('hospital_name', hospital_name,
                                     MapCompose(custom_remove_tags))
            partial_loader.add_value('dataSource_from', self.data_source_from)
            partial_loader.add_value('update_time', now_day())

            # Department detail page.
            if not (dept_name and detail_href):
                continue
            self.headers['Referer'] = response.url
            carried = {
                'dept_name': dept_name,
                'dept_loader': partial_loader,
                'hospital_name': hospital_name
            }
            yield Request(urljoin(self.host, detail_href),
                          headers=self.headers,
                          callback=self.parse_hospital_dep_detail,
                          meta=carried,
                          dont_filter=True)
    except Exception as e:
        self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
def parse_dept_info(self, response):
    """Yield a department item per JSON entry and request its doctor list.

    The payload is read at ``responseData.data.data``: ``depart`` is the
    department list and ``hospital.hospitalName`` names the hospital. For
    each department with a ``deptid`` a form request for the first 100
    doctors is issued.

    Raises:
        KeyError: if the payload lacks the expected keys.
    """
    self.logger.info('>>>>>>正在抓取医院科室相关信息……')
    dept_info = json.loads(response.text)
    payload = dept_info['responseData']['data']['data']
    for each_dept in payload['depart']:
        loader = CommonLoader(item=HospitalDepItem(), response=response)
        loader.add_value('dept_name', each_dept['deptname'])
        loader.add_value('hospital_name', payload['hospital']['hospitalName'])
        loader.add_value('update_time', now_day())
        yield loader.load_item()

        dept_id = each_dept.get('deptid', '')
        if dept_id:
            data = {
                'key': '',
                'deptId': str(dept_id),
                'pageIndex': '1',
                'pageSize': '100'
            }
            # The request copies the headers when it is constructed, so the
            # Referer has to be set before the request is built.
            self.headers['Referer'] = 'http://www.scgh114.com/web/register/doctor'
            yield FormRequest(self.doctor_link,
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              formdata=data,
                              meta={'dept_id': dept_id},
                              dont_filter=True)
def parse(self, response):
    """Yield a hospital item and a department-list request from the JSON list.

    Only the slice ``[3:4]`` of the decoded list is processed.
    NOTE(review): that slice looks like a debugging restriction — confirm
    whether every hospital should be handled.

    ``is_medicare`` is '是' when ``Ismedicalcard`` equals 1, otherwise '否'.
    """
    self.logger.info('>>>>>>正在抓取医院相关信息……')
    hospital_info = json.loads(response.text)
    for each_hospital in hospital_info[3:4]:
        is_medicare = '是' if str(each_hospital.get('Ismedicalcard', '')) == '1' else '否'
        loader = CommonLoader(item=HospitalInfoItem(), response=response)
        loader.add_value('hospital_name', each_hospital.get('hospitalname', ''))
        loader.add_value('hospital_level', each_hospital.get('levelName', ''))
        loader.add_value('hospital_addr', each_hospital.get('address', ''))
        loader.add_value('hospital_pro', '四川')
        loader.add_value('hospital_city', each_hospital.get('areaName', ''))
        loader.add_value('is_medicare', is_medicare)
        loader.add_value('dataSource_from', self.source_from)
        loader.add_value('update_time', now_day())
        yield loader.load_item()

        hospital_id = each_hospital.get('hospitalid')
        if hospital_id:
            # The request copies the headers when it is constructed, so the
            # Referer has to be set before the request is built.
            self.headers['Referer'] = 'http://www.scgh114.com/web/register/gh'
            yield FormRequest(self.dept_link,
                              headers=self.headers,
                              callback=self.parse_dept_info,
                              formdata={'hospitalId': str(hospital_id)},
                              dont_filter=True)
def parse(self, response):
    """Yield the hospital profile item.

    All fields are fixed facts except the introduction, which is scraped
    from the ``describe htmledit`` block.
    """
    self.logger.info('>>>>>>正在抓取{}:医院信息>>>>>>'.format(self.hospital_name))
    info_loader = CommonLoader2(item=HospitalInfoItem(), response=response)

    fixed_before_intro = (
        ('hospital_name', self.hospital_name),
        ('consulting_hour', '上午8:00—12:00;下午2:00—5:30'),
        ('hospital_level', '三级乙等'),
        ('hospital_type', '公立'),
        ('hospital_category', '妇幼保健院'),
        ('hospital_addr', '四川省成都市双流区东升街道涧槽中街396号'),
        ('hospital_pro', '四川省'),
        ('hospital_city', '成都市'),
        ('hospital_county', ''),
        ('hospital_phone',
         '母婴咨询热线_028-85884888(工作日);'
         '总值班电话_028-85808438;'
         '预约挂号电话_028-85801029(7:30-19:30)'),
    )
    for field_name, field_value in fixed_before_intro:
        info_loader.add_value(field_name, field_value)

    info_loader.add_xpath('hospital_intro',
                          '//div[@class="describe htmledit"]',
                          MapCompose(remove_tags, custom_remove_tags))

    info_loader.add_value('is_medicare', '是')
    info_loader.add_value('registered_channel',
                          '电话预约;自助挂号机;诊室预约;'
                          '医院微信公众号;健康双流;现场')
    info_loader.add_value('dataSource_from', '医院官网')
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()
def parse(self, response):
    """Yield one comprehensive-ranking item per entry of the JSON ``rows`` list."""
    self.logger.info('>>>>>>正在抓取综合排行信息……>>>>>>')
    payload = json.loads(response.text)
    # Item field -> key in each JSON row, in the order they are loaded.
    column_map = (
        ('hospital_pro', 'PROVINCE'),
        ('ranking', 'RANK'),
        ('hospital_name', 'HOSPNAME'),
        ('tech_investment', 'INPUT'),
        ('tech_output', 'OUTPUT'),
        ('academic_influence', 'INFLUENCE'),
        ('total_score', 'SUM'),
    )
    for row in payload.get('rows', []):
        ranking_loader = CommonLoader2(item=ComprehensiveRankingItem(),
                                       response=response)
        for field_name, source_key in column_map:
            ranking_loader.add_value(field_name, row.get(source_key))
        ranking_loader.add_value('create_time', now_day())
        ranking_loader.add_value('update_time', now_day())
        yield ranking_loader.load_item()
def parse_dep_detail(self, response):
    """Finish the department loader from ``response.meta`` and yield its item.

    The introduction paragraphs are added as the list of their raw HTML.
    """
    self.logger.info('正在抓取{}:科室详细信息'.format(self.hospital_name))
    dep_loader = response.meta['loader']
    intro_paragraphs = response.xpath(
        '//div[@class="baseRight-intro"]/p').extract()
    dep_loader.add_value('dept_info', intro_paragraphs)
    dep_loader.add_value('update_time', now_day())
    yield dep_loader.load_item()
def parse_doctor_detail(self, response):
    """Finish the doctor loader from ``response.meta`` with the JSON detail.

    ``data.extDetails`` becomes the introduction and ``data.extExperts`` the
    specialty; both default to an empty string when absent.
    """
    self.logger.info('>>>>>>正在抓取医生详细信息……')
    doctor_loader = response.meta['loader']
    detail = json.loads(response.text)
    intro = detail['data'].get('extDetails', '')
    doctor_loader.add_value('doctor_intro', intro)
    good_at = detail['data'].get('extExperts', '')
    doctor_loader.add_value('doctor_goodAt', good_at)
    doctor_loader.add_value('update_time', now_day())
    yield doctor_loader.load_item()
def parse_hospital_dep_detail(self, response):
    """Yield a department item for the name passed in ``response.meta``.

    The description is loaded from the ``fleft wd740`` block of the page.
    """
    dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
    dept_loader.add_value('dept_name', response.meta['dept_name'])
    dept_loader.add_value('hospital_name', self.hospital_name)
    dept_loader.add_xpath('dept_info', '//div[@class="fleft wd740"]')
    dept_loader.add_value('update_time', now_day())
    yield dept_loader.load_item()
def parse_hospital_info(self, response):
    """Yield the hospital item, then request every department page it links.

    The county is derived from the scraped address via ``get_county2``; the
    hospital level is passed in through ``response.meta``. Any exception is
    logged and ends the generator.
    """
    self.logger.info('>>>>>>正在抓取:医院信息>>>>>>')
    try:
        address_query = '//div[@class="yy_js clearfix"]/div/dl/dd[1]/text()'
        phone_query = '//div[@class="yy_js clearfix"]/div/dl/dd[2]/text()'

        # District or county, derived from the address when there is one.
        hospital_county = None
        hospital_address = response.xpath(address_query).extract_first('')
        if hospital_address:
            hospital_county = get_county2('中国|江苏省|江苏|南京市|南京',
                                          hospital_address)

        # Hospital profile.
        info_loader = CommonLoader2(item=HospitalInfoItem(),
                                    response=response)
        info_loader.add_xpath('hospital_name',
                              '//div[@class="yy_til"]/h2/text()',
                              MapCompose(custom_remove_tags))
        info_loader.add_value('hospital_level',
                              response.meta.get('hospital_level'),
                              MapCompose(custom_remove_tags, clean_info))
        info_loader.add_xpath('hospital_addr', address_query,
                              MapCompose(custom_remove_tags))
        info_loader.add_value('hospital_pro', '江苏省')
        info_loader.add_value('hospital_city', '南京市')
        info_loader.add_value('hospital_county', hospital_county)
        info_loader.add_xpath('hospital_phone', phone_query,
                              MapCompose(custom_remove_tags))
        info_loader.add_xpath('hospital_intro', '//dd[@id="wrap"]',
                              MapCompose(remove_tags, custom_remove_tags))
        info_loader.add_value('registered_channel', self.data_source_from)
        info_loader.add_value('dataSource_from', self.data_source_from)
        info_loader.add_value('hospital_url', response.url)
        info_loader.add_value('update_time', now_day())
        yield info_loader.load_item()

        # Department pages; the session id is stripped from each link.
        dept_hrefs = response.xpath(
            '//dl[@class="kfyy clearfix"]/dd/span/a/@href').extract()
        for href in dept_hrefs:
            clean_href = re.sub(r';jsessionid=(.*?)\?', '?', href)
            dept_link = urljoin(self.host, clean_href)
            self.headers['Referer'] = response.url
            yield Request(dept_link,
                          headers=self.headers,
                          callback=self.parse_hospital_dep_detail)
    except Exception as e:
        self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_detail(self, response):
    """Finish the doctor loader from ``response.meta`` and yield its item.

    The first text node of the ``article`` block is stored as both the
    introduction and the specialty.
    """
    self.logger.info('正在抓取{}:医生详细信息'.format(self.hospital_name))
    doctor_loader = response.meta['loader']
    article_text = response.xpath(
        '//div[@class="article"]/text()').extract_first('')
    for field_name in ('doctor_intro', 'doctor_goodAt'):
        doctor_loader.add_value(field_name, article_text)
    doctor_loader.add_value('update_time', now_day())
    yield doctor_loader.load_item()
def parse_doctor_detail(self, response):
    """Yield a doctor profile item, then one schedule item per parsed slot.

    All values are read from paragraphs of the profile card inside the
    ``fleft wd740`` block. The fourth paragraph holds the schedule text,
    which ``get_reg_info`` splits into individual entries.
    """
    card = '//div[@class="fleft wd740"]/div[1]/div[2]'
    dept_query = card + '/p[1]/text()'
    name_query = card + '/p[2]/text()'
    level_query = card + '/p[3]/text()'
    schedule_query = card + '/p[4]/text()'

    doctor_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    doctor_loader.add_xpath('doctor_name', name_query,
                            MapCompose(custom_remove_tags, match_special))
    doctor_loader.add_xpath('dept_name', dept_query,
                            MapCompose(custom_remove_tags, match_special))
    doctor_loader.add_value('hospital_name', self.hospital_name)
    doctor_loader.add_xpath('doctor_level', level_query,
                            MapCompose(custom_remove_tags, match_special))
    doctor_loader.add_xpath('doctor_intro', card + '/div/p[1]',
                            MapCompose(remove_tags, custom_remove_tags))
    doctor_loader.add_value('update_time', now_day())
    yield doctor_loader.load_item()

    schedule_text = response.xpath(schedule_query).extract_first('')
    if not schedule_text:
        return
    for slot in get_reg_info(schedule_text):
        slot_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                    response=response)
        slot_loader.add_xpath('doctor_name', name_query,
                              MapCompose(custom_remove_tags, match_special))
        slot_loader.add_xpath('dept_name', dept_query,
                              MapCompose(custom_remove_tags, match_special))
        slot_loader.add_value('hospital_name', self.hospital_name)
        slot_loader.add_value('reg_info', slot)
        slot_loader.add_value('update_time', now_day())
        yield slot_loader.load_item()
def parse_hospital_dep(self, response):
    """Yield a department item per link and request its doctor list.

    Departments are grouped in blocks whose ``<b>`` text is the department
    type. The doctor-list URL is built from the ``goNext(...)`` arguments in
    the link's ``onclick`` when present, otherwise from its ``href``. Any
    exception is logged and ends the generator.
    """
    self.logger.info('>>>>>>正在抓取科室信息>>>>>>')
    try:
        hospital_name = response.meta.get('hospital_name')
        dept_blocks = response.xpath(
            '//div[@class="deptList-block mb20 clearfix"]')
        for block in dept_blocks:
            dept_type = block.xpath('b/text()').extract_first('')
            for anchor in block.xpath('ul/li/a'):
                # Department item.
                dept_name = anchor.xpath('@title').extract_first('')
                onclick_js = anchor.xpath('@onclick').extract_first('')
                plain_href = anchor.xpath('@href').extract_first('')

                item_loader = CommonLoader2(item=HospitalDepItem(),
                                            response=response)
                item_loader.add_value('dept_name', dept_name)
                item_loader.add_value('dept_type', dept_type)
                item_loader.add_value('hospital_name', hospital_name)
                item_loader.add_value('dept_info', '')
                item_loader.add_value('dataSource_from',
                                      self.data_source_from)
                item_loader.add_value('update_time', now_day())
                yield item_loader.load_item()

                # Doctor list URL.
                if not onclick_js:
                    doctor_list_url = urljoin(self.host, plain_href)
                else:
                    doctor_list_url = None
                    call_args = re.search(r'goNext\((.*?),\'(.*)\'\);',
                                          onclick_js)
                    if call_args:
                        hospital_id, dept_id = call_args.group(1, 2)
                        doctor_list_url = self.doctor_list_url.format(
                            hospital_id, dept_id)

                if not doctor_list_url:
                    continue
                self.headers['Referer'] = response.url
                yield Request(doctor_list_url,
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              meta={
                                  'dept_name': dept_name,
                                  'hospital_name': hospital_name
                              })
    except Exception as e:
        self.logger.error('在抓取科室信息过程中出错了,原因是:{}'.format(repr(e)))