def parse_doctor_reg_info(self, response):
    """Parse the scheduling JSON for one doctor and yield an item per duty slot."""
    self.logger.info('>>>>>>正在抓取医生排班信息……')
    payload = json.loads(response.text)
    work_entries = payload['data']['selWork']
    # All identifying fields come from the first doctor record in the payload.
    first_doctor = payload['data']['doctor'][0]
    doctor_name = first_doctor.get('doctorName', '')
    hospital_name = first_doctor.get('hospitalName', '')
    dept_name = first_doctor.get('deptName', '')
    for work_entry in work_entries:
        duty_date = work_entry['dutydate']
        for work_info in work_entry['selWorks']:
            duty_code = int(work_info['dutytime'])
            # NOTE(review): codes 1 and 3 both map to '上午' while every other
            # code falls through to '晚上' (the sample below shows code 4 is
            # 晚上) — confirm code 3 is really a morning slot and that code 2
            # (presumably 下午) never occurs in this feed.
            if duty_code in (1, 3):
                duty_time = '上午'
            else:
                # duty_time 4 晚上 doctorId 3329 成都中医药大学附属医院
                duty_time = '晚上'
            loader = CommonLoader(item=DoctorRegInfoItem(), response=response)
            loader.add_value('doctor_name', doctor_name)
            loader.add_value('hospital_name', hospital_name)
            loader.add_value('dept_name', dept_name)
            loader.add_value('reg_info', '{0}{1}'.format(duty_date, duty_time))
            loader.add_value('update_time', now_day())
            yield loader.load_item()
def parse_doctor_reg_info(self, response):
    """Walk the schedule table and yield one item for every marked slot."""
    self.logger.info('>>>>>>正在抓取{}:医生排班信息>>>>>>'.format(
        self.hospital_name))
    doctor_name = response.meta['doctor_name']
    dept_name = response.meta['dept_name']
    schedule_rows = response.xpath('//table/tr[position()>1]')
    # An <img> inside any data cell marks an available slot; no image at all
    # means this doctor has no schedule on the page.
    has_any_slot = response.xpath('//table/tr[position()>1]/td/img')
    # reg_date = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日']
    period_names = ['上午', '下午', '晚班']
    if not has_any_slot:
        return
    for row in schedule_rows:
        row_label = row.xpath('td[1]/text()').extract_first('')
        for col_idx, cell in enumerate(row.xpath('td[position()>1]')):
            # Index before the marker test (mirrors the original lookup order).
            period = period_names[col_idx]
            if not cell.xpath('img'):
                continue
            reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                       response=response)
            reg_loader.add_value('doctor_name', doctor_name)
            reg_loader.add_value('dept_name', dept_name,
                                 MapCompose(custom_remove_tags, match_special))
            reg_loader.add_value('hospital_name', self.hospital_name)
            reg_loader.add_value('reg_info',
                                 '{0}{1}'.format(row_label, period))
            reg_loader.add_value('update_time', now_day())
            yield reg_loader.load_item()
def parse_doctor_info_detail(self, response):
    """Scrape one doctor's profile page, then any listed duty-time blocks."""
    self.logger.info('>>>>>>正在抓取医生详细信息>>>>>>')
    try:
        doctor_name = response.meta.get('doctor_name')
        dept_name = response.meta.get('dept_name')
        # dept_name = dept_name.split('-')[-1] if '-' in dept_name else dept_name
        doctor_level = response.meta.get('doctor_level')
        hospital_name = response.meta.get('hospital_name')
        # hospital_name2 = response.xpath('//div[@class="yy_til"]/h2/text()').extract_first('')
        # hospital_name = hospital_name2 if hospital_name2 else hospital_name1
        # The registration fee is embedded in the booking link's title attribute.
        diagnosis_amt = None
        fee_titles = response.xpath(
            '//td/span[@class="doc_yuyue_time"]/a/@title').extract()
        if fee_titles:
            fee_match = re.search(r'.*挂号费:(.*?)$', fee_titles[0], S)
            if fee_match:
                diagnosis_amt = fee_match.group(1)
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_value('doctor_name', doctor_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('dept_name', dept_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_name', hospital_name,
                         MapCompose(custom_remove_tags))
        loader.add_value('doctor_level', doctor_level,
                         MapCompose(custom_remove_tags, match_special2))
        loader.add_xpath('doctor_intro', '//div[@class="zrys"]/dl/dd',
                         MapCompose(remove_tags, custom_remove_tags,
                                    clean_info2))
        loader.add_value('diagnosis_amt', diagnosis_amt)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
        # 获取医生排班信息
        for reg_block in response.xpath(
                '//td/span[@class="doc_yuyue_time"]').extract():
            date_match = re.search(r'.*出诊时间:(.*?)\n', reg_block, S)
            reg_info_date = date_match.group(1) if date_match else None
            reg_info = '{0}-{1}'.format(
                now_year(), reg_info_date).replace('月', '-').replace('日', '')
            reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                       response=response)
            reg_loader.add_value('doctor_name', doctor_name,
                                 MapCompose(custom_remove_tags))
            reg_loader.add_value('dept_name', dept_name,
                                 MapCompose(custom_remove_tags))
            reg_loader.add_xpath('hospital_name',
                                 '//div[@class="yy_til"]/h2/text()',
                                 MapCompose(custom_remove_tags))
            reg_loader.add_value('reg_info', reg_info,
                                 MapCompose(custom_remove_tags))
            reg_loader.add_value('dataSource_from', self.data_source_from)
            reg_loader.add_value('crawled_url', response.url)
            reg_loader.add_value('update_time', now_day())
            yield reg_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Scrape a doctor's profile and weekly schedule table.

    Yields one DoctorInfoItem for the profile, then one DoctorRegInfoItem
    for every schedule cell that contains an <img> marker (one weekday
    column per cell, per the reg_date list below).
    """
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name',
                     '//div[@class="viewexpert_demo"]/p[1]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_xpath('dept_name',
                     '//div[@class="viewexpert_demo"]/p[3]/text()',
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath(
        'doctor_level', '//div[@class="viewexpert_demo"]/p[2]/text()',
        MapCompose(custom_remove_tags, match_special, match_special2))
    loader.add_xpath('doctor_intro', '//div[@class="viewexpert_detail"]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_xpath('doctor_goodAt',
                     '//div[@class="viewexpert_demo"]/p[4]/text()',
                     MapCompose(custom_remove_tags))
    loader.add_value('update_time', now_day())
    doctor_item = loader.load_item()
    yield doctor_item
    # 获取医生排班信息
    reg_tr_list = response.xpath(
        '//div[@class="viewexpert_detail"]/table/tr[position()>1]')
    is_has_reg = response.xpath(
        '//div[@class="viewexpert_detail"]/table/tr[position()>1]/td/img')
    reg_date = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日']
    if is_has_reg:
        for each_td in reg_tr_list:
            reg_time = each_td.xpath('td[1]/text()').extract_first('')
            all_reg_info = each_td.xpath('td[position()>1]')
            # enumerate replaces the original hand-maintained index counter.
            for i, each_reg_info in enumerate(all_reg_info):
                reg_info_date = reg_date[i]
                has_reg = each_reg_info.xpath('img')
                if has_reg:
                    reg_info = '{0}{1}'.format(reg_info_date, reg_time)
                    reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                               response=response)
                    reg_loader.add_xpath(
                        'doctor_name',
                        '//div[@class="viewexpert_demo"]/p[1]/text()',
                        MapCompose(custom_remove_tags))
                    reg_loader.add_xpath(
                        'dept_name',
                        '//div[@class="viewexpert_demo"]/p[3]/text()',
                        MapCompose(custom_remove_tags, match_special))
                    reg_loader.add_value('hospital_name', self.hospital_name)
                    reg_loader.add_value('reg_info', reg_info)
                    reg_loader.add_value('update_time', now_day())
                    reg_item = reg_loader.load_item()
                    yield reg_item
def parse_doctor_detail(self, response):
    """Scrape one doctor's profile block, then split out schedule entries."""
    # All profile fields live under the same container; build xpaths from it.
    info_root = '//div[@class="fleft wd740"]/div[1]/div[2]'
    loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    loader.add_xpath('doctor_name', info_root + '/p[2]/text()',
                     MapCompose(custom_remove_tags, match_special))
    loader.add_xpath('dept_name', info_root + '/p[1]/text()',
                     MapCompose(custom_remove_tags, match_special))
    loader.add_value('hospital_name', self.hospital_name)
    loader.add_xpath('doctor_level', info_root + '/p[3]/text()',
                     MapCompose(custom_remove_tags, match_special))
    loader.add_xpath('doctor_intro', info_root + '/div/p[1]',
                     MapCompose(remove_tags, custom_remove_tags))
    loader.add_value('update_time', now_day())
    yield loader.load_item()
    raw_reg_info = response.xpath(
        info_root + '/p[4]/text()').extract_first('')
    if not raw_reg_info:
        return
    for reg_entry in get_reg_info(raw_reg_info):
        reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                   response=response)
        reg_loader.add_xpath('doctor_name', info_root + '/p[2]/text()',
                             MapCompose(custom_remove_tags, match_special))
        reg_loader.add_xpath('dept_name', info_root + '/p[1]/text()',
                             MapCompose(custom_remove_tags, match_special))
        reg_loader.add_value('hospital_name', self.hospital_name)
        reg_loader.add_value('reg_info', reg_entry)
        reg_loader.add_value('update_time', now_day())
        yield reg_loader.load_item()
def parse_doctor_info_detail(self, response):
    """Scrape a doctor's intro/specialty, then look for an inline duty-time note."""
    self.logger.info('>>>>>>正在抓取{}:医生详细信息>>>>>>'.format(
        self.hospital_name))
    dept_name = response.meta['dept_name']
    doctor_name = response.meta['doctor_name']
    info_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
    info_loader.add_value('doctor_name', doctor_name)
    info_loader.add_value('dept_name', dept_name)
    info_loader.add_value('hospital_name', self.hospital_name)
    info_loader.add_xpath('doctor_intro',
                          '//div[@class="right-about clearfix"]',
                          MapCompose(remove_tags, get_doctor_intro2))
    info_loader.add_xpath('doctor_goodAt',
                          '//div[@class="right-about clearfix"]',
                          MapCompose(remove_tags, get_doctor_good_at))
    info_loader.add_value('update_time', now_day())
    yield info_loader.load_item()
    # 获取排班信息
    self.logger.info('>>>>>>正在抓取{}:医生排班信息>>>>>>'.format(
        self.hospital_name))
    # The duty-time sentence can sit at several different depths inside the
    # intro div, hence the union of alternative paths.
    reg_info = response.xpath(
        '//div[@class="right-about clearfix"]/p[contains(text(),"坐诊时间")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong[contains(text(),"坐诊时间")]/text()'
        '|//div[@class="right-about clearfix"]/p/span/strong[contains(text(),"坐诊时间")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong[contains(text(),"上午")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong[contains(text(),"下午")]/text()'
        '|//div[@class="right-about clearfix"]/p/strong/span[contains(text(),"坐诊时间")]/text()'
    ).extract_first('')
    if not reg_info:
        return
    reg_loader = CommonLoader2(item=DoctorRegInfoItem(), response=response)
    reg_loader.add_value('doctor_name', doctor_name)
    reg_loader.add_value('dept_name', dept_name)
    reg_loader.add_value('hospital_name', self.hospital_name)
    reg_loader.add_value('reg_info', reg_info,
                         MapCompose(match_special, clean_info))
    reg_loader.add_value('update_time', now_day())
    yield reg_loader.load_item()
def parse_doctor_info_detail(self, response):
    """Scrape a doctor's scheduling info from the page_sum2 table.

    A previous revision also emitted a DoctorInfoItem from the same page;
    that code was commented out and has been removed as dead code.
    """
    # 医生排班信息
    self.logger.info('>>>>>>正在抓取{}:医生排班信息>>>>>>'.format(
        self.hospital_name))
    reg_loader = CommonLoader2(item=DoctorRegInfoItem(), response=response)
    reg_loader.add_xpath(
        'doctor_name', '//div[@class="page_sum2"]/table/tr[1]/td[3]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    reg_loader.add_xpath(
        'dept_name', '//div[@class="page_sum2"]/table/tr[3]/td',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    reg_loader.add_value('hospital_name', self.hospital_name)
    reg_loader.add_xpath(
        'reg_info',
        '//div[@class="page_sum2"]/table/tr[5]/td|'
        '//div[@class="listsum_block"]',
        MapCompose(remove_tags, custom_remove_tags, match_special))
    reg_loader.add_value('update_time', now_day())
    yield reg_loader.load_item()
def parse_doctor_info_detail(self, response):
    """Scrape a doctor's profile, then walk the work table for duty slots."""
    hospital_name = response.meta.get('hospital_name')
    dept_name = response.meta.get('dept_name')
    self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
    try:
        # 获取医生信息
        loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        loader.add_xpath(
            'doctor_name', '//td/b[contains(text(),"姓名")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special))
        loader.add_value('dept_name', dept_name,
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_name',
                         '//div[@class="page_position"]/a[last()-1]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'sex', '//td/b[contains(text(),"性别")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special,
                       clean_info2))
        loader.add_xpath(
            'doctor_level', '//td/b[contains(text(),"职称")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, match_special,
                       clean_info2))
        loader.add_xpath(
            'doctor_intro',
            '//td/b[contains(text(),"医生简介")]/ancestor::td[1]',
            MapCompose(remove_tags, custom_remove_tags, clean_info2))
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('crawled_url', response.url)
        loader.add_value('update_time', now_day())
        yield loader.load_item()
        # 获取医生排班信息
        self.logger.info(
            '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
        if response.xpath('//td/div[@class="doctor-work"]'):
            scheduling_rows = response.xpath(
                '//table[@class="workTable"]/tbody/tr')
            raw_dates = response.xpath(
                '//table[@class="workTable"]/thead/tr/td[position()>1]'
            ).extract()
            scheduling_dates = custom_remove_tags(
                remove_tags(','.join(raw_dates))).split(',')
            for row in scheduling_rows:
                period = row.xpath('td[1]/text()').extract_first('')
                for col, cell in enumerate(row.xpath('td[position()>1]')):
                    if not cell.xpath('div'):
                        continue
                    # Keep only the weekday prefix (first three characters)
                    # of the column header.
                    day_label = scheduling_dates[col][0:3]
                    reg_info = '{0}{1}'.format(day_label, period)
                    reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                               response=response)
                    reg_loader.add_xpath(
                        'doctor_name',
                        '//td/b[contains(text(),"姓名")]/ancestor::td[1]',
                        MapCompose(remove_tags, custom_remove_tags,
                                   match_special))
                    reg_loader.add_value('dept_name', dept_name,
                                         MapCompose(custom_remove_tags))
                    reg_loader.add_xpath(
                        'hospital_name',
                        '//div[@class="page_position"]/a[last()-1]/text()',
                        MapCompose(custom_remove_tags))
                    reg_loader.add_value('reg_info', reg_info)
                    reg_loader.add_value('dataSource_from',
                                         self.data_source_from)
                    reg_loader.add_value('crawled_url', response.url)
                    reg_loader.add_value('update_time', now_day())
                    yield reg_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
def parse_doctor_reg_info(self, response):
    """
    获取医生排班信息

    Parses every doctor card on a Splash-rendered listing page, yields one
    DoctorRegInfoItem per weekly schedule slot, then follows the "next
    page" onclick link with another SplashRequest.
    """
    self.logger.info('>>>>>>正在抓取医生排班信息……>>>>>>')
    dept_name = response.meta.get('dept_name')
    hospital_name = response.xpath(
        '//div[@class="link-555"]/a/text()').extract_first('')
    all_doctors_link = response.xpath(
        '//ul[@class="doc-results clearfix"]/li')
    self.logger.info('>>>>>>当前页共有{}个医生……'.format(str(
        len(all_doctors_link))))
    try:
        for each_doctor in all_doctors_link:
            doctor_name = each_doctor.xpath(
                'div/dl[@class="doctor-info"]/dt/a/text()').extract_first(
                    '')
            # Only <li> elements carrying data-arrangeid are real slots.
            reg_info_list = each_doctor.xpath(
                'div[@class="doc-result-schedule"]/div/div/ul/li[@data-arrangeid]'
            )
            self.logger.info('>>>>>>当前医生[{}]一周内的排班信息有{}条……'.format(
                doctor_name, str(len(reg_info_list))))
            for each_reg_info in reg_info_list:
                loader = YiHuLoader(item=DoctorRegInfoItem(),
                                    selector=each_reg_info)
                reg_date = each_reg_info.xpath(
                    'a/span/em[1]/text()').extract_first('')
                reg_time = each_reg_info.xpath(
                    'a/span/em[2]/text()').extract_first('')
                loader.add_value('doctor_name', doctor_name)
                loader.add_value('dept_name', dept_name)
                loader.add_value('hospital_name', hospital_name)
                loader.add_value(
                    'reg_info',
                    '{0}/{1}{2}'.format(now_year(), reg_date,
                                        reg_time).replace('/', '-'),
                    MapCompose(custom_remove_tags, clean_info))
                loader.add_value('update_time', now_day())
                yield loader.load_item()
        # 翻页信息: the next-page URL is embedded in an onclick handler as a
        # single-quoted string.
        next_page = response.xpath(
            '//a[@class="page-next"]/@onclick').extract_first('')
        if next_page:
            next_page_link = re.search(r'\'(.*?)\'', next_page)
            if next_page_link:
                next_page_link = next_page_link.group(1)
                reg_request = SplashRequest(
                    next_page_link,
                    splash_headers=self.headers,
                    callback=self.parse_doctor_reg_info,
                    meta={'dept_name': dept_name},
                    args={
                        'images': 0,
                        'wait': 5
                    })
                self.headers['Referer'] = response.url
                yield reg_request
    except Exception as e:
        # Fixed typo in the log message: '眼因' -> '原因'.
        self.logger.error('抓取医生排班信息过程中出现错误,错误的原因是:{}'.format(repr(e)))
def parse_doctor_info_detail(self, response):
    """Scrape a doctor's full profile, then every bookable schedule slot."""
    hospital_name = response.meta.get('hospital_name')
    dept_name = response.meta.get('dept_name')
    doctor_name = response.meta.get('doctor_name')
    self.logger.info('>>>>>>正在抓取[{}]医院-[{}]医生详细信息>>>>>>'.format(
        hospital_name, doctor_name))
    try:
        # 获取医生信息
        photo_url = response.xpath(
            '//div[@class="doctor_Img"]/img/@src').extract_first('')
        info_loader = CommonLoader2(item=DoctorInfoItem(), response=response)
        info_loader.add_value('doctor_name', doctor_name,
                              MapCompose(custom_remove_tags))
        info_loader.add_value('dept_name', dept_name,
                              MapCompose(custom_remove_tags))
        info_loader.add_value('hospital_name', hospital_name,
                              MapCompose(custom_remove_tags))
        info_loader.add_xpath('sex', '//span[@class="doctor_grade"]/text()',
                              MapCompose(custom_remove_tags))
        info_loader.add_xpath('doctor_level',
                              '//span[@class="object_grade"]/text()',
                              MapCompose(custom_remove_tags))
        info_loader.add_xpath(
            'doctor_intro', '//div[@class="doctor_Text_Major"]',
            MapCompose(remove_tags, custom_remove_tags, match_special2))
        info_loader.add_value('dataSource_from', self.data_source_from)
        info_loader.add_value('crawled_url', response.url)
        info_loader.add_value('update_time', now_day())
        info_loader.add_value('doctor_id', response.url,
                              MapCompose(match_special2))
        info_loader.add_xpath(
            'dept_id', '//div[@class="position_one"]/span/a[last()]/@href',
            MapCompose(match_special2))
        info_loader.add_xpath(
            'hospital_id',
            '//div[@class="position_one"]/span/a[last()-1]/@href',
            MapCompose(match_special2))
        info_loader.add_value('doctor_photo_url',
                              urljoin(self.host, photo_url))
        info_loader.add_value('gmt_created', now_time())
        info_loader.add_value('gmt_modified', now_time())
        yield info_loader.load_item()
        # 获取医生排班信息
        self.logger.info(
            '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
        if response.xpath('//span[@class="yuyue"]/a[contains(text(),"预约")]'):
            # Row labels (morning/afternoon …) come from the first column.
            period_labels = response.xpath(
                '//div[@class="whliesubscribe"]/ul/li[1]/div/'
                'span/text()').extract()
            raw_dates = response.xpath(
                '//div[@class="datetable"]/ul/li[position()>1]/'
                'span[1]/text()').extract()
            date_labels = custom_remove_tags(
                remove_tags(','.join(raw_dates))).split(',')
            # div[n] selects the n-th period row across all date columns.
            for row_no, period in enumerate(period_labels, start=1):
                cells = response.xpath(
                    '//div[@class="whliesubscribe"]/ul/li[position()>1]'
                    '/div[{}]'.format(str(row_no)))
                for col, cell in enumerate(cells):
                    if not cell.xpath('span/a'):
                        continue
                    reg_info = '{0}-{1}{2}'.format(
                        now_year(), date_labels[col], period)
                    reg_loader = CommonLoader2(item=DoctorRegInfoItem(),
                                               response=response)
                    reg_loader.add_value('doctor_name', doctor_name,
                                         MapCompose(custom_remove_tags))
                    reg_loader.add_value('dept_name', dept_name,
                                         MapCompose(custom_remove_tags))
                    reg_loader.add_value('hospital_name', hospital_name,
                                         MapCompose(custom_remove_tags))
                    reg_loader.add_value('reg_info', reg_info)
                    reg_loader.add_value('dataSource_from',
                                         self.data_source_from)
                    reg_loader.add_value('crawled_url', response.url)
                    reg_loader.add_value('update_time', now_day())
                    reg_loader.add_value('doctor_id', response.url,
                                         MapCompose(match_special2))
                    reg_loader.add_xpath(
                        'dept_id',
                        '//div[@class="position_one"]/span/a[last()]/@href',
                        MapCompose(match_special2))
                    reg_loader.add_xpath(
                        'hospital_id',
                        '//div[@class="position_one"]/span/a[last()-1]/@href',
                        MapCompose(match_special2))
                    reg_loader.add_value('gmt_created', now_time())
                    reg_loader.add_value('gmt_modified', now_time())
                    yield reg_loader.load_item()
    except Exception as e:
        self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))