Example #1
0
 def parse_hospital_dep(self, response):
     """Parse a department JSON response and yield one item per sub-department.

     ``hospital_name`` and ``dept_type`` are read from ``response.meta``.
     The response body is decoded as JSON and the list under
     ``data -> subDepList`` is iterated; each entry's ``name`` and ``id``
     are loaded into a ``HospitalDepItem``.

     A missing or null ``data`` / ``subDepList`` is treated as "no
     sub-departments" and yields nothing. Any other failure (for example a
     body that is not valid JSON) is logged and swallowed, so the callback
     never raises.
     """
     hospital_name = response.meta.get('hospital_name')
     dept_type = response.meta.get('dept_type')
     self.logger.info('>>>>>>正在抓取:[{}]医院-[{}]科室信息>>>>>>'.format(hospital_name, dept_type))
     try:
         dept_info = json.loads(response.text)
         # Either level may be absent or null; fall back to empty containers
         # instead of raising AttributeError/TypeError on None.
         sub_dept_list = (dept_info.get('data') or {}).get('subDepList') or []
         for each_dept_info in sub_dept_list:
             dept_name = each_dept_info.get('name')
             dept_id = each_dept_info.get('id')
             dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
             dept_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
             dept_loader.add_value('dept_type', dept_type, MapCompose(custom_remove_tags))
             dept_loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
             dept_loader.add_value('dataSource_from', self.data_source_from)
             dept_loader.add_value('crawled_url', response.url)
             dept_loader.add_value('update_time', now_day())
             dept_loader.add_value('dept_id', dept_id)
             dept_loader.add_value('dept_url', response.url)
             dept_loader.add_value('gmt_created', now_time())
             dept_loader.add_value('gmt_modified', now_time())
             yield dept_loader.load_item()
     except Exception as e:
         # Top-level callback boundary: log and drop this response.
         self.logger.error('在抓取医院科室信息过程中出错了,原因是:{}'.format(repr(e)))
Example #2
0
    def parse_doctor_info(self, response):
        """Parse one page of the doctor-list JSON and follow pagination.

        ``hospital_name`` and ``hospital_id`` are read from ``response.meta``.
        The body is decoded as JSON; every entry of
        ``data -> doctorPageList`` becomes a ``DoctorInfoItem``. The current
        page number is taken from a trailing ``&curr=<n>`` in
        ``response.url``; when ``data -> pages`` indicates a further page, a
        request for page ``n + 1`` is yielded with this method as callback.

        A missing or null ``data`` / ``doctorPageList`` yields no items
        instead of raising. Any other failure is logged and swallowed.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院医生详细信息>>>>>>'.format(hospital_name))
        try:
            # Doctor records
            hospital_id = response.meta.get('hospital_id')
            doctor_info = json.loads(response.text)
            # ``data`` and ``doctorPageList`` may be absent or null.
            data = doctor_info.get('data') or {}
            doctor_info_pages = data.get('pages')
            doctor_info_list = data.get('doctorPageList') or []
            current_page_num = re.search(r'&curr=(\d+)$', response.url)
            for each_doctor_info in doctor_info_list:
                portrait = each_doctor_info.get('portrait')
                doctor_photo_url = urljoin(self.doctor_image_host, portrait) if portrait else ''
                loader = CommonLoader2(item=DoctorInfoItem(), response=response)
                loader.add_value('doctor_name', each_doctor_info.get('name'), MapCompose(custom_remove_tags))
                loader.add_value('dept_name', each_doctor_info.get('departmentName'))
                loader.add_value('hospital_name', each_doctor_info.get('hospitalName'))
                loader.add_value('doctor_level', each_doctor_info.get('doctorTitleName'))
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_value('doctor_id', each_doctor_info.get('id'))
                loader.add_value('hospital_id', hospital_id)
                loader.add_value('doctor_photo_url', doctor_photo_url)
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                yield loader.load_item()

            # Pagination: only possible when both the total page count and
            # the current page number are known.
            if doctor_info_pages and current_page_num:
                current_page_num = int(current_page_num.group(1))
                total_pages = int(doctor_info_pages)
                next_page = current_page_num + 1
                if next_page <= total_pages:
                    next_doctor_url = self.doctor_url.format(str(hospital_id), str(next_page))
                    yield Request(next_doctor_url,
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'hospital_id': hospital_id
                                  })
        except Exception as e:
            # Top-level callback boundary: log and drop this response.
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
Example #3
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page: hospital item, department items, doctor requests.

        ``hospital_name`` and ``hospital_id`` are read from ``response.meta``.
        The method yields, in order:

        * one ``HospitalInfoItem`` built from the page (province and city are
          the fixed values '福建省' / '厦门市');
        * for every department link found under the ``medicineOne`` /
          ``medicineTwo`` blocks, one ``HospitalDepItem``;
        * for every department that has both a link and a non-zero doctor
          count, a request to the department page handled by
          ``self.parse_doctor_info``.

        Any exception is logged and swallowed, so the callback never raises.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
        try:
            hospital_id = response.meta.get('hospital_id')
            hospital_img_url = response.xpath(
                '//div[@class="divLeft_Img"]/img/@src').extract_first('')
            # Only join when an image was found; urljoin with '' would
            # return the bare host.
            hospital_img_url = urljoin(
                self.host, hospital_img_url) if hospital_img_url else ''
            hospital_address = response.xpath(
                '//li[contains(text(),"地址")]/text()').extract_first('')
            hospital_county = get_county2('中国|福建省|福建|厦门市|厦门',
                                          match_special2(hospital_address))
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath(
                'hospital_name',
                '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_pro', '福建省')
            loader.add_value('hospital_city', '厦门市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_phone',
                             '//li[contains(text(),"电话")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_intro', '//div[@class="introduceSpan"]',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_official_website',
                             '//li[contains(text(),"官网")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_route',
                             '//li[contains(text(),"公交线路")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_img_url', hospital_img_url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            yield loader.load_item()

            # Department information
            all_dept_info = response.xpath(
                '//div[@class="medicineOne"]|//div[@class="medicineTwo"]')
            # This XPath is evaluated against the whole response, not the
            # current department node, so its result is the same for every
            # department; extract it once instead of once per iteration.
            dept_info = ''.join(
                response.xpath(
                    '//p[contains(text(),"科室简介")]/ancestor::tr[1]').
                extract())
            for each_dept_info in all_dept_info:
                dept_type = each_dept_info.xpath(
                    'div[1]/span/text()').extract_first('')
                dept_names = each_dept_info.xpath('div[2]/div[1]')
                for each_dept_name in dept_names:
                    dept_name = each_dept_name.xpath('a/text()').extract_first(
                        '')
                    dept_link = each_dept_name.xpath('a/@href').extract_first(
                        '')
                    doctor_num_of_dept = each_dept_name.xpath(
                        'span/text()').extract_first('')

                    # Doctor count of the department: the first run of digits
                    # in the span text, or None when there is none
                    # (re.search on '' simply returns None).
                    person_match = re.search(r'(\d+)', doctor_num_of_dept)
                    dept_person_num = int(
                        person_match.group(1)) if person_match else None

                    # Department item
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value(
                        'dept_info', dept_info,
                        MapCompose(remove_tags, custom_remove_tags,
                                   match_special2))
                    dept_loader.add_value('crawled_url', response.url)
                    dept_loader.add_value('update_time', now_day())
                    dept_loader.add_value('dept_id', dept_link,
                                          MapCompose(match_special2))
                    dept_loader.add_value('hospital_id', hospital_id)
                    dept_loader.add_value('dept_person_num', dept_person_num)
                    dept_loader.add_value('dept_url',
                                          urljoin(self.host, dept_link))
                    dept_loader.add_value('gmt_created', now_time())
                    dept_loader.add_value('gmt_modified', now_time())
                    yield dept_loader.load_item()

                    # Doctor information: only departments with a link and a
                    # non-zero doctor count are followed.
                    if dept_link and dept_person_num:
                        self.headers['Referer'] = response.url
                        yield Request(urljoin(self.host, dept_link),
                                      headers=self.headers,
                                      callback=self.parse_doctor_info,
                                      dont_filter=True,
                                      meta={
                                          'hospital_name': hospital_name,
                                          'dept_name': dept_name,
                                      })
        except Exception as e:
            # Top-level callback boundary: log and drop this response.
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example #4
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page whose layout depends on ``data_type``.

        Reads ``hospital_name``, ``hospital_id``, ``data_type`` and
        ``province_name`` from ``response.meta``.

        * ``data_type == '1'``: yields one ``HospitalInfoItem`` (tagged
          ``hospital_tags='1'``), one request per first-level department
          (callback ``self.parse_hospital_dep``) and one request for the
          first page of doctors (callback ``self.parse_doctor_info``).
        * ``data_type == '2'``: yields one ``HospitalInfoItem`` (tagged
          ``hospital_tags='2'``) and, per department list entry, one
          ``HospitalDepItem`` followed by a request for the first page of
          doctors.
        * any other ``data_type``: yields nothing.

        Any exception is logged and swallowed, so the callback never raises.
        """

        def _resolve_region(province, address):
            """Return ``(province, city, county)`` for *address*.

            City and county come from ``transform``; the province comes from
            the caller. A province listed in ``MUNICIPALITY2`` is stored as
            the city (suffix '市') with an empty province; any other
            province gets the '省' suffix.
            """
            region = transform([address]).head()
            city = region['市'][0]
            county = region['区'][0]
            if not province:
                # No province in meta: keep it empty rather than storing the
                # literal text 'None省'.
                return '', city, county
            if province in MUNICIPALITY2:
                return '', '{0}{1}'.format(province, '市'), county
            return '{0}{1}'.format(province, '省'), city, county

        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院详细信息>>>>>>'.format(hospital_name))
        try:
            hospital_id = response.meta.get('hospital_id')
            data_type = response.meta.get('data_type')
            hospital_pro = response.meta.get('province_name')
            if data_type == '1':
                # '//' (not '///') is the standard descendant abbreviation.
                hospital_address = response.xpath('//div[@class="search-result-hospital-text"]/'
                                                  'p[4]/text()').extract_first('')
                hospital_phone = response.xpath('//div[@class="search-result-hospital-text"]/'
                                                'p[3]/text()').extract_first('')
                # Raw string: '\d' in a normal literal is an invalid escape.
                check_phone = re.search(r'(\d{6,})', hospital_phone)
                # When the "phone" paragraph has no 6+ digit run and no
                # address was found, treat that paragraph as the address.
                if not check_phone and not hospital_address:
                    hospital_address = hospital_phone
                    hospital_phone = ''
                hospital_pro, hospital_city, hospital_county = _resolve_region(
                    hospital_pro, hospital_address)
                loader = CommonLoader2(item=HospitalInfoItem(), response=response)
                loader.add_xpath('hospital_name',
                                 '//span[@class="search-result-hospital-name"]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_level',
                                 '//div[@class="search-result-hospital-text"]/p[2]/text()',
                                 MapCompose(custom_remove_tags, clean_info2))
                loader.add_value('hospital_addr', hospital_address, MapCompose(custom_remove_tags))
                loader.add_value('hospital_pro', hospital_pro)
                loader.add_value('hospital_city', hospital_city)
                loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
                loader.add_value('hospital_phone', hospital_phone, MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_intro',
                                 '//li[@id="info"]/p',
                                 MapCompose(remove_tags, custom_remove_tags))
                loader.add_value('registered_channel', self.data_source_from)
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_xpath('hospital_route',
                                 '//div[@class="search-result-hospital-text"]/p[5]/text()',
                                 MapCompose(custom_remove_tags, match_special2))
                # Anchored with '//' like every other selector here; the
                # unanchored form only matches a div that is a direct child
                # of the selector root.
                # NOTE(review): confirm against a live page.
                loader.add_xpath('hospital_img_url', '//div[@class="search-result-hospital-img"]/img/@src')
                loader.add_value('hospital_tags', '1')
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                loader.add_value('hospital_id', hospital_id)
                yield loader.load_item()

                # Departments: each first-level department is requested to
                # obtain its second-level departments.
                all_dept = response.xpath('//ul[@id="parent-list"]/li[@id]')
                for each_dept in all_dept:
                    each_dept_id = each_dept.xpath('@id').extract_first('')
                    each_dept_type = each_dept.xpath('div/span/text()').extract_first('')
                    self.headers['Referer'] = response.url
                    dept_link = self.dept_url.format(hospital_id, each_dept_id)
                    yield Request(dept_link,
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'hospital_id': hospital_id,
                                      'dept_type': each_dept_type
                                  })

                # Doctors: start from page 1.
                self.headers['Referer'] = response.url
                doctor_info_link = self.doctor_url.format(hospital_id, '1')
                yield Request(doctor_info_link,
                              headers=self.headers,
                              callback=self.parse_doctor_info,
                              meta={
                                  'hospital_name': hospital_name,
                                  'hospital_id': hospital_id
                              })
            elif data_type == '2':
                hospital_address = response.xpath('//p[@class="hospital-private-address-line fc-6"]'
                                                  '[contains(text(),"地址")]/text()').extract_first('')
                hospital_pro, hospital_city, hospital_county = _resolve_region(
                    hospital_pro, hospital_address)
                loader = CommonLoader2(item=HospitalInfoItem(), response=response)
                loader.add_xpath('hospital_name',
                                 '//p[@class="hospital-private-content-tit"]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_value('hospital_addr', hospital_address, MapCompose(custom_remove_tags, match_special2))
                loader.add_value('hospital_pro', hospital_pro)
                loader.add_value('hospital_city', hospital_city)
                loader.add_value('hospital_county', hospital_county, MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_phone',
                                 '//div[@class="search-result-hospital-text"]/p[3]/text()',
                                 MapCompose(custom_remove_tags))
                loader.add_xpath('hospital_intro',
                                 '//li[@id="info"]/p',
                                 MapCompose(remove_tags, custom_remove_tags))
                loader.add_value('registered_channel', self.data_source_from)
                loader.add_value('dataSource_from', self.data_source_from)
                loader.add_value('crawled_url', response.url)
                loader.add_value('update_time', now_day())
                loader.add_xpath('hospital_route',
                                 '//li[@id="address"]/p[3]/text()',
                                 MapCompose(custom_remove_tags, match_special2))
                loader.add_value('hospital_tags', '2')
                loader.add_value('gmt_created', now_time())
                loader.add_value('gmt_modified', now_time())
                loader.add_value('hospital_id', hospital_id)
                yield loader.load_item()

                # Departments: every list entry after the first one.
                all_dept = response.xpath('//ul[@id="parent-list"]/li[position()>1]')
                for each_dept in all_dept:
                    dept_id = each_dept.xpath('div/@id').extract_first('')
                    dept_name = each_dept.xpath('div/span/text()').extract_first('')
                    dept_loader = CommonLoader2(item=HospitalDepItem(), response=response)
                    dept_loader.add_value('dept_name', dept_name, MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name, MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from', self.data_source_from)
                    dept_loader.add_value('crawled_url', response.url)
                    dept_loader.add_value('update_time', now_day())
                    dept_loader.add_value('dept_id', dept_id.replace('subDepLi-', ''))
                    dept_loader.add_value('dept_url', response.url)
                    dept_loader.add_value('gmt_created', now_time())
                    dept_loader.add_value('gmt_modified', now_time())
                    yield dept_loader.load_item()

                    # Doctors.
                    # NOTE(review): the URL does not depend on the
                    # department, so the same request is yielded once per
                    # department; presumably all but the first are dropped
                    # by the scheduler's duplicate filter - confirm.
                    self.headers['Referer'] = response.url
                    doctor_info_link = self.doctor_url.format(hospital_id, '1')
                    yield Request(doctor_info_link,
                                  headers=self.headers,
                                  callback=self.parse_doctor_info,
                                  meta={
                                      'hospital_name': hospital_name,
                                      'hospital_id': hospital_id,
                                      'dept_name': dept_name
                                  })
        except Exception as e:
            # Top-level callback boundary: log and drop this response.
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example #5
0
    def parse_doctor_info_detail(self, response):
        """Parse a doctor detail page: one doctor item plus scheduling items.

        ``hospital_name``, ``dept_name`` and ``doctor_name`` are read from
        ``response.meta``. The method first yields a ``DoctorInfoItem``.
        If the page contains a "预约" link, it then walks the scheduling
        table: for every time slot and every date column that has a link,
        a ``DoctorRegInfoItem`` is yielded whose ``reg_info`` is
        ``'<now_year()>-<date><time slot>'``.

        Any exception is logged and swallowed, so the callback never raises.
        """
        hospital_name = response.meta.get('hospital_name')
        dept_name = response.meta.get('dept_name')
        doctor_name = response.meta.get('doctor_name')
        self.logger.info('>>>>>>正在抓取[{}]医院-[{}]医生详细信息>>>>>>'.format(
            hospital_name, doctor_name))
        try:
            # Doctor information
            doctor_photo_url = response.xpath(
                '//div[@class="doctor_Img"]/img/@src').extract_first('')
            # Only join when a photo was found: urljoin(host, '') returns the
            # bare host, which would be stored as a bogus photo URL.
            doctor_photo_url = urljoin(
                self.host, doctor_photo_url) if doctor_photo_url else ''
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('doctor_name', doctor_name,
                             MapCompose(custom_remove_tags))
            loader.add_value('dept_name', dept_name,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_name', hospital_name,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('sex', '//span[@class="doctor_grade"]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_xpath('doctor_level',
                             '//span[@class="object_grade"]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'doctor_intro', '//div[@class="doctor_Text_Major"]',
                MapCompose(remove_tags, custom_remove_tags, match_special2))
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_value('doctor_id', response.url,
                             MapCompose(match_special2))
            loader.add_xpath(
                'dept_id', '//div[@class="position_one"]/span/a[last()]/@href',
                MapCompose(match_special2))
            loader.add_xpath(
                'hospital_id',
                '//div[@class="position_one"]/span/a[last()-1]/@href',
                MapCompose(match_special2))
            loader.add_value('doctor_photo_url', doctor_photo_url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            yield loader.load_item()

            # Doctor scheduling information
            self.logger.info(
                '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
            has_doctor_scheduling = response.xpath(
                '//span[@class="yuyue"]/a[contains(text(),"预约")]')
            if has_doctor_scheduling:
                # Time-slot labels come from the first list row; date labels
                # come from the date table, skipping its first cell.
                doctor_scheduling_list = response.xpath(
                    '//div[@class="whliesubscribe"]/ul/li[1]/div/'
                    'span/text()').extract()
                all_scheduling_date = response.xpath(
                    '//div[@class="datetable"]/ul/li[position()>1]/'
                    'span[1]/text()').extract()
                scheduling_date_list = custom_remove_tags(
                    remove_tags(','.join(all_scheduling_date))).split(',')
                # XPath positions are 1-based, hence start=1.
                for position, scheduling_time in enumerate(
                        doctor_scheduling_list, start=1):
                    scheduling_info = response.xpath(
                        '//div[@class="whliesubscribe"]/ul/li[position()>1]'
                        '/div[{}]'.format(str(position)))
                    for index, each_s_i in enumerate(scheduling_info):
                        # A cell only represents a bookable slot when it
                        # contains a link.
                        if not each_s_i.xpath('span/a'):
                            continue
                        each_scheduling_date = scheduling_date_list[index]
                        reg_info = '{0}-{1}{2}'.format(
                            now_year(), each_scheduling_date,
                            scheduling_time)
                        reg_loader = CommonLoader2(
                            item=DoctorRegInfoItem(), response=response)
                        reg_loader.add_value(
                            'doctor_name', doctor_name,
                            MapCompose(custom_remove_tags))
                        reg_loader.add_value(
                            'dept_name', dept_name,
                            MapCompose(custom_remove_tags))
                        reg_loader.add_value(
                            'hospital_name', hospital_name,
                            MapCompose(custom_remove_tags))
                        reg_loader.add_value('reg_info', reg_info)
                        reg_loader.add_value('dataSource_from',
                                             self.data_source_from)
                        reg_loader.add_value('crawled_url', response.url)
                        reg_loader.add_value('update_time', now_day())
                        reg_loader.add_value('doctor_id', response.url,
                                             MapCompose(match_special2))
                        reg_loader.add_xpath(
                            'dept_id',
                            '//div[@class="position_one"]/span/a[last()]/@href',
                            MapCompose(match_special2))
                        reg_loader.add_xpath(
                            'hospital_id',
                            '//div[@class="position_one"]/span/a[last()-1]/@href',
                            MapCompose(match_special2))
                        reg_loader.add_value('gmt_created', now_time())
                        reg_loader.add_value('gmt_modified', now_time())
                        yield reg_loader.load_item()
        except Exception as e:
            # Top-level callback boundary: log and drop this response.
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))