Example 1
0
 def parse_doctor_info_detail(self, response):
     """Finish the doctor item started by an earlier callback.

     The partially-filled loader travels in ``response.meta``; this
     callback resolves department and level (meta value preferred,
     page value as fallback), scrapes the intro, and yields the item.
     """
     self.logger.info(
         f'>>>>>>正在抓取{self.hospital_name}:医生详细信息>>>>>>')
     meta = response.meta
     item_loader = meta['loader']

     # Department: prefer the value carried through meta; otherwise
     # fall back to the breadcrumb trail on the page.
     dept_from_meta = custom_remove_tags(''.join(meta['dept_name']))
     dept_from_page = response.xpath(
         '//div[@id="FrontPublic_breadCrumb01-1482202386120"]/div/'
         'a[last()]/text()').extract_first('').replace('专家', '').replace(
             '类', '科')
     if dept_from_meta:
         dept_name = re.sub(r'中医医师|中西医医师', '中医科', dept_from_meta)
     else:
         dept_name = dept_from_page

     # Title/level: same preference order as the department.
     level_from_meta = meta['doctor_level']
     level_from_page = response.xpath(
         '//div[@class="FrontProducts_detail02-'
         '1482202997396_htmlbreak"]/p[1]/strong/text()').extract_first('')
     if level_from_meta:
         doctor_level = custom_remove_tags(''.join(level_from_meta))
     else:
         doctor_level = level_from_page

     doctor_intro = response.xpath(
         '//div[@class="FrontProducts_detail02-'
         '1482202997396_htmlbreak"]/p[2]').extract_first('')

     item_loader.add_value('dept_name', dept_name,
                           MapCompose(custom_remove_tags, filter_info3))
     item_loader.add_value('doctor_level', doctor_level,
                           MapCompose(filter_info4, custom_remove_tags))
     item_loader.add_value('doctor_intro', doctor_intro,
                           MapCompose(remove_tags, custom_remove_tags))
     item_loader.add_value('update_time', now_day())
     yield item_loader.load_item()
Example 2
0
 def parse_dept_info(self, response):
     """Parse a first-level department page.

     When the page lists sub-departments, follow each detail link to
     ``parse_dept_detail``; otherwise yield a department item directly.
     """
     dep_type = response.meta['dep_type']
     self.logger.info('正在抓取[{}]科室信息'.format(custom_remove_tags(dep_type)))
     sub_depts = response.xpath('//div[@class="pic"]')
     if not sub_depts:
         # First-level department with no second-level departments:
         # emit the department item as-is.
         dep_loader = PxfybjyLoader(item=HospitalDepItem(),
                                    response=response)
         dep_loader.add_value('dept_type', dep_type)
         dep_loader.add_value('hospital_name', self.hospital_name)
         dep_loader.add_value('update_time', now_day())
         yield dep_loader.load_item()
         return
     # First-level department with second-level departments: crawl
     # every sub-department's detail page.
     for node in sub_depts:
         href = node.xpath('a/@href').extract_first('')
         if not href:
             continue
         req = Request(urljoin(self.host, href),
                       headers=self.headers,
                       callback=self.parse_dept_detail,
                       meta={'dep_type': dep_type})
         req.meta['Referer'] = response.url
         yield req
Example 3
0
 def parse_doctor_info_detail(self, response):
     """Scrape one doctor's detail page and yield a DoctorInfoItem."""
     self.logger.info('>>>>>>正在抓取:医生详细信息>>>>>>')
     try:
         fee = response.meta.get('diagnosis_fee')
         raw_info = custom_remove_tags(
             remove_tags(''.join(
                 response.xpath('//td[@class="bk '
                                'titletxt11"]').extract())))
         # The intro section ends at "出诊时间:" when present, else at
         # "荣誉集锦:".
         intro = (get_hospital_info(raw_info, '个人简介:', '出诊时间:')
                  or get_hospital_info(raw_info, '个人简介:', '荣誉集锦:'))
         loader = CommonLoader2(item=DoctorInfoItem(), response=response)
         # Breadcrumb-style fields all share the same cleanup pipeline.
         crumb_fields = (
             ('doctor_name',
              '//table[@id="m_jkzs"]/tr/td[1]/a[last()]/text()'),
             ('dept_name',
              '//table[@id="m_jkzs"]/tr/td[1]/a[last()-1]/text()'),
             ('hospital_name',
              '//table[@id="m_jkzs"]/tr/td[1]/a[last()-2]/text()'),
             ('doctor_level',
              '//span[@class="selecttxt"][contains(text(),"医师") or contains(text(),"专家")]/text()'),
         )
         for field, xpath in crumb_fields:
             loader.add_xpath(field, xpath, MapCompose(custom_remove_tags))
         loader.add_value('doctor_intro', intro,
                          MapCompose(custom_remove_tags))
         loader.add_xpath(
             'doctor_goodAt',
             '//span[@class="titletxt11"]/b[contains(text(),"擅长")]/ancestor::span[1]/text()',
             MapCompose(remove_tags, custom_remove_tags))
         loader.add_value(
             'diagnosis_amt', fee,
             MapCompose(remove_tags, custom_remove_tags, get_number))
         loader.add_value('dataSource_from', self.data_source_from)
         loader.add_value('update_time', now_day())
         yield loader.load_item()
     except Exception as e:
         self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
Example 4
0
    def parse_doctor_info_detail(self, response):
        """Parse a doctor's detail page.

        Yields one DoctorInfoItem for the profile and, when the page
        carries a schedule table, one DoctorRegInfoItem per scheduled
        slot (column date label combined with the row time label).
        """
        hospital_name = response.meta.get('hospital_name')
        dept_name = response.meta.get('dept_name')
        self.logger.info('>>>>>>正在抓取[{}]医生详细信息>>>>>>'.format(hospital_name))
        try:
            # Extract the doctor's profile fields.
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_xpath(
                'doctor_name', '//td/b[contains(text(),"姓名")]/ancestor::td[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special))
            loader.add_value('dept_name', dept_name,
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_name',
                '//div[@class="page_position"]/a[last()-1]/text()',
                MapCompose(custom_remove_tags))
            loader.add_xpath(
                'sex', '//td/b[contains(text(),"性别")]/ancestor::td[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special,
                           clean_info2))
            loader.add_xpath(
                'doctor_level',
                '//td/b[contains(text(),"职称")]/ancestor::td[1]',
                MapCompose(remove_tags, custom_remove_tags, match_special,
                           clean_info2))
            loader.add_xpath(
                'doctor_intro',
                '//td/b[contains(text(),"医生简介")]/ancestor::td[1]',
                MapCompose(remove_tags, custom_remove_tags, clean_info2))
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            doctor_item = loader.load_item()
            yield doctor_item

            # Extract the doctor's schedule (registration) information.
            self.logger.info(
                '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
            has_doctor_scheduling = response.xpath(
                '//td/div[@class="doctor-work"]')
            if has_doctor_scheduling:
                doctor_scheduling_tr = response.xpath(
                    '//table[@class="workTable"]/tbody/tr')
                # Header cells after the first column are the date labels;
                # the first column holds the time-of-day label.
                all_scheduling_date = response.xpath(
                    '//table[@class="workTable"]/thead/tr/td[position()>1]'
                ).extract()
                scheduling_date_list = custom_remove_tags(
                    remove_tags(','.join(all_scheduling_date))).split(',')
                for each_td in doctor_scheduling_tr:
                    # Row label, e.g. the time slot for this row.
                    scheduling_time = each_td.xpath(
                        'td[1]/text()').extract_first('')
                    scheduling_info = each_td.xpath('td[position()>1]')
                    for index, each_s_i in enumerate(scheduling_info):
                        # A cell containing a <div> marks an available slot.
                        has_scheduling = each_s_i.xpath('div')
                        if has_scheduling:
                            # First 3 chars of the matching column header;
                            # presumably the day label — TODO confirm the
                            # header format against a live page.
                            each_scheduling_date = scheduling_date_list[index][
                                0:3]
                            reg_info = '{0}{1}'.format(each_scheduling_date,
                                                       scheduling_time)
                            reg_loader = CommonLoader2(
                                item=DoctorRegInfoItem(), response=response)
                            reg_loader.add_xpath(
                                'doctor_name',
                                '//td/b[contains(text(),"姓名")]/ancestor::td[1]',
                                MapCompose(remove_tags, custom_remove_tags,
                                           match_special))
                            reg_loader.add_value(
                                'dept_name', dept_name,
                                MapCompose(custom_remove_tags))
                            reg_loader.add_xpath(
                                'hospital_name',
                                '//div[@class="page_position"]/a[last()-1]/text()',
                                MapCompose(custom_remove_tags))
                            reg_loader.add_value('reg_info', reg_info)
                            reg_loader.add_value('dataSource_from',
                                                 self.data_source_from)
                            reg_loader.add_value('crawled_url', response.url)
                            reg_loader.add_value('update_time', now_day())
                            reg_item = reg_loader.load_item()
                            yield reg_item
        except Exception as e:
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))
Example 5
0
    def parse(self, response):
        """Walk one hospital list page.

        For each hospital: request the detail page and the department
        list page (both derived from the hospital id in the URL), then
        follow pagination back into this callback.
        """
        self.logger.info('>>>>>>正在抓取所有医院信息>>>>>>')
        all_hospitals = response.xpath(
            '//div[@class="serach-left-list"]/ul/li')
        # Province shown in the page header; forwarded via meta.
        hospital_pro = response.xpath(
            '//div[@id="yyk_header_location"]/strong/text()').extract_first('')
        for each_hospital in all_hospitals:
            each_hospital_link = each_hospital.xpath('a/@href').extract_first(
                '')
            each_hospital_name = each_hospital.xpath(
                'div[1]/div[1]/a/text()').extract_first('')
            if each_hospital_link:
                # Hospital id is the last path segment before ".html".
                hospital_id = re.search(r'.*/(.*?).html$', each_hospital_link)
                hospital_link = urljoin(self.host, each_hospital_link)
                if hospital_id:
                    hospital_id = hospital_id.group(1)

                    # Request the hospital detail (intro) page.
                    hospital_detail_url = '{0}{1}'.format(
                        hospital_id, self.hospital_postfix)
                    hospital_intro_link = urljoin(self.hospital_url,
                                                  hospital_detail_url)
                    self.headers['Referer'] = hospital_link
                    yield Request(hospital_intro_link,
                                  headers=self.headers,
                                  callback=self.parse_hospital_info,
                                  meta={
                                      'hospital_pro':
                                      hospital_pro,
                                      'hospital_name':
                                      custom_remove_tags(each_hospital_name)
                                  },
                                  dont_filter=True)

                    # Request the department list page.
                    dept_detail_url = '{0}{1}'.format(hospital_id,
                                                      self.dept_postfix)
                    dept_link = urljoin(self.hospital_url, dept_detail_url)
                    yield Request(urljoin(self.host, dept_link),
                                  headers=self.headers,
                                  callback=self.parse_hospital_dep,
                                  dont_filter=True,
                                  meta={'hospital_name': each_hospital_name})

            # Doctor-info entry point via the list page's "推荐专家" link,
            # disabled because some doctors there have no department.
            # doctors_link = each_hospital.xpath('div[@class="as"]/a[contains(text(),'
            #                                    '"推荐专家")]/@href').extract_first('')
            # if doctors_link:
            #     self.headers['Referer'] = response.url
            #     yield Request(urljoin(self.host, doctors_link),
            #                   headers=self.headers,
            #                   callback=self.parse_doctor_info,
            #                   dont_filter=True,
            #                   meta={'hospital_name': each_hospital_name})

        # Pagination: follow the "next page" link back into this callback.
        has_next = response.xpath(
            '//div[@class="next"]/a[contains(text(),"下一页")]/@href'
        ).extract_first('')
        if has_next:
            next_page_link = urljoin(self.host, has_next)
            self.headers['Referer'] = response.url
            yield Request(next_page_link,
                          headers=self.headers,
                          callback=self.parse)
Example 6
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page.

        Yields one HospitalInfoItem (level/category/insurance parsed
        out of the header badges, region parsed out of the address)
        and one HospitalAliasItem per alias listed under the title.
        """
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取:[{}]医院详细信息>>>>>>'.format(hospital_name))
        try:
            # Header badges hold level, category and insurance info,
            # joined here with '|' for the regex passes below.
            l_a_c = response.xpath(
                '//div[@class="l"]/h2/span/i/text()').extract()
            l_a_c = custom_remove_tags(remove_tags('|'.join(l_a_c)))
            h_l = h_c = m_t = None
            if l_a_c:

                # Hospital level (e.g. grade/class markers).
                level = re.search(r'(.*等|.*级|.*甲)', l_a_c)
                if level:
                    h_l = level.group(1).split('|')[-1]

                # Hospital category; '医保定点医院' is stripped first so it
                # is not mistaken for the category.
                category = re.search(r'(.*?医院)', l_a_c.replace('医保定点医院', ''))
                if category:
                    h_c = category.group(1).split('|')[-1]

                # Medical-insurance (designated-provider) type.
                medical_type = re.search(r'(.*定点)', l_a_c)
                if medical_type:
                    m_t = medical_type.group(1).split('|')[-1]
            else:
                h_l = h_c = None

            # Derive province / city / county from the address text.
            hospital_pro = response.meta.get('hospital_pro')
            hospital_city = hospital_county = None
            h_a = response.xpath(
                '//dt[contains(text(),"地址")]/ancestor::dl[1]/dd').extract()
            hospital_address = custom_remove_tags(
                remove_tags(''.join(h_a).replace('查看地图', '')))
            if hospital_pro and hospital_address:
                if hospital_pro in MUNICIPALITY2:
                    # Municipality: the "province" is really the city.
                    hospital_city = hospital_pro
                    hospital_pro = ''
                    hos_c = hospital_city.replace('市', '')
                    useless_info = '{}{}|{}'.format(hos_c, '市', hos_c)
                    single_address = match_special2(
                        hospital_address.split(';')[0])
                    hospital_county = get_county2(useless_info, single_address)
                else:
                    hos_p = hospital_pro
                    hospital_pro = '{0}{1}'.format(hospital_pro, '省')
                    single_address = match_special2(
                        hospital_address.split(';')[0])
                    hospital_city = get_city(hospital_pro, single_address)
                    if hospital_city:
                        hos_c = hospital_city.replace('市', '')
                        useless_info = '{}|{}|{}|{}'.format(
                            hospital_pro, hos_p, hospital_city, hos_c)
                        hospital_county = get_county2(useless_info,
                                                      single_address)

            # Public vs. private: the "国营" (state-run) badge means public.
            h_t = custom_remove_tags(
                response.xpath(
                    '//li/b[contains(text(),"国营")]/text()').extract_first(''))
            hospital_type = '公立' if h_t == '国营' else ''

            # Assemble the hospital info item.
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name', '//div[@class="l"]/h2/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_level', h_l,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_type', hospital_type)
            loader.add_value('hospital_category', h_c,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', hospital_pro,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_city', hospital_city,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_phone',
                             '//dt[contains(text(),"电话")]/ancestor::dl[1]/dd',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_xpath(
                'hospital_intro',
                '//dt/strong[contains(text(),"简介")]/ancestor::dl[1]/dd',
                MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('medicare_type', m_t,
                             MapCompose(custom_remove_tags))
            # loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())
            hospital_item = loader.load_item()
            yield hospital_item

            # Emit one alias item per alias (skipping the primary name).
            hospital_alias = response.xpath(
                '//div[@class="l"]/p/text()').extract_first('')
            if hospital_alias:
                h_s = custom_remove_tags(hospital_alias)
                if h_s:
                    all_hospital_alias = h_s.split(',')
                    for each_alias in all_hospital_alias:
                        if each_alias != hospital_name:
                            alias_loader = CommonLoader2(
                                item=HospitalAliasItem(), response=response)
                            alias_loader.add_xpath(
                                'hospital_name', '//div[@class="l"]/h2/text()',
                                MapCompose(custom_remove_tags))
                            alias_loader.add_value(
                                'hospital_alias_name', each_alias,
                                MapCompose(custom_remove_tags, match_special))
                            alias_loader.add_value('dataSource_from',
                                                   self.data_source_from)
                            alias_loader.add_value('update_time', now_day())
                            alias_item = alias_loader.load_item()
                            yield alias_item
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example 7
0
    def parse_hospital_info(self, response):
        """Scrape one hospital's profile and its department listing."""
        self.logger.info('>>>>>>正在抓取:医院详细信息和科室信息>>>>>>')
        try:
            # --- hospital profile ---
            h_type = response.meta.get('hospital_type')
            h_category = f'{h_type}医院' if h_type else None
            info_text = custom_remove_tags(
                remove_tags(''.join(
                    response.xpath('//td[@class='
                                   '"title_yh14"]').extract())))
            # Slice address / phone / intro out of the flattened text.
            addr = get_hospital_info(info_text, '地址:', '电话:')
            if addr:
                addr = addr.replace('查看地图', '')
            else:
                addr = None
            phone = get_hospital_info(info_text, '电话:', '官网')
            intro = get_hospital_info(info_text, '简介:', '$')
            if intro:
                intro = intro.replace('...更多>>', '')
            else:
                intro = None
            info_loader = CommonLoader2(item=HospitalInfoItem(),
                                        response=response)
            info_loader.add_xpath('hospital_name',
                                  '//span[@class="title"]/text()',
                                  MapCompose(custom_remove_tags))
            info_loader.add_xpath('hospital_level',
                                  '//span[@class="dj"]/text()',
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_category', h_category)
            info_loader.add_value('hospital_addr', addr,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_pro', '山西省')
            info_loader.add_xpath(
                'hospital_city',
                '//td[contains(text(),"山西")]/ancestor::tr[1]/td[1]/a[1]/text()',
                MapCompose(custom_remove_tags))
            info_loader.add_xpath(
                'hospital_county',
                '//td[contains(text(),"山西")]/ancestor::tr[1]/td[1]/a[2]/text()',
                MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_phone', phone,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('hospital_intro', intro,
                                  MapCompose(custom_remove_tags))
            info_loader.add_value('registered_channel', self.data_source_from)
            info_loader.add_value('dataSource_from', self.data_source_from)
            info_loader.add_value('update_time', now_day())
            yield info_loader.load_item()

            # --- departments: one item per department link per row ---
            self.logger.info('>>>>>>正在抓取科室详细信息>>>>>>')
            for row in response.xpath('//tr[@class="h_bottom"]'):
                d_type = row.xpath('td[1]/text()').extract_first('')
                names = row.xpath('td[2]/table/tr/td/a/text()').extract()
                for d_name in names:
                    d_loader = CommonLoader2(item=HospitalDepItem(),
                                             response=response)
                    d_loader.add_value(
                        'dept_name', d_name,
                        MapCompose(custom_remove_tags, match_special2))
                    d_loader.add_value(
                        'dept_type', d_type,
                        MapCompose(custom_remove_tags, match_special2))
                    d_loader.add_xpath('hospital_name',
                                       '//span[@class="title"]/text()',
                                       MapCompose(custom_remove_tags))
                    d_loader.add_value('dept_info', '')
                    d_loader.add_value('dataSource_from',
                                       self.data_source_from)
                    d_loader.add_value('update_time', now_day())
                    yield d_loader.load_item()
        except Exception as e:
            self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
Example 8
0
    def parse_hospital_info(self, response):
        """Parse a hospital detail page.

        Builds (but does not load) the hospital item loader, forwards it
        via meta to ``parse_hospital_detail_info`` for the intro text,
        and yields one HospitalDepItem per listed department.
        """
        self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
        try:
            # Extract the hospital information.

            # Level and district come from one combined header line,
            # split on the full-width colon.
            hospital_info = response.xpath(
                '//p[@class="yygh_box_top_p2"]').extract()
            hospital_info2 = custom_remove_tags(
                remove_tags(''.join(hospital_info)))
            hospital_level = hospital_info2.split(':')[1].replace('区域', '')
            hospital_county = hospital_info2.split(':')[2].replace('分类', '')
            if hospital_level:
                # Split "level + category" (e.g. grade marker followed by
                # the hospital category text).
                res = re.search(r'(.*等|.*级|.*合格|.*甲)(.*?)$', hospital_level)
                if res:
                    h_l = res.group(1)
                    h_c = res.group(2)
                    if h_c:
                        # Normalize the category to end in '医院'.
                        h_c_2 = re.sub(r'合格|医院', '', h_c)
                        if h_c_2:
                            h_c = '{0}{1}'.format(h_c_2, '医院')
                else:
                    h_l = h_c = None
            else:
                h_l = h_c = None
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name',
                             '//p[@class="yygh_box_top_p"]/strong/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_level', h_l,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_category', h_c)
            loader.add_xpath(
                'hospital_addr',
                '//span[@class="yygh_box_con_dl_span1"]/ancestor::dl[1]/dd[1]/p/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', '')
            loader.add_value('hospital_city', '北京市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'hospital_phone',
                '//span[@class="yygh_box_con_dl_span3"]/ancestor::dl[1]/dd[1]/p/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())

            # Fetch the hospital intro page; the loader travels in meta
            # and is finished by the callback.
            # NOTE(review): if the intro link is missing, the loader is
            # never loaded/yielded, so the hospital item is dropped —
            # confirm this is intended.
            hospital_intro_link = response.xpath(
                '//a[contains(text(),"医院介绍")]/@href').extract_first('')
            if hospital_intro_link:
                hospital_intro_link = urljoin(self.host, hospital_intro_link)
                self.headers['Referer'] = response.url
                yield Request(hospital_intro_link,
                              headers=self.headers,
                              callback=self.parse_hospital_detail_info,
                              meta={'loader': loader})

            # Extract the department information, grouped by type.
            all_dept_links = response.xpath('//div[@class="kfyuks_yyksbox"]')
            for each_dept_link in all_dept_links:
                dept_type = each_dept_link.xpath(
                    'div[1]/text()').extract_first('')
                dept_info = each_dept_link.xpath(
                    'div[2]/div/ul/li/a/text()').extract()
                for each_dept_info in dept_info:
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', each_dept_info,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_xpath(
                        'hospital_name',
                        '//p[@class="yygh_box_top_p"]/strong/text()',
                        MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_info', '')
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_loader.add_value('update_time', now_day())
                    dept_item = dept_loader.load_item()
                    yield dept_item
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
Example 9
0
    def parse_hospital_info(self, response):
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院信息和科室信息>>>>>>'.format(hospital_name))
        hospital_city = response.xpath(
            '//div[@class="jieshao_zi"]/p[4]/text()').extract()
        if hospital_city:
            hospital_address = custom_remove_tags(''.join(hospital_city))
            hospital_city2 = get_city(hospital_address)
            useless_info = '中国|湖南省|湖南|{}'.format(hospital_city2)
            hospital_county = get_county2(useless_info, hospital_address)
        else:
            hospital_county = None
        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath('hospital_name',
                         '//div[@class="jieshao_zi"]/p/font/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_level',
                         '//div[@class="jieshao_zi"]/p[2]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_type', '公立')
        loader.add_value('hospital_category', '')
        loader.add_xpath('hospital_addr',
                         '//div[@class="jieshao_zi"]/p[4]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_pro', '湖南省')
        loader.add_xpath('hospital_city',
                         '//div[@class="jieshao_zi"]/p[4]/text()',
                         MapCompose(custom_remove_tags, get_city))
        loader.add_value('hospital_county', hospital_county)
        loader.add_xpath('hospital_phone',
                         '//div[@class="jieshao_zi"]/p[3]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'hospital_intro', '//div[@id="starlist"]',
            MapCompose(remove_tags, custom_remove_tags, clean_info))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('update_time', now_day())
        hospital_info_item = loader.load_item()
        yield hospital_info_item

        # 获取科室信息
        self.logger.info('>>>>>>正在抓取[{}]科室信息>>>>>>'.format(hospital_name))
        dept_links = response.xpath('//div[@class="xuanze_kslb"]')
        if dept_links:
            for each_dept_link in dept_links:
                dept_type = each_dept_link.xpath(
                    'div[1]/ul/li/text()').extract_first('')
                all_dept_links = each_dept_link.xpath('div[2]/ul/li/a')
                for dept_link in all_dept_links:
                    # dept_name = dept_link.xpath('text()').extract_first('')
                    data_info = dept_link.xpath('@onclick').extract_first('')
                    if data_info:
                        data_info = ''.join(re.findall(r'\S+', data_info))
                        is_sp_time = re.search(r'isSpTime:\'(.*?)\'',
                                               data_info)
                        pay_mode = re.search(r'paymode:\'(.*?)\'', data_info)
                        dept_id = re.search(r'platformDeptId:\'(.*?)\'',
                                            data_info)
                        hos_id = re.search(r'platformHosId:\'(.*?)\'',
                                           data_info, S)
                        dept_name = re.search(r'tempDeptName:\'(.*?)\'',
                                              data_info, S)
                        org_name = re.search(r'orgname:\'(.*?)\'', data_info,
                                             S)
                        if dept_id and hos_id and dept_name and org_name:
                            is_sp_time = is_sp_time.group(1)
                            pay_mode = pay_mode.group(1)
                            dept_id = dept_id.group(1)
                            hos_id = hos_id.group(1)
                            dept_name = dept_name.group(1)
                            org_name = org_name.group(1)
                            data = {
                                'isSpTime': str(is_sp_time),
                                'paymode': quote(pay_mode),
                                'doctorCollectResult': '',
                                'platformDeptId': str(dept_id),
                                'orgname': quote(org_name),
                                'tempDeptName': quote(dept_name),
                                'platformHosId': str(hos_id),
                                'platformDoctorId': ''
                            }
                            self.headers.update({
                                'Content-Type':
                                'application/x-www-form-urlencoded',
                                'Origin': 'http://www.hnyygh.com',
                                'Referer':
                                'http://www.hnyygh.com/searchDeptmentAction.action',
                                'Pragma': 'no-cache'
                            })
                            splash_args = {
                                'url': self.dept_detail_url,
                                'headers': self.headers,
                                'lua_source': self.dept_script,
                                'data': data
                            }
                            yield SplashRequest(
                                self.dept_detail_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=self.headers,
                                callback=self.parse_hospital_dep_detail,
                                meta={
                                    'dept_type': dept_type,
                                    'dept_name': dept_name,
                                    'hospital_name': org_name
                                })
                            # 获取医生信息
                            data = {
                                'platformDeptId': dept_id,
                                'platformHosId': hos_id,
                                'platformDoctorId': '',
                                'nextNumInfo': '0'
                            }
                            self.headers.update({
                                'Content-Type':
                                'application/x-www-form-urlencoded',
                                'Origin':
                                'http://www.hnyygh.com',
                                'Referer':
                                'http://www.hnyygh.com/searchOrderNumInfoAction.action'
                            })
                            splash_args = {
                                'url': self.doctor_url,
                                'headers': self.headers,
                                'lua_source': self.dept_script,
                                'data': data
                            }
                            yield SplashRequest(
                                self.doctor_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=self.headers,
                                callback=self.parse_doctor_info,
                                meta={
                                    'dept_type': dept_type,
                                    'dept_name': dept_name,
                                    'dept_id': dept_id,
                                    'hospital_name': org_name
                                })
Esempio n. 10
0
    def parse_area(self, response):
        """Parse one province/city hospital-list page.

        Reads the breadcrumb and page heading to derive province / city /
        county, then:
          * for hospitals that have their own final page, yields a Request
            handled by ``parse_hospital_detail``;
          * for hospitals listed only inline ("special" entries), yields a
            HospitalInfoTestItem plus per-department HospitalDepItem items
            and HospitalAliasItem items for any parenthesised alias names.
        """
        hospital_city = response.meta.get('area_city', '默认城市')
        self.logger.info('>>>>>>正在抓取[{}]医院列表……>>>>>>'.format(hospital_city))

        # Derive province / city / county from the breadcrumb + page heading.
        municipality = ['北京市', '上海市', '重庆市', '天津市']
        pro_or_city = response.xpath(
            '//table[@class="nav"]/tr/'
            'td/a[3]/text()').extract_first('').replace('医院列表', '')
        if pro_or_city:
            if pro_or_city.strip() in municipality:
                # Municipality: no province; the heading is city + district.
                hos_prov = ''
                hos_city = pro_or_city
                hos_county = response.xpath('//h1[@id="firstHeading"]/text()'
                                            ).extract_first('').replace(
                                                hos_city, '')
            else:
                # Regular province: county is resolved per hospital below.
                hos_prov = pro_or_city
                hos_city = response.xpath('//h1[@id="firstHeading"]'
                                          '/text()').extract_first('').replace(
                                              '医院列表',
                                              '').replace(hos_prov, '')
                hos_county = ''
        else:
            hos_prov = hos_city = hos_county = None

        # Hospitals with a final detail page vs. inline-only entries.
        all_hospital_list2 = response.xpath(
            '//h2/span[contains(text(),"医院列表")]/'
            'following::ul[1]/li/b/a[not(contains(@href,"index"))]')
        special_hospital_list = response.xpath(
            '//h2/span[contains(text(),"医院列表")]/'
            'following::ul[1]/li/b/a[(contains(@href,"index"))]/ancestor::li[1]'
        )
        area_hos_cnt = len(all_hospital_list2) + len(special_hospital_list)
        self.logger.info('>>>>>>[{}]总共有{}家医院……>>>>>>'.format(
            hospital_city, str(area_hos_cnt)))
        self.total_hospital_cnt += area_hos_cnt
        # NOTE(review): this re-registers the spider_closed handler on every
        # parsed page; consider moving the connect() into from_crawler or
        # __init__ so it runs exactly once — verify against the dispatcher's
        # duplicate-connection behaviour before changing.
        self.crawler.signals.connect(self.output_statistics,
                                     signals.spider_closed)
        try:
            # Hospitals that have their own detail page: follow the link.
            for each_hospital in all_hospital_list2:
                hospital_name = each_hospital.xpath('text()').extract_first('')
                hospital_link = each_hospital.xpath('@href').extract_first('')
                self.headers['Referer'] = response.url
                yield Request(urljoin(self.host, hospital_link),
                              headers=self.headers,
                              callback=self.parse_hospital_detail,
                              meta={'hospital_name': hospital_name},
                              dont_filter=True)
            # Inline-only hospitals: scrape everything from this list page.
            for each_special_hospital in special_hospital_list:
                hospital_name = each_special_hospital.xpath(
                    'b/a/text()').extract_first('')
                hospital_url = each_special_hospital.xpath(
                    'b/a/@href').extract_first('')
                hospital_address = each_special_hospital.xpath(
                    'ul[1]/li/b[contains(text(),'
                    '"医院地址")]/ancestor::li[1]/text()').extract_first('')
                # Fix: resolve the county into a per-iteration local.  The
                # original assigned back into ``hos_county``, so the first
                # hospital's county (from get_county) was cached and reused
                # for every later hospital on the page.
                hospital_county = hos_county if hos_county else get_county(
                    hos_prov, hos_city, hospital_address)
                loader = CommonLoader2(item=HospitalInfoTestItem(),
                                       selector=each_special_hospital)
                loader.add_value('hospital_name', hospital_name)
                loader.add_xpath(
                    'hospital_level',
                    'ul[1]/li/b[contains(text(),"医院等级")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_category',
                    'ul[1]/li/b[contains(text(),"医院类型")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_addr',
                    'ul[1]/li/b[contains(text(),"医院地址")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_value('hospital_pro', hos_prov,
                                 MapCompose(custom_remove_tags, match_special))
                loader.add_value('hospital_city', hos_city,
                                 MapCompose(custom_remove_tags, match_special))
                loader.add_value('hospital_county', hospital_county,
                                 MapCompose(custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_phone',
                    'ul[1]/li/b[contains(text(),"联系电话")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_value('hospital_intro', '')
                loader.add_xpath(
                    'hospital_postcode',
                    'ul[1]/li/b[contains(text(),"邮政编码")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_email',
                    'ul[1]/li/b[contains(text(),"电子邮箱")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_website',
                    'ul[1]/li/b[contains(text(),"医院网站")]/ancestor::li[1]/'
                    'a[not(contains(@href,"http://www.a-hospital.com"))]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'hospital_fax',
                    'ul[1]/li/b[contains(text(),"传真号码")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_xpath(
                    'operation_mode',
                    'ul[1]/li/b[contains(text(),"经营方式")]/ancestor::li[1]',
                    MapCompose(remove_tags, custom_remove_tags, match_special))
                loader.add_value('hospital_url',
                                 urljoin(self.host, hospital_url))
                loader.add_value('dataSource_from', '医学百科')
                loader.add_value('update_time', now_day())
                hospital_info_item = loader.load_item()
                yield hospital_info_item

                # Key departments ("重点科室"): one item per 、-separated name.
                dept_info = each_special_hospital.xpath(
                    'ul[1]/li/b[contains(text(),"重点科室")]/ancestor::li[1]')
                all_dept_info = match_special(
                    dept_info.xpath('string(.)').extract_first(''))
                if all_dept_info:
                    for each_dept in all_dept_info.split('、'):
                        dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                    response=response)
                        dept_loader.add_value('dept_name', each_dept,
                                              MapCompose(custom_remove_tags))
                        dept_loader.add_value(
                            'hospital_name', hospital_name,
                            MapCompose(custom_remove_tags, match_special2))
                        dept_loader.add_value('update_time', now_day())
                        dept_item = dept_loader.load_item()
                        yield dept_item

                # Alias names: extracted from parentheses after the name.
                hospital_name2 = each_special_hospital.xpath(
                    'b/text()').extract_first('')
                if hospital_name2 and '(' in hospital_name2:
                    # Fix: escape the parentheses (and accept the full-width
                    # variants); the original pattern '((.*?))' used them as
                    # grouping metacharacters and only ever matched an empty
                    # string, yielding blank alias names.
                    alias_name = re.search(r'[(（](.*?)[)）]',
                                           custom_remove_tags(hospital_name2))
                    if alias_name:
                        for each_alias_name in alias_name.group(1).split('、'):
                            alias_loader = CommonLoader2(
                                item=HospitalAliasItem(), response=response)
                            alias_loader.add_value(
                                'hospital_name', hospital_name,
                                MapCompose(custom_remove_tags, match_special2))
                            alias_loader.add_value('hospital_alias_name',
                                                   each_alias_name)
                            alias_loader.add_value('update_time', now_day())
                            alias_item = alias_loader.load_item()
                            yield alias_item
        except Exception as e:
            self.logger.error('抓取[{}]医院列表的时候出错了,原因是:{}'.format(
                hospital_city, repr(e)))
Esempio n. 11
0
    def parse_doctor_info_detail(self, response):
        """Parse a doctor detail page.

        Yields one DoctorInfoItem with the doctor's profile, and — when the
        page shows bookable appointment slots — one DoctorRegInfoItem per
        bookable (date, time-slot) cell in the scheduling table.

        Expects ``hospital_name``, ``dept_name`` and ``doctor_name`` in
        ``response.meta`` (set by the request that led here).
        """
        hospital_name = response.meta.get('hospital_name')
        dept_name = response.meta.get('dept_name')
        doctor_name = response.meta.get('doctor_name')
        self.logger.info('>>>>>>正在抓取[{}]医院-[{}]医生详细信息>>>>>>'.format(
            hospital_name, doctor_name))
        try:
            # Doctor profile: photo, name, level, intro, plus ids derived
            # from the page URL and the breadcrumb links.
            doctor_photo_url = response.xpath(
                '//div[@class="doctor_Img"]/img/@src').extract_first('')
            loader = CommonLoader2(item=DoctorInfoItem(), response=response)
            loader.add_value('doctor_name', doctor_name,
                             MapCompose(custom_remove_tags))
            loader.add_value('dept_name', dept_name,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_name', hospital_name,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('sex', '//span[@class="doctor_grade"]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_xpath('doctor_level',
                             '//span[@class="object_grade"]/text()',
                             MapCompose(custom_remove_tags))
            loader.add_xpath(
                'doctor_intro', '//div[@class="doctor_Text_Major"]',
                MapCompose(remove_tags, custom_remove_tags, match_special2))
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            # doctor_id / dept_id / hospital_id are cut out of URLs by
            # match_special2 (page URL and the last breadcrumb links).
            loader.add_value('doctor_id', response.url,
                             MapCompose(match_special2))
            loader.add_xpath(
                'dept_id', '//div[@class="position_one"]/span/a[last()]/@href',
                MapCompose(match_special2))
            loader.add_xpath(
                'hospital_id',
                '//div[@class="position_one"]/span/a[last()-1]/@href',
                MapCompose(match_special2))
            loader.add_value('doctor_photo_url',
                             urljoin(self.host, doctor_photo_url))
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            doctor_item = loader.load_item()
            yield doctor_item

            # Scheduling info: only present when the page shows a "预约"
            # (book) link.
            self.logger.info(
                '>>>>>>正在抓取[{}]医生排班信息>>>>>>'.format(hospital_name))
            has_doctor_scheduling = response.xpath(
                '//span[@class="yuyue"]/a[contains(text(),"预约")]')
            if has_doctor_scheduling:
                # First row of the table holds the time-slot labels (rows);
                # the date table header holds the column dates.
                doctor_scheduling_list = response.xpath(
                    '//div[@class="whliesubscribe"]/ul/li[1]/div/'
                    'span/text()').extract()
                doctor_scheduling_length = len(doctor_scheduling_list)
                all_scheduling_date = response.xpath(
                    '//div[@class="datetable"]/ul/li[position()>1]/'
                    'span[1]/text()').extract()
                scheduling_date_list = custom_remove_tags(
                    remove_tags(','.join(all_scheduling_date))).split(',')
                # Walk the grid: i selects the time-slot row, index selects
                # the date column.
                for i in range(1, doctor_scheduling_length + 1):
                    scheduling_info = response.xpath(
                        '//div[@class="whliesubscribe"]/ul/li[position()>1]'
                        '/div[{}]'.format(str(i)))
                    scheduling_time = doctor_scheduling_list[i - 1]
                    for index, each_s_i in enumerate(scheduling_info):
                        # A cell is bookable iff it contains a link.
                        has_scheduling = each_s_i.xpath('span/a')
                        if has_scheduling:
                            # NOTE(review): assumes scheduling_date_list has
                            # an entry for every column — an IndexError here
                            # would be swallowed by the broad except below,
                            # silently dropping remaining items; verify the
                            # page layout guarantees equal lengths.
                            each_scheduling_date = scheduling_date_list[index]
                            # Dates on the page omit the year; prepend the
                            # current year and append the time-slot label.
                            reg_info = '{0}-{1}{2}'.format(
                                now_year(), each_scheduling_date,
                                scheduling_time)
                            reg_loader = CommonLoader2(
                                item=DoctorRegInfoItem(), response=response)
                            reg_loader.add_value(
                                'doctor_name', doctor_name,
                                MapCompose(custom_remove_tags))
                            reg_loader.add_value(
                                'dept_name', dept_name,
                                MapCompose(custom_remove_tags))
                            reg_loader.add_value(
                                'hospital_name', hospital_name,
                                MapCompose(custom_remove_tags))
                            reg_loader.add_value('reg_info', reg_info)
                            reg_loader.add_value('dataSource_from',
                                                 self.data_source_from)
                            reg_loader.add_value('crawled_url', response.url)
                            reg_loader.add_value('update_time', now_day())
                            reg_loader.add_value('doctor_id', response.url,
                                                 MapCompose(match_special2))
                            reg_loader.add_xpath(
                                'dept_id',
                                '//div[@class="position_one"]/span/a[last()]/@href',
                                MapCompose(match_special2))
                            reg_loader.add_xpath(
                                'hospital_id',
                                '//div[@class="position_one"]/span/a[last()-1]/@href',
                                MapCompose(match_special2))
                            reg_loader.add_value('gmt_created', now_time())
                            reg_loader.add_value('gmt_modified', now_time())
                            reg_item = reg_loader.load_item()
                            yield reg_item
        except Exception as e:
            self.logger.error('在抓取医生详细信息的过程中出错了,原因是:{}'.format(repr(e)))