Code Example #1
File: nj12320.py  Project: git-wsf/crawler_project
    def parse_hospital_info(self, response):
        self.logger.info('>>>>>>正在抓取:医院信息>>>>>>')

        try:
            # Extract the district or county
            hospital_address = response.xpath(
                '//div[@class="yy_js clearfix"]/div/dl/dd[1]/text()'
            ).extract_first('')
            if hospital_address:
                hospital_county = get_county2('中国|江苏省|江苏|南京市|南京',
                                              hospital_address)
            else:
                hospital_county = None

            # Build the hospital info item
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name',
                             '//div[@class="yy_til"]/h2/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_level',
                             response.meta.get('hospital_level'),
                             MapCompose(custom_remove_tags, clean_info))
            loader.add_xpath(
                'hospital_addr',
                '//div[@class="yy_js clearfix"]/div/dl/dd[1]/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', '江苏省')
            loader.add_value('hospital_city', '南京市')
            loader.add_value('hospital_county', hospital_county)
            loader.add_xpath(
                'hospital_phone',
                '//div[@class="yy_js clearfix"]/div/dl/dd[2]/text()',
                MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_intro', '//dd[@id="wrap"]',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())
            hospital_info_item = loader.load_item()
            yield hospital_info_item

            # Follow each department link for department details
            # self.logger.info('>>>>>>正在抓取{}:科室详细信息>>>>>>')
            all_dept_links = response.xpath(
                '//dl[@class="kfyy clearfix"]/dd/span/a/@href').extract()
            for each_dept_link in all_dept_links:
                dept_link = urljoin(
                    self.host,
                    re.sub(r';jsessionid=(.*?)\?', '?', each_dept_link))
                self.headers['Referer'] = response.url
                yield Request(dept_link,
                              headers=self.headers,
                              callback=self.parse_hospital_dep_detail)
        except Exception as e:
            self.logger.error('在抓取医院详细信息和科室的过程中出错了,原因是:{}'.format(repr(e)))
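The snippets on this page rely on project helpers that are not shown here (get_county2, custom_remove_tags, clean_info, CommonLoader2, now_day, and so on). As a rough idea of what get_county2('中国|江苏省|江苏|南京市|南京', address) is expected to do, namely strip the listed country/province/city prefixes and return the district (区) or county (县) part of the address, here is a minimal sketch; the real implementation in crawler_project may well differ.

import re


def get_county2(useless_info, address):
    # Minimal sketch, not the project's actual code: drop the pipe-separated
    # prefixes, then keep the leading "...区" or "...县" part of what remains.
    for token in useless_info.split('|'):
        address = address.replace(token, '')
    matched = re.match(r'([^区县]{1,10}[区县])', address)
    return matched.group(1) if matched else None


# get_county2('中国|江苏省|江苏|南京市|南京', '江苏省南京市鼓楼区中山路321号')  # -> '鼓楼区'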
Code Example #2
 def parse_hospital_info(self, response):
     self.logger.info('>>>>>>正在抓取:医院详细信息>>>>>>')
     try:
         # Parse the hospital info from the JSON response
         hospital_info = json.loads(response.text)
         # Map the numeric level code to a label
         hospital_level_info = hospital_info.get('HIS_LVL')
         if hospital_level_info == '3':
             hospital_level = '三级'
         elif hospital_level_info == '2':
             hospital_level = '二级'
         elif hospital_level_info == '1':
             hospital_level = '一级'
         else:
             hospital_level = None
         # Extract the hospital's district or county
         hospital_address = hospital_info.get('HIS_AD')
         if hospital_address:
             hospital_county = get_county2('中国|广东省|广东|广州市|广州',
                                           hospital_address)
         else:
             hospital_county = None
         loader = CommonLoader2(item=HospitalInfoItem(), response=response)
         loader.add_value('hospital_name', hospital_info.get('HIS_NM'))
         loader.add_value('hospital_level', hospital_level)
         loader.add_value('hospital_category', '')
         loader.add_value('hospital_addr', hospital_address)
         loader.add_value('hospital_pro', '广东省')
         loader.add_value('hospital_city', '广州市')
         loader.add_value('hospital_county', hospital_county)
         loader.add_value('hospital_phone', hospital_info.get('TEL_NO'))
         loader.add_value('hospital_intro', hospital_info.get('HIS_RM'))
         loader.add_value('registered_channel', self.data_source_from)
         loader.add_value('dataSource_from', self.data_source_from)
         loader.add_value('hospital_url', response.url)
         loader.add_value('update_time', now_day())
         hospital_info_item = loader.load_item()
         yield hospital_info_item
     except Exception as e:
         self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
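CommonLoader2 and HospitalInfoItem are also project classes that never appear on this page. A plausible minimal stand-in, assuming a recent Scrapy where the loader processors live in the itemloaders package, is sketched below; the field list is reconstructed from the add_value/add_xpath calls above and is not the project's full item definition.

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst


class HospitalInfoItem(scrapy.Item):
    # Fields inferred from the loaders in these examples; the real item has more.
    hospital_name = scrapy.Field()
    hospital_level = scrapy.Field()
    hospital_category = scrapy.Field()
    hospital_addr = scrapy.Field()
    hospital_pro = scrapy.Field()
    hospital_city = scrapy.Field()
    hospital_county = scrapy.Field()
    hospital_phone = scrapy.Field()
    hospital_intro = scrapy.Field()
    registered_channel = scrapy.Field()
    dataSource_from = scrapy.Field()
    hospital_url = scrapy.Field()
    update_time = scrapy.Field()


class CommonLoader2(ItemLoader):
    # TakeFirst keeps each field a single value instead of a list, which
    # matches how the loaded items are yielded in the examples.
    default_output_processor = TakeFirst()

MapCompose, used throughout these examples, comes from the same processors module and runs the given cleanup functions over each extracted value before the output processor is applied.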
Code Example #3
File: zhyygh.py  Project: git-wsf/crawler_project
 def parse_hospital_info(self, response):
     self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
     try:
         hospital_address = response.xpath(
             '//b[contains(text(),"医院地址")]/'
             'ancestor::td[1]/text()').extract_first('')
         hospital_county = get_county2('中国|广东省|广东|珠海市|珠海', hospital_address)
         loader = CommonLoader2(item=HospitalInfoItem(), response=response)
         loader.add_xpath(
             'hospital_name',
             '//b[contains(text(),"医院全称")]/ancestor::td[1]/text()',
             MapCompose(custom_remove_tags))
         loader.add_xpath(
             'hospital_level',
             '//b[contains(text(),"医院级别")]/ancestor::td[1]/text()',
             MapCompose(custom_remove_tags))
         loader.add_value('hospital_addr', hospital_address,
                          MapCompose(custom_remove_tags))
         loader.add_value('hospital_pro', '广东省')
         loader.add_value('hospital_city', '珠海市')
         loader.add_value('hospital_county', hospital_county,
                          MapCompose(custom_remove_tags))
         loader.add_xpath(
             'hospital_phone',
             '//b[contains(text(),"联系电话")]/ancestor::td[1]/text()',
             MapCompose(custom_remove_tags))
         loader.add_xpath(
             'hospital_intro', '//b[contains(text(),"简介")]/ancestor::td[1]',
             MapCompose(remove_tags, custom_remove_tags, match_special))
         loader.add_value('registered_channel', self.data_source_from)
         loader.add_value('dataSource_from', self.data_source_from)
         loader.add_value('crawled_url', response.url)
         loader.add_value('update_time', now_day())
         hospital_item = loader.load_item()
         yield hospital_item
     except Exception as e:
         self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
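This spider anchors every field on a visible label with XPath of the form //b[contains(text(),"...")]/ancestor::td[1]. The pattern can be tried outside Scrapy with parsel, the selector library Scrapy builds on; the HTML below is invented purely for illustration.

from parsel import Selector

html = '''
<table>
  <tr><td><b>医院全称:</b>某某人民医院</td></tr>
  <tr><td><b>医院地址:</b>香洲区某某路1号</td></tr>
</table>
'''
sel = Selector(text=html)

# Find the <b> label, climb to its enclosing <td>, then take the td's own text
name = sel.xpath('//b[contains(text(),"医院全称")]/ancestor::td[1]/text()').get('')
addr = sel.xpath('//b[contains(text(),"医院地址")]/ancestor::td[1]/text()').get('')
print(name, addr)  # 某某人民医院 香洲区某某路1号

get('') plays the same role as extract_first('') in the spider: it returns the first matching text node or the supplied default.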
Code Example #4
File: zsyjjkw.py  Project: git-wsf/crawler_project
    def parse(self, response):
        self.logger.info('>>>>>>正在抓取所有医院信息>>>>>>')
        all_hospitals = json.loads(response.text)
        for each_hospital in all_hospitals.get('list'):
            hospital_name = each_hospital.get('hospitalname')
            hospital_address = each_hospital.get('address')
            hospital_county = get_county2('中国|广东省|广东|中山市|中山', hospital_address)
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_value('hospital_name', hospital_name,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', '广东省')
            loader.add_value('hospital_city', '中山市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_phone',
                             each_hospital.get('telephoneno'),
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_intro',
                             each_hospital.get('information'),
                             MapCompose(custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())
            hospital_item = loader.load_item()
            yield hospital_item

            # Follow up for department and doctor info
            hospital_id = each_hospital.get('hospitalid')
            if hospital_id:
                self.headers['Referer'] = self.entry_url
                yield Request(self.hospital_detail_url.format(hospital_id),
                              headers=self.headers,
                              callback=self.parse_hospital_dep,
                              meta={'hospital_name': hospital_name})
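For reference, the JSON this parse method appears to consume has roughly the shape below; the key names come from the .get() calls above, while the values are invented. The spider then formats hospitalid into self.hospital_detail_url to request each hospital's department page.

example_response = {
    "list": [
        {
            "hospitalid": "1001",
            "hospitalname": "中山市某某医院",
            "address": "中山市某某路2号",
            "telephoneno": "0760-00000000",
            "information": "医院简介",
        }
    ]
}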
Code Example #5
    def parse_hospital_info(self, response):
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取:[{}]医院详细信息>>>>>>'.format(hospital_name))
        try:
            # Extract hospital level and category
            l_a_c = response.xpath(
                '//div[@class="l"]/h2/span/i/text()').extract()
            l_a_c = custom_remove_tags(remove_tags('|'.join(l_a_c)))
            h_l = h_c = m_t = None
            if l_a_c:

                # Level
                level = re.search(r'(.*等|.*级|.*甲)', l_a_c)
                if level:
                    h_l = level.group(1).split('|')[-1]

                # Category
                category = re.search(r'(.*?医院)', l_a_c.replace('医保定点医院', ''))
                if category:
                    h_c = category.group(1).split('|')[-1]

                # Medicare type
                medical_type = re.search(r'(.*定点)', l_a_c)
                if medical_type:
                    m_t = medical_type.group(1).split('|')[-1]
            else:
                h_l = h_c = None

            # Extract province and city information
            hospital_pro = response.meta.get('hospital_pro')
            hospital_city = hospital_county = None
            h_a = response.xpath(
                '//dt[contains(text(),"地址")]/ancestor::dl[1]/dd').extract()
            hospital_address = custom_remove_tags(
                remove_tags(''.join(h_a).replace('查看地图', '')))
            if hospital_pro and hospital_address:
                if hospital_pro in MUNICIPALITY2:
                    hospital_city = hospital_pro
                    hospital_pro = ''
                    hos_c = hospital_city.replace('市', '')
                    useless_info = '{}{}|{}'.format(hos_c, '市', hos_c)
                    single_address = match_special2(
                        hospital_address.split(';')[0])
                    hospital_county = get_county2(useless_info, single_address)
                else:
                    hos_p = hospital_pro
                    hospital_pro = '{0}{1}'.format(hospital_pro, '省')
                    single_address = match_special2(
                        hospital_address.split(';')[0])
                    hospital_city = get_city(hospital_pro, single_address)
                    if hospital_city:
                        hos_c = hospital_city.replace('市', '')
                        useless_info = '{}|{}|{}|{}'.format(
                            hospital_pro, hos_p, hospital_city, hos_c)
                        hospital_county = get_county2(useless_info,
                                                      single_address)

            # Public or private
            h_t = custom_remove_tags(
                response.xpath(
                    '//li/b[contains(text(),"国营")]/text()').extract_first(''))
            hospital_type = '公立' if h_t == '国营' else ''

            # Hospital info item
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath('hospital_name', '//div[@class="l"]/h2/text()',
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_level', h_l,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_type', hospital_type)
            loader.add_value('hospital_category', h_c,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_pro', hospital_pro,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_city', hospital_city,
                             MapCompose(custom_remove_tags))
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_phone',
                             '//dt[contains(text(),"电话")]/ancestor::dl[1]/dd',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_xpath(
                'hospital_intro',
                '//dt/strong[contains(text(),"简介")]/ancestor::dl[1]/dd',
                MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('medicare_type', m_t,
                             MapCompose(custom_remove_tags))
            # loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('hospital_url', response.url)
            loader.add_value('update_time', now_day())
            hospital_item = loader.load_item()
            yield hospital_item

            # Extract hospital aliases
            hospital_alias = response.xpath(
                '//div[@class="l"]/p/text()').extract_first('')
            if hospital_alias:
                h_s = custom_remove_tags(hospital_alias)
                if h_s:
                    all_hospital_alias = h_s.split(',')
                    for each_alias in all_hospital_alias:
                        if each_alias != hospital_name:
                            alias_loader = CommonLoader2(
                                item=HospitalAliasItem(), response=response)
                            alias_loader.add_xpath(
                                'hospital_name', '//div[@class="l"]/h2/text()',
                                MapCompose(custom_remove_tags))
                            alias_loader.add_value(
                                'hospital_alias_name', each_alias,
                                MapCompose(custom_remove_tags, match_special))
                            alias_loader.add_value('dataSource_from',
                                                   self.data_source_from)
                            alias_loader.add_value('update_time', now_day())
                            alias_item = alias_loader.load_item()
                            yield alias_item
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
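The level/category/medicare parsing in this example is easier to follow in isolation. The helper below repeats the same regexes on the '|'-joined tag string; it is only an illustration and the function name is not from the project.

import re


def split_level_category_medicare(l_a_c):
    # Same regexes as in parse_hospital_info above, applied to the joined tags.
    h_l = h_c = m_t = None
    level = re.search(r'(.*等|.*级|.*甲)', l_a_c)
    if level:
        h_l = level.group(1).split('|')[-1]
    category = re.search(r'(.*?医院)', l_a_c.replace('医保定点医院', ''))
    if category:
        h_c = category.group(1).split('|')[-1]
    medical_type = re.search(r'(.*定点)', l_a_c)
    if medical_type:
        m_t = medical_type.group(1).split('|')[-1]
    return h_l, h_c, m_t


# split_level_category_medicare('三级甲等|综合医院|医保定点')
# -> ('三级甲等', '综合医院', '医保定点')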
Code Example #6
    def parse_hospital_info(self, response):
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取[{}]医院信息和科室信息>>>>>>'.format(hospital_name))
        hospital_city = response.xpath(
            '//div[@class="jieshao_zi"]/p[4]/text()').extract()
        if hospital_city:
            hospital_address = custom_remove_tags(''.join(hospital_city))
            hospital_city2 = get_city(hospital_address)
            useless_info = '中国|湖南省|湖南|{}'.format(hospital_city2)
            hospital_county = get_county2(useless_info, hospital_address)
        else:
            hospital_county = None
        loader = CommonLoader2(item=HospitalInfoItem(), response=response)
        loader.add_xpath('hospital_name',
                         '//div[@class="jieshao_zi"]/p/font/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath('hospital_level',
                         '//div[@class="jieshao_zi"]/p[2]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_type', '公立')
        loader.add_value('hospital_category', '')
        loader.add_xpath('hospital_addr',
                         '//div[@class="jieshao_zi"]/p[4]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_value('hospital_pro', '湖南省')
        loader.add_xpath('hospital_city',
                         '//div[@class="jieshao_zi"]/p[4]/text()',
                         MapCompose(custom_remove_tags, get_city))
        loader.add_value('hospital_county', hospital_county)
        loader.add_xpath('hospital_phone',
                         '//div[@class="jieshao_zi"]/p[3]/text()',
                         MapCompose(custom_remove_tags))
        loader.add_xpath(
            'hospital_intro', '//div[@id="starlist"]',
            MapCompose(remove_tags, custom_remove_tags, clean_info))
        loader.add_value('registered_channel', self.data_source_from)
        loader.add_value('dataSource_from', self.data_source_from)
        loader.add_value('update_time', now_day())
        hospital_info_item = loader.load_item()
        yield hospital_info_item

        # Extract department info
        self.logger.info('>>>>>>正在抓取[{}]科室信息>>>>>>'.format(hospital_name))
        dept_links = response.xpath('//div[@class="xuanze_kslb"]')
        if dept_links:
            for each_dept_link in dept_links:
                dept_type = each_dept_link.xpath(
                    'div[1]/ul/li/text()').extract_first('')
                all_dept_links = each_dept_link.xpath('div[2]/ul/li/a')
                for dept_link in all_dept_links:
                    # dept_name = dept_link.xpath('text()').extract_first('')
                    data_info = dept_link.xpath('@onclick').extract_first('')
                    if data_info:
                        data_info = ''.join(re.findall(r'\S+', data_info))
                        is_sp_time = re.search(r'isSpTime:\'(.*?)\'',
                                               data_info)
                        pay_mode = re.search(r'paymode:\'(.*?)\'', data_info)
                        dept_id = re.search(r'platformDeptId:\'(.*?)\'',
                                            data_info)
                        hos_id = re.search(r'platformHosId:\'(.*?)\'',
                                           data_info, re.S)
                        dept_name = re.search(r'tempDeptName:\'(.*?)\'',
                                              data_info, re.S)
                        org_name = re.search(r'orgname:\'(.*?)\'', data_info,
                                             re.S)
                        if dept_id and hos_id and dept_name and org_name:
                            is_sp_time = is_sp_time.group(1)
                            pay_mode = pay_mode.group(1)
                            dept_id = dept_id.group(1)
                            hos_id = hos_id.group(1)
                            dept_name = dept_name.group(1)
                            org_name = org_name.group(1)
                            data = {
                                'isSpTime': str(is_sp_time),
                                'paymode': quote(pay_mode),
                                'doctorCollectResult': '',
                                'platformDeptId': str(dept_id),
                                'orgname': quote(org_name),
                                'tempDeptName': quote(dept_name),
                                'platformHosId': str(hos_id),
                                'platformDoctorId': ''
                            }
                            self.headers.update({
                                'Content-Type':
                                'application/x-www-form-urlencoded',
                                'Origin': 'http://www.hnyygh.com',
                                'Referer':
                                'http://www.hnyygh.com/searchDeptmentAction.action',
                                'Pragma': 'no-cache'
                            })
                            splash_args = {
                                'url': self.dept_detail_url,
                                'headers': self.headers,
                                'lua_source': self.dept_script,
                                'data': data
                            }
                            yield SplashRequest(
                                self.dept_detail_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=self.headers,
                                callback=self.parse_hospital_dep_detail,
                                meta={
                                    'dept_type': dept_type,
                                    'dept_name': dept_name,
                                    'hospital_name': org_name
                                })
                            # Fetch doctor info
                            data = {
                                'platformDeptId': dept_id,
                                'platformHosId': hos_id,
                                'platformDoctorId': '',
                                'nextNumInfo': '0'
                            }
                            self.headers.update({
                                'Content-Type':
                                'application/x-www-form-urlencoded',
                                'Origin':
                                'http://www.hnyygh.com',
                                'Referer':
                                'http://www.hnyygh.com/searchOrderNumInfoAction.action'
                            })
                            splash_args = {
                                'url': self.doctor_url,
                                'headers': self.headers,
                                'lua_source': self.dept_script,
                                'data': data
                            }
                            yield SplashRequest(
                                self.doctor_url,
                                endpoint='execute',
                                args=splash_args,
                                dont_filter=True,
                                headers=self.headers,
                                callback=self.parse_doctor_info,
                                meta={
                                    'dept_type': dept_type,
                                    'dept_name': dept_name,
                                    'dept_id': dept_id,
                                    'hospital_name': org_name
                                })
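The @onclick parsing above packs six regex extractions into the request loop. Pulled out as a standalone helper (an illustrative sketch; the sample attribute is invented but uses the keys the spider looks for), the same logic reads:

import re
from urllib.parse import quote


def onclick_to_form_data(data_info):
    # Rebuild the POST form dict from an onclick attribute, mirroring the
    # regexes in the spider above. Returns None if any key is missing.
    data_info = ''.join(re.findall(r'\S+', data_info))  # strip all whitespace
    fields = {}
    for key in ('isSpTime', 'paymode', 'platformDeptId',
                'platformHosId', 'tempDeptName', 'orgname'):
        matched = re.search(r"%s:'(.*?)'" % key, data_info, re.S)
        if not matched:
            return None
        fields[key] = matched.group(1)
    return {
        'isSpTime': fields['isSpTime'],
        'paymode': quote(fields['paymode']),
        'doctorCollectResult': '',
        'platformDeptId': fields['platformDeptId'],
        'orgname': quote(fields['orgname']),
        'tempDeptName': quote(fields['tempDeptName']),
        'platformHosId': fields['platformHosId'],
        'platformDoctorId': '',
    }


# onclick_to_form_data("toDept({isSpTime:'0', paymode:'1', platformDeptId:'123', "
#                      "platformHosId:'456', tempDeptName:'内科', orgname:'某医院'})")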
Code Example #7
File: xmsmjk.py  Project: git-wsf/crawler_project
    def parse_hospital_info(self, response):
        hospital_name = response.meta.get('hospital_name')
        self.logger.info('>>>>>>正在抓取医院详细信息>>>>>>')
        try:
            hospital_id = response.meta.get('hospital_id')
            hospital_img_url = response.xpath(
                '//div[@class="divLeft_Img"]/img/@src').extract_first('')
            hospital_img_url = urljoin(
                self.host, hospital_img_url) if hospital_img_url else ''
            hospital_address = response.xpath(
                '//li[contains(text(),"地址")]/text()').extract_first('')
            hospital_county = get_county2('中国|福建省|福建|厦门市|厦门',
                                          match_special2(hospital_address))
            loader = CommonLoader2(item=HospitalInfoItem(), response=response)
            loader.add_xpath(
                'hospital_name',
                '//div[@class="divLeft_Info"]/ul/li[1]/span/text()',
                MapCompose(custom_remove_tags))
            loader.add_value('hospital_addr', hospital_address,
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_pro', '福建省')
            loader.add_value('hospital_city', '厦门市')
            loader.add_value('hospital_county', hospital_county,
                             MapCompose(custom_remove_tags))
            loader.add_xpath('hospital_phone',
                             '//li[contains(text(),"电话")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_intro', '//div[@class="introduceSpan"]',
                             MapCompose(remove_tags, custom_remove_tags))
            loader.add_value('registered_channel', self.data_source_from)
            loader.add_value('dataSource_from', self.data_source_from)
            loader.add_value('crawled_url', response.url)
            loader.add_value('update_time', now_day())
            loader.add_xpath('hospital_official_website',
                             '//li[contains(text(),"官网")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_xpath('hospital_route',
                             '//li[contains(text(),"公交线路")]/text()',
                             MapCompose(custom_remove_tags, match_special2))
            loader.add_value('hospital_img_url', hospital_img_url)
            loader.add_value('gmt_created', now_time())
            loader.add_value('gmt_modified', now_time())
            loader.add_value('hospital_id', hospital_id)
            hospital_item = loader.load_item()
            yield hospital_item

            # Department info
            all_dept_info = response.xpath(
                '//div[@class="medicineOne"]|//div[@class="medicineTwo"]')
            for each_dept_info in all_dept_info:
                dept_type = each_dept_info.xpath(
                    'div[1]/span/text()').extract_first('')
                dept_names = each_dept_info.xpath('div[2]/div[1]')
                for each_dept_name in dept_names:
                    dept_name = each_dept_name.xpath('a/text()').extract_first(
                        '')
                    dept_link = each_dept_name.xpath('a/@href').extract_first(
                        '')
                    doctor_num_of_dept = each_dept_name.xpath(
                        'span/text()').extract_first('')

                    # Extract the department headcount
                    if doctor_num_of_dept:
                        dept_person_num = re.search(r'(\d+)',
                                                    doctor_num_of_dept)
                        dept_person_num = int(dept_person_num.group(
                            1)) if dept_person_num else None
                    else:
                        dept_person_num = None

                    # Build the department item
                    dept_loader = CommonLoader2(item=HospitalDepItem(),
                                                response=response)
                    dept_loader.add_value('dept_name', dept_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('hospital_name', hospital_name,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dept_type', dept_type,
                                          MapCompose(custom_remove_tags))
                    dept_loader.add_value('dataSource_from',
                                          self.data_source_from)
                    dept_info = ''.join(
                        response.xpath(
                            '//p[contains(text(),"科室简介")]/ancestor::tr[1]').
                        extract())
                    dept_loader.add_value(
                        'dept_info', dept_info,
                        MapCompose(remove_tags, custom_remove_tags,
                                   match_special2))
                    dept_loader.add_value('crawled_url', response.url)
                    dept_loader.add_value('update_time', now_day())
                    dept_loader.add_value('dept_id', dept_link,
                                          MapCompose(match_special2))
                    dept_loader.add_value('hospital_id', hospital_id)
                    dept_loader.add_value('dept_person_num', dept_person_num)
                    dept_loader.add_value('dept_url',
                                          urljoin(self.host, dept_link))
                    dept_loader.add_value('gmt_created', now_time())
                    dept_loader.add_value('gmt_modified', now_time())
                    dept_item = dept_loader.load_item()
                    yield dept_item

                    # Fetch doctor info for this department
                    if dept_link and dept_person_num:
                        self.headers['Referer'] = response.url
                        yield Request(urljoin(self.host, dept_link),
                                      headers=self.headers,
                                      callback=self.parse_doctor_info,
                                      dont_filter=True,
                                      meta={
                                          'hospital_name': hospital_name,
                                          'dept_name': dept_name,
                                      })
        except Exception as e:
            self.logger.error('在抓取医院详细信息过程中出错了,原因是:{}'.format(repr(e)))
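Finally, the small utilities shared by all seven examples (now_day, now_time, custom_remove_tags, match_special2) are likewise defined elsewhere in crawler_project. The sketch below only indicates the kind of behaviour their call sites imply; it should not be read as the project's actual code.

import re
from datetime import datetime


def now_day():
    # Call sites store this into 'update_time'.
    return datetime.now().strftime('%Y-%m-%d')


def now_time():
    # Call sites store this into 'gmt_created' / 'gmt_modified'.
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def custom_remove_tags(value):
    # Collapse whitespace and leftover &nbsp; entities after remove_tags().
    if value is None:
        return None
    return re.sub(r'\s+|&nbsp;', '', value)


def match_special2(value):
    # Drop a leading label such as '地址:' or '电话:' from a field value.
    if value is None:
        return None
    return re.split(r'[::]', value, maxsplit=1)[-1].strip()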