Exemple #1
0
 def parse_item(self, response):
     """Build a HospitalItem from an article page, skipping known URLs.

     The article URL is read from the title link. The page is skipped
     (``None`` is returned) when no URL can be extracted or when a ``Log``
     row with the same URL already exists.

     :param response: the response object for the article page.
     :return: a populated ``HospitalItem``, or ``None`` when skipped.
     """
     url = response.xpath(
         "//h1[@class='article-title']/a/@href").extract_first()
     # extract_first() yields None when the XPath matches nothing; treat
     # that the same as an empty URL instead of crashing on .strip().
     url = (url or '').strip().encode('utf-8')
     if not url:
         return None

     session = DBSession()
     try:
         already_logged = session.query(Log).filter(
             Log.url == url).count()
     finally:
         # Release the session even if the query raises.
         session.close()
     if already_logged:
         return None

     item = HospitalItem()
     item['url'] = url
     title = response.xpath(
         "//h1[@class='article-title']/a/text()").extract_first()
     item['title'] = (title or '').strip().encode('utf-8')
     item['content'] = response.xpath("//article/node()").extract()
     item['domain'] = 'www.36dsj.com'
     return item
Exemple #2
0
    def getInfo(self, response):
        """Yield one HospitalItem per row of the hospital listing table.

        Rows are taken from the ``table table-striped`` table body. The
        grade falls back to '未定级' and the bed count to ``0`` when the
        corresponding cell has no text. The remaining cells are required;
        a row missing one of them raises ``IndexError``.

        :param response: the response object for a listing page.
        :return: a generator of ``HospitalItem`` objects.
        """
        rows = response.xpath(
            '//table[@class="table table-striped"]/tbody/tr')

        for hospital in rows:
            # A fresh item per row: yielding one shared, mutated item would
            # let later rows overwrite earlier ones still in the pipeline.
            item = HospitalItem()

            # Hospital name
            hospitalName = hospital.xpath('./th/a/text()').extract()[0]
            # Hospital grade (optional cell)
            hospitalGrade = hospital.xpath('./td[1]/text()').extract_first(
                default='未定级')
            # Hospital type
            hospitalType = hospital.xpath('./td[2]/text()').extract()[0]
            # Province
            province = hospital.xpath('./td[3]/text()').extract()[0]
            # City
            city = hospital.xpath('./td[4]/text()').extract()[0]
            # District / county
            direct = hospital.xpath('./td[5]/text()').extract()[0]
            # Number of beds (optional cell)
            bedNum = hospital.xpath('./td[6]/text()').extract_first(
                default=0)
            # Hospital address
            address = hospital.xpath('./td[7]/text()').extract()[0]

            print(hospitalName, hospitalGrade, hospitalType, province, city,
                  direct, bedNum, address)

            # Store the values on the item and hand it to the pipeline.
            item['hospitalName'] = hospitalName
            item['hospitalGrade'] = hospitalGrade
            item['hospitalType'] = hospitalType
            item['province'] = province
            item['city'] = city
            item['direct'] = direct
            item['bedNum'] = bedNum
            item['address'] = address

            yield item
Exemple #3
0
    def parse(self, response):
        """Record every department category found on the page.

        For each ``li.g-clear`` node, an item holding the cleaned label
        text ('type') and the list of cleaned link texts ('name') is
        written as one JSON line to ``self.file`` and inserted into
        ``self.coll``. ``self.file`` is closed once all nodes are handled.

        :param response: the response object for the page.
        """
        for dept_node in response.xpath('//li[@class="g-clear"]'):
            record = HospitalItem()
            label_text = dept_node.xpath("label/text()").extract_first()
            record['type'] = self.remove_char(label_text)

            raw_names = dept_node.xpath(
                'p/span/a[@class="ishao"]/text()').extract()
            record['name'] = [self.remove_char(raw) for raw in raw_names]

            # One JSON document per line, keeping non-ASCII text readable.
            self.file.write(
                json.dumps(dict(record), ensure_ascii=False) + "\n")
            # Insert a plain-dict copy of the item as one database record.
            self.coll.insert(dict(record))

        self.file.close()
Exemple #4
0
 def parse_item(self, response):
     """Build a HospitalItem from an article page, skipping known URLs.

     The page's own URL identifies the article. The page is skipped
     (``None`` is returned) when that URL is empty or when a ``Log`` row
     with the same URL already exists.

     :param response: the response object for the article page.
     :return: a populated ``HospitalItem``, or ``None`` when skipped.
     """
     url = response.url.strip().encode('utf-8')
     if not url:
         return None

     session = DBSession()
     try:
         already_logged = session.query(Log).filter(
             Log.url == url).count()
     finally:
         # Release the session even if the query raises.
         session.close()
     if already_logged:
         return None

     item = HospitalItem()
     item['url'] = url
     title = response.xpath(
         "//div[@class='article']/div[@class='title']/text()"
     ).extract_first()
     # extract_first() yields None when the XPath matches nothing; fall
     # back to an empty title instead of crashing on .strip().
     item['title'] = (title or '').strip().encode('utf-8')
     item['content'] = response.xpath(
         "//div[@class='article']/div[@class='answer']/node()").extract()
     item['domain'] = 'news.hc3i.cn'
     return item
Exemple #5
0
    def parseItem(self, response):
        """Yield a single HospitalItem built from a POI detail page.

        The poiid comes from ``response.meta``, the name from the page's
        ``<h1>``, and the remaining fields from the ``list-group-item``
        list entries, addressed by position. A page with fewer entries
        than expected raises ``IndexError``.

        :param response: the response object for the detail page.
        :return: a generator yielding one ``HospitalItem``.
        """

        def text_after_span(li_html):
            # Text between the closing </span> and the closing </li>.
            return re.findall("</span>(.*)</li>", li_html)[0]

        item = HospitalItem()
        item["poiid"] = response.meta['poiid']
        item["name"] = response.xpath('//h1/text()').extract_first()

        link_texts = response.xpath(
            "//ul/li[@class='list-group-item']/a/text()").extract()
        for position, field in enumerate(("province", "city", "district")):
            item[field] = link_texts[position]
        item["label"] = link_texts[3].replace("(", "")

        li_blocks = response.xpath(
            "//ul/li[@class='list-group-item']").extract()
        item["address"] = text_after_span(li_blocks[3])
        item["phone"] = text_after_span(li_blocks[4])
        item["category"] = ""
        # The three coordinate entries are the last three list items.
        item["groundpos"] = text_after_span(li_blocks[-3])
        item["marspos"] = text_after_span(li_blocks[-2])
        item["baidupos"] = text_after_span(li_blocks[-1])

        yield item
 def parse_item(self, response):
     """Yield one HospitalItem per hospital entry on a province list page.

     The province is the URL-decoded part of the page URL after
     ``http://www.a-hospital.com/w/``, e.g.
     http://www.a-hospital.com/w/%E5%9B%9B%E5%B7%9D%E7%9C%81%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8
     Each entry's name is always set; address, phone, level, type, fax,
     e-mail and website are set only when found in the entry's detail list.

     :param response: the response object for the province list page.
     :return: a generator of ``HospitalItem`` objects.
     """
     # Loop-invariant: decode the province once rather than per entry.
     province = urllib.unquote(
         response.url[len("http://www.a-hospital.com/w/"):]).decode('utf-8')

     # (item field, pattern capturing its value) for the optional details.
     optional_fields = (
         ('hospitalAddress', u"<b>医院地址</b>[:|:](.*?)</li>"),
         ('hospitalPhoneNumber', u"<b>联系电话</b>[:|:](.*?)</li>"),
         ('hospitalLevel', u"<b>医院等级</b>[:|:](.*?)</li>"),
         ('hospitalType', u"<b>经营方式</b>[:|:](.*?)</li>"),
         ('hospitalFaxNumber', u"<b>传真号码</b>[:|:](.*?)</li>"),
         ('hospitalEmail', u"<b>电子邮箱</b>[:|:](.*?)</li>"),
         ('hospitalWebsite',
          u'<b>医院网站</b>[:|:]<a href="(.*?)" class="external free" rel="nofollow" target="_blank">.*?</a></li>'),
     )

     for name, content in re.findall(
             r'<li><b><a href=".*?" title=".*?">(.*?)</a>.*?</b>[\s\S]*?<ul>([\s\S]*?)</ul>[\s\S]*?</li>',
             response.body):
         # A fresh item per hospital: with one shared item, optional
         # fields missing here would keep the previous hospital's values.
         item = HospitalItem()
         item['hospitalName'] = name.decode('utf-8')
         content = content.decode("utf-8")
         for field, pattern in optional_fields:
             found = re.findall(pattern, content)
             if found:
                 item[field] = found[0]
         item['hospitalProvince'] = province
         yield item