def parse_item(self, response):
    """Build a HospitalItem for an article page unless its URL is already logged.

    The article URL is read from the link inside ``h1.article-title``. The
    page is skipped when that URL is empty or missing, or when a ``Log`` row
    with the same URL already exists. Otherwise the title, the child nodes of
    ``<article>`` and the fixed domain are stored on the item.

    Args:
        response: Page response exposing ``xpath()``.

    Returns:
        The populated ``HospitalItem``, or ``None`` when the page is skipped.
    """
    # NOTE(review): the original was collapsed onto one line, so it is not
    # visible whether ``return item`` belonged to the ``else`` branch; this
    # assumes it did (skipped pages return nothing) -- confirm.
    item = HospitalItem()
    # extract_first() gives None when the node is absent; default to '' so a
    # missing link is caught by the empty-URL check instead of raising
    # AttributeError on ``.strip()``.
    item['url'] = response.xpath(
        "//h1[@class='article-title']/a/@href"
    ).extract_first(default='').strip().encode('utf-8')
    if len(item['url']) == 0:
        return None

    session = DBSession()
    try:
        already_logged = session.query(Log).filter(
            Log.url == item['url']).count()
    finally:
        # Always release the session, even if the query raises.
        session.close()
    if already_logged:
        return None

    item['title'] = response.xpath(
        "//h1[@class='article-title']/a/text()"
    ).extract_first(default='').strip().encode('utf-8')
    item['content'] = response.xpath("//article/node()").extract()
    item['domain'] = 'www.36dsj.com'
    return item
def getInfo(self, response):
    """Yield one HospitalItem per row of the hospital listing table.

    Rows are taken from ``table.table.table-striped > tbody > tr``. The
    hospital name comes from the ``th/a`` text and the remaining fields from
    the ``td`` cells 1-7. A missing grade defaults to ``'未定级'`` and a
    missing bed count defaults to ``0``.

    Args:
        response: Page response exposing ``xpath()``.

    Yields:
        A separate ``HospitalItem`` for every table row.

    Raises:
        IndexError: If a row lacks the name, type, province, city, district
            or address text.
    """
    # Select the listing rows from the page source.
    rows = response.xpath('//table[@class="table table-striped"]/tbody/tr')
    for row in rows:
        # A fresh item per row: reusing one instance would make every
        # yielded reference point at the same, later-overwritten object.
        item = HospitalItem()

        # Hospital name
        hospitalName = row.xpath('./th/a/text()').extract()[0]
        # Hospital grade (optional cell)
        hospitalGrade = row.xpath('./td[1]/text()').extract_first(
            default='未定级')
        # Hospital type
        hospitalType = row.xpath('./td[2]/text()').extract()[0]
        # Province
        province = row.xpath('./td[3]/text()').extract()[0]
        # City
        city = row.xpath('./td[4]/text()').extract()[0]
        # District / county
        direct = row.xpath('./td[5]/text()').extract()[0]
        # Number of beds (optional cell)
        bedNum = row.xpath('./td[6]/text()').extract_first(default=0)
        # Hospital address
        address = row.xpath('./td[7]/text()').extract()[0]

        print(hospitalName, hospitalGrade, hospitalType, province, city,
              direct, bedNum, address)

        # Store the fields on the item and hand it on to the pipeline.
        item['hospitalName'] = hospitalName
        item['hospitalGrade'] = hospitalGrade
        item['hospitalType'] = hospitalType
        item['province'] = province
        item['city'] = city
        item['direct'] = direct
        item['bedNum'] = bedNum
        item['address'] = address
        yield item
def parse(self, response):
    """Extract the classified departments and persist each one.

    For every ``li.g-clear`` element an item is built whose ``type`` is the
    cleaned label text and whose ``name`` is the list of cleaned ``a.ishao``
    link texts. Each item is written to ``self.file`` as one JSON line and
    inserted into ``self.coll`` as a dict. ``self.file`` is closed once all
    elements have been handled.

    Args:
        response: Page response exposing ``xpath()``.
    """
    for entry in response.xpath('//li[@class="g-clear"]'):
        record = HospitalItem()
        record['type'] = self.remove_char(
            entry.xpath("label/text()").extract_first())
        record['name'] = [
            self.remove_char(text)
            for text in entry.xpath(
                'p/span/a[@class="ishao"]/text()').extract()
        ]

        # One JSON document per line, keeping non-ASCII text readable.
        self.file.write(json.dumps(dict(record), ensure_ascii=False) + "\n")
        # Convert the item to a plain dict and insert it as one record.
        self.coll.insert(dict(record))

    self.file.close()
def parse_item(self, response):
    """Build a HospitalItem for an article page unless its URL is already logged.

    The item URL is the response URL. The page is skipped when that URL is
    empty or when a ``Log`` row with the same URL already exists. Otherwise
    the title text, the child nodes of the ``answer`` div and the fixed
    domain are stored on the item.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Returns:
        The populated ``HospitalItem``, or ``None`` when the page is skipped.
    """
    # NOTE(review): the original was collapsed onto one line, so it is not
    # visible whether ``return item`` belonged to the ``else`` branch; this
    # assumes it did (skipped pages return nothing) -- confirm.
    item = HospitalItem()
    item['url'] = response.url.strip().encode('utf-8')
    if len(item['url']) == 0:
        return None

    session = DBSession()
    try:
        already_logged = session.query(Log).filter(
            Log.url == item['url']).count()
    finally:
        # Always release the session, even if the query raises.
        session.close()
    if already_logged:
        return None

    # extract_first() gives None when the title node is absent; default to
    # '' so ``.strip()`` does not raise AttributeError.
    item['title'] = response.xpath(
        "//div[@class='article']/div[@class='title']/text()"
    ).extract_first(default='').strip().encode('utf-8')
    item['content'] = response.xpath(
        "//div[@class='article']/div[@class='answer']/node()").extract()
    item['domain'] = 'news.hc3i.cn'
    return item
def parseItem(self, response):
    """Yield a HospitalItem assembled from a POI detail page.

    ``poiid`` is taken from ``response.meta``; the name from the ``h1`` text.
    Province, city, district and label come from the first four link texts
    inside ``li.list-group-item`` elements. Address, phone and the three
    position fields are cut out of the raw HTML of those ``li`` elements.

    Args:
        response: Page response exposing ``meta`` and ``xpath()``.

    Yields:
        The single populated ``HospitalItem``.

    Raises:
        IndexError: If fewer list entries than expected are present, or an
            entry does not contain ``</span>...</li>``.
    """
    def after_span(fragment):
        # First match of the text following ``</span>`` up to ``</li>``.
        return re.findall("</span>(.*)</li>", fragment)[0]

    item = HospitalItem()
    item["poiid"] = response.meta['poiid']
    item["name"] = response.xpath('//h1/text()').extract_first()

    link_texts = response.xpath(
        "//ul/li[@class='list-group-item']/a/text()").extract()
    item["province"] = link_texts[0]
    item["city"] = link_texts[1]
    item["district"] = link_texts[2]
    item["label"] = link_texts[3].replace("(", "")

    entries = response.xpath("//ul/li[@class='list-group-item']").extract()
    item["address"] = after_span(entries[3])
    item["phone"] = after_span(entries[4])
    item["category"] = ""
    # The three position fields are the last three list entries.
    item["groundpos"] = after_span(entries[-3])
    item["marspos"] = after_span(entries[-2])
    item["baidupos"] = after_span(entries[-1])

    yield item
def parse_item(self, response):
    """Yield one HospitalItem per hospital entry found in a province page.

    The province is the URL-unquoted part of ``response.url`` after
    ``http://www.a-hospital.com/w/``. Every ``<li>`` hospital entry in the
    page body contributes the hospital name plus whichever optional fields
    (address, phone, level, type, fax, e-mail, website) are present in its
    nested ``<ul>``.

    Args:
        response: Page response exposing ``url`` and ``body``.

    Yields:
        A separate ``HospitalItem`` for every hospital entry.
    """
    # Example URL:
    # http://www.a-hospital.com/w/%E5%9B%9B%E5%B7%9D%E7%9C%81%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8
    province = urllib.unquote(
        response.url[len("http://www.a-hospital.com/w/"):])

    # Item field -> pattern whose first group holds the field's value.
    # NOTE(review): the class ``[:|:]`` matches ':' or '|' only; confirm that
    # is the separator the pages really use.
    optional_fields = (
        ("hospitalAddress", u"<b>医院地址</b>[:|:](.*?)</li>"),
        ("hospitalPhoneNumber", u"<b>联系电话</b>[:|:](.*?)</li>"),
        ("hospitalLevel", u"<b>医院等级</b>[:|:](.*?)</li>"),
        ("hospitalType", u"<b>经营方式</b>[:|:](.*?)</li>"),
        ("hospitalFaxNumber", u"<b>传真号码</b>[:|:](.*?)</li>"),
        ("hospitalEmail", u"<b>电子邮箱</b>[:|:](.*?)</li>"),
        ("hospitalWebsite",
         u'<b>医院网站</b>[:|:]<a href="(.*?)" class="external free" rel="nofollow" target="_blank">.*?</a></li>'),
    )

    for name, content in re.findall(
            r'<li><b><a href=".*?" title=".*?">(.*?)</a>.*?</b>[\s\S]*?<ul>([\s\S]*?)</ul>[\s\S]*?</li>',
            response.body):
        # A fresh item per hospital: with a shared instance, optional fields
        # of an earlier hospital would carry over to later ones that lack
        # them, and all yielded references would alias one object.
        item = HospitalItem()
        item['hospitalName'] = name.decode('utf-8')
        content = content.decode("utf-8")

        for field, pattern in optional_fields:
            match = re.search(pattern, content)
            if match:
                item[field] = match.group(1)

        item['hospitalProvince'] = province.decode('utf-8')
        yield item