def parse(self, response):
    """Parse a 'history' channel list page.

    For each article teaser (div.box_list), build an IfengItem with
    url/title, fetch the article page synchronously and extract its body
    from div#artical_real, then yield the item if self.check() accepts
    the url. Finally follows the #pagenext link for pagination.

    NOTE(review): the blocking urllib.urlopen call defeats Scrapy's async
    downloader; a scrapy.Request with a detail-page callback would be the
    idiomatic fix, but that restructuring is out of scope here.
    """
    soup = BeautifulSoup(response.body, "lxml")
    divs = soup.findAll('div', {'class': 'box_list clearfix'})
    for div in divs:
        item = IfengItem()
        # Robustness fix: any of these lookups can return None on a
        # malformed teaser / article page; previously the resulting
        # AttributeError aborted the whole generator, dropping every
        # remaining item AND the pagination request below.
        try:
            h2 = div.find('h2')
            link = h2.find('a')
            url = link['href']
            item['url'] = url
            title = link['title']
            item['title'] = title
            response2 = urllib.urlopen(url)
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'artical_real'}).get_text()
            item['content'] = content
        except AttributeError as e:
            # Skip this entry but keep processing the rest of the page.
            print(e)
            continue
        item['label'] = 'history'
        if self.check(item['url']):
            yield item
    # //*[@id="pagenext"] — next-page link (pagination).
    next_url = response.xpath(
        "//*[@id='pagenext'] /@href").extract()
    if next_url:
        yield scrapy.Request(next_url[0], callback=self.parse)
def parse(self, response):
    """Parse a 'history' channel list page (con_lis layout).

    For each teaser (div.con_lis.show), build an IfengItem with
    url/title, fetch the article page synchronously and extract its body
    from div#yc_con_txt, then yield the item if self.check() accepts the
    url. Finally follows the #pagenext link for pagination.

    NOTE(review): the blocking urllib.urlopen call defeats Scrapy's async
    downloader; a scrapy.Request with a detail-page callback would be the
    idiomatic fix, but that restructuring is out of scope here.
    """
    soup = BeautifulSoup(response.body, "lxml")
    # /html/body/div[4]/div[1]/div/div/div[1]/a — original locator note.
    divs = soup.findAll('div', {'class': 'con_lis show'})
    for div in divs:
        item = IfengItem()
        # Robustness fix: find() returns None when the expected node is
        # missing; previously the AttributeError aborted the whole
        # generator, dropping every remaining item AND the pagination
        # request below.
        try:
            url = div.find('a')['href']
            title = div.find('h4').get_text()
            item['url'] = url
            item['title'] = title
            response2 = urllib.urlopen(url)
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'yc_con_txt'}).get_text()
            item['content'] = content
        except (AttributeError, TypeError) as e:
            # Skip this entry but keep processing the rest of the page.
            print(e)
            continue
        item['label'] = 'history'
        if self.check(item['url']):
            yield item
    # //*[@id="pagenext"] — next-page link (pagination).
    next_url = response.xpath(
        "//*[@id='pagenext'] /@href").extract()
    if next_url:
        yield scrapy.Request(next_url[0], callback=self.parse)
def parse_news(self, response):
    """Extract title, body and source link from an article page.

    Merges the extracted fields into the dict carried in
    response.meta['data'] and yields it wrapped in an IfengItem.
    """
    carried = response.meta['data']
    headline = response.css(
        'div#artical h1#artical_topic::text').extract_first()
    paragraphs = response.css(
        'div#main_content.js_selection_area p::text').extract()
    origin = response.css(
        'div#artical_sth.clearfix span.ss03 a::attr(href)').extract_first()
    carried.update({
        'content': '\n'.join(paragraphs),
        'source_url': origin,
        'title': headline,
        'response_url': response.url,
    })
    yield IfengItem(carried)
def parse(self, response):
    """Parse a 'military' channel list page (leftList layout).

    For each <li> in div.leftList, build an IfengItem with url/title,
    fetch the article page synchronously and extract its body from
    div#Cnt-Main-Article-QQ (best-effort: a missing body is logged and
    the item is still yielded without 'content', preserving the original
    behavior), then yield the item if self.check() accepts the url.
    """
    soup = BeautifulSoup(response.body, "lxml")
    root = soup.find('div', {'class': 'leftList'})
    lis = root.findAll('li')
    for li in lis:
        item = IfengItem()
        url = li.find('a')['href']
        item['url'] = url
        title = li.get_text()
        item['title'] = title
        # NOTE(review): blocking fetch inside a Scrapy callback defeats
        # the async downloader; consider scrapy.Request with a callback.
        response2 = urllib.urlopen(url)
        soup2 = BeautifulSoup(response2, "lxml")
        try:
            content = soup2.find(
                'div', {'id': 'Cnt-Main-Article-QQ'}).get_text()
            item['content'] = content
        except AttributeError as e:
            # Bug fix: the original printed AttributeError.message — an
            # attribute of the exception *class*, not the caught
            # instance — so the actual error text was never shown.
            print(e)
        item['label'] = 'military'
        if self.check(item['url']):
            yield item