def parse(self, response):
    # Selector
    sel = Selector(response)
    item = CpsecspidersItem()
    # List of article URLs
    article_url = sel.xpath('//dt//p/a/@href').extract()
    # Next-page address
    next_page_url = sel.xpath("//a[@class='paginate'][last()]/@href").extract()
    for url in article_url:
        # Join the relative URL with the base URL
        urll = urljoin(self.baseurl, url)
        # Hand the article page to parse_item
        request = scrapy.Request(urll, callback=self.parse_item)
        request.meta['item'] = item
        yield request
    if next_page_url:
        # Recurse onto the next listing page. This relies on the page number
        # being the second-to-last character of the URL and the character at
        # index 8 being the digit to decrement.
        n = next_page_url[0][-2:-1]
        if n.isdigit() and int(n) > 0:
            t = list(next_page_url[0])
            t[8] = str(int(n) - 1)
            tt = ''.join(t)
            yield scrapy.Request(urljoin(self.baseurl, tt), callback=self.parse)
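# Hedged alternative: the index-based arithmetic above assumes a single-digit
# page number at a fixed offset. A sketch that decrements the last run of
# digits instead (previous_page is a hypothetical helper, not part of the
# original spider, and the URL scheme it assumes is unverified):
import re

def previous_page(url):
    """Return the URL with its last run of digits decremented, or None."""
    m = re.search(r'(\d+)(?!.*\d)', url)  # last run of digits in the URL
    if m is None or int(m.group(1)) <= 0:
        return None
    return url[:m.start(1)] + str(int(m.group(1)) - 1) + url[m.end(1):]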
def parse(self, response):
    # Selector
    sel = Selector(response)
    item = CpsecspidersItem()
    # List of article URLs
    article_url = sel.xpath(
        '//div[@class="mt5"]/table[@class="tab-bbs-list tab-bbs-list-2"]'
        '//tr[@class="bg"]/td[1]/a/@href'
    ).extract()
    # Next-page address
    next_page_url = sel.xpath(
        "//div[@class='short-pages-2 clearfix']/div[@class='links']/a[last()]/@href"
    ).extract()
    for url in article_url:
        # Join the relative URL with the base URL
        urll = urljoin(self.baseurl, url)
        # Hand the article page to parse_item
        request = scrapy.Request(urll, callback=self.parse_item)
        request.meta['item'] = item
        yield request
    if next_page_url:
        # Recurse onto the next listing page
        yield scrapy.Request(urljoin(self.baseurl, next_page_url[0]),
                             callback=self.parse)
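# Note: these parse callbacks share one mutable item through request.meta.
# Since Scrapy 1.7, cb_kwargs is the recommended way to hand per-request data
# to a callback; a sketch of the same pair of methods using it (bodies trimmed
# to the relevant lines):
def parse(self, response):
    for url in article_url:  # extracted as above
        yield scrapy.Request(urljoin(self.baseurl, url),
                             callback=self.parse_item,
                             cb_kwargs={'item': CpsecspidersItem()})

def parse_item(self, response, item):
    # item arrives as a keyword argument rather than via response.meta
    ...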
def parse_item(self, response):
    import time
    # e.g. "2018.05.21"; renamed so the time module is not shadowed
    date_str = time.strftime("%Y.%m.%d", time.localtime())
    sel = Selector(response)
    item = response.meta['item']  # passed in but unused; the loader builds a fresh item
    l = ItemLoader(item=CpsecspidersItem(), response=response)
    article_url = str(response.url)
    article_name = sel.xpath('//div[@id="post_head"]/h1/span/span/text()').extract()
    article_content = sel.xpath(
        '//div[@class="atl-main"]//div/div[@class="atl-content"]/div[2]/div[1]/text()'
    ).extract()
    article_author = sel.xpath("//a[@class='js-vip-check']/text()").extract()
    # Click and reply counts sit after a colon in the atl-info spans
    article_click_num = sel.xpath(
        'substring-after(//div[@class="atl-info"]/span[3]/text(),":")'
    ).extract()
    article_reply_num = sel.xpath(
        'substring-after(//div[@class="atl-info"]/span[4]/text(),":")'
    ).extract()
    # Concatenate the article body fragments
    content = ''.join(article_content)
    # Fall back when the article has no title ("无名" = untitled)
    article_name = article_name[0] if article_name else "无名"
    article_author = article_author[0] if article_author else ''
    click_num = article_click_num[0] if article_click_num else '0'
    reply_num = article_reply_num[0] if article_reply_num else '0'
    l.add_value('title', article_name)
    l.add_value('content', content)
    l.add_value('url', article_url)
    l.add_value('reply', reply_num)
    l.add_value('click', click_num)
    l.add_value('uname', article_author)
    l.add_value('source', "天涯论坛-养宠心情")
    l.add_value('typeid', 0)
    l.add_value('datetime', date_str)
    l.add_value('EmotionalScore', 0)
    yield l.load_item()
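# The item class itself is not shown; from the add_value calls above it can be
# reconstructed roughly as follows (field names are taken verbatim from those
# calls, everything else is an assumption):
import scrapy

class CpsecspidersItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
    reply = scrapy.Field()
    click = scrapy.Field()
    uname = scrapy.Field()
    source = scrapy.Field()
    typeid = scrapy.Field()
    datetime = scrapy.Field()
    EmotionalScore = scrapy.Field()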
def parse_item(self, response):
    import time
    # e.g. "2018.05.21"; renamed so the time module is not shadowed
    date_str = time.strftime("%Y.%m.%d", time.localtime())
    sel = Selector(response)
    item = response.meta['item']  # passed in but unused; the loader builds a fresh item
    l = ItemLoader(item=CpsecspidersItem(), response=response)
    article_url = str(response.url)
    article_name = sel.xpath('//a[@class="maintitle"]/text()').extract()
    article_content = sel.xpath('//table[@class="attachtable"]//text()').extract()
    article_author = sel.xpath(
        "//td[@class='row1'][1]/span[@class='postdetails']/text()[1]").extract()
    # Note: these two selectors are Tianya markup ("atl-info") reused on a cpn
    # page, so they most likely yield empty strings; the guards below keep that
    # from breaking the item.
    article_click_num = sel.xpath(
        'substring-after(//div[@class="atl-info"]/span[3]/text(),":")').extract()
    article_reply_num = sel.xpath(
        'substring-after(//div[@class="atl-info"]/span[4]/text(),":")').extract()
    # print(article_name)
    # Concatenate the article body fragments
    content = ''.join(article_content)
    # Fall back when the article has no title ("无名" = untitled)
    article_name = article_name[0] if article_name else "无名"
    article_author = article_author[0] if article_author else ''
    click_num = article_click_num[0] if article_click_num else '0'
    reply_num = article_reply_num[0] if article_reply_num else '0'
    l.add_value('title', article_name)
    l.add_value('content', content)
    l.add_value('url', article_url)
    l.add_value('reply', reply_num)
    l.add_value('click', click_num)
    l.add_value('uname', article_author)
    l.add_value('source', "cpn论坛-百鸟园")
    l.add_value('typeid', 0)
    l.add_value('datetime', date_str)
    l.add_value('EmotionalScore', 0)
    yield l.load_item()
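# The [0] indexing in these parse_item methods raises IndexError whenever a
# selector matches nothing. A small hypothetical helper that the inline guards
# above emulate, should the pattern recur:
def first_or_default(values, default=''):
    """Return the first extracted value, or the default for an empty match."""
    return values[0] if values else default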
def parse_item(self, response):
    import time
    # e.g. "2018.05.21"; renamed so the time module is not shadowed
    date_str = time.strftime("%Y.%m.%d", time.localtime())
    sel = Selector(response)
    item = response.meta['item']  # passed in but unused; the loader builds a fresh item
    l = ItemLoader(item=CpsecspidersItem(), response=response)
    article_url = str(response.url)
    article_name = sel.xpath("//h1[@class='core_title_txt ']/text()").extract()
    article_content = sel.xpath(
        "//div[@class='p_content p_content p_content_nameplate']/cc//text()").extract()
    # substring-after(..., '') simply returns the first author node's text
    article_author = sel.xpath(
        "substring-after(//a[@class='p_author_name j_user_card']/text(),'')").extract()
    # Note: these two selectors are Tianya markup ("atl-info") reused on a
    # Tieba page, so they most likely yield empty strings; the guards below
    # keep that from breaking the item.
    article_click_num = sel.xpath(
        'substring-after(//div[@class="atl-info"]/span[3]/text(),":")').extract()
    article_reply_num = sel.xpath(
        'substring-after(//div[@class="atl-info"]/span[4]/text(),":")').extract()
    # Concatenate the article body fragments
    content = ''.join(article_content)
    # Fall back when the article has no title ("无名" = untitled)
    article_name = article_name[0] if article_name else "无名"
    article_author = article_author[0] if article_author else ''
    click_num = article_click_num[0] if article_click_num else '0'
    reply_num = article_reply_num[0] if article_reply_num else '0'
    l.add_value('title', article_name)
    l.add_value('content', content)
    l.add_value('url', article_url)
    l.add_value('reply', reply_num)
    l.add_value('click', click_num)
    l.add_value('uname', article_author)
    l.add_value('source', "百度贴吧")
    l.add_value('typeid', 0)
    l.add_value('datetime', date_str)
    l.add_value('EmotionalScore', 0)
    yield l.load_item()
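# The manual ''.join and [0] handling in these callbacks can also be pushed
# into the loader. A sketch of a loader subclass (CpsecItemLoader is
# hypothetical; the processors import path is scrapy.loader.processors in
# Scrapy 1.x and itemloaders.processors in 2.x):
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join

class CpsecItemLoader(ItemLoader):
    # take the first extracted value for every field by default...
    default_output_processor = TakeFirst()
    # ...but concatenate all fragments for the article body
    content_out = Join('')

# Usage: l = CpsecItemLoader(item=CpsecspidersItem(), response=response),
# after which l.add_xpath('content', ...) needs no manual join or indexing.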
def parse(self, response):
    # Selector
    sel = Selector(response)
    item = CpsecspidersItem()
    # List of article URLs
    article_url = sel.xpath('//a[@class="j_th_tit "]/@href').extract()
    # Next-page address (the 10th pager link; brittle when fewer links render)
    next_page_url = sel.xpath('//*[@id="frs_list_pager"]/a[10]/@href').extract()
    for url in article_url:
        # Join the relative URL with the base URL
        urll = urljoin(self.baseurl, url)
        # Hand the article page to parse_item
        request = scrapy.Request(urll, callback=self.parse_item)
        request.meta['item'] = item
        yield request
    if next_page_url:
        # Recurse onto the next listing page
        self.logger.debug(urljoin(self.baseurl, next_page_url[0]))
        yield scrapy.Request(urljoin(self.baseurl, next_page_url[0]),
                             callback=self.parse)
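# The a[10] pager index above breaks whenever Tieba renders fewer links.
# Scrapy 1.4+ provides response.follow, which resolves relative URLs by
# itself; picking the link by its text is usually sturdier (the "下一页" link
# text is an assumption about the pager markup):
def parse(self, response):
    ...
    next_link = response.xpath(
        '//*[@id="frs_list_pager"]/a[contains(text(), "下一页")]/@href').get()
    if next_link:
        yield response.follow(next_link, callback=self.parse)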
def parse(self, response):
    # Selector
    sel = Selector(response)
    item = CpsecspidersItem()
    # List of article URLs
    article_url = sel.xpath('//span[@class="topictitle"]/a/@href').extract()
    # Next-page address
    next_page_url = sel.xpath(
        '//td[@nowrap="nowrap"]/span[@class="nav"]/a[1]/@href').extract()
    for url in article_url:
        # Join the relative URL with the base URL
        urll = urljoin(self.baseurl, url)
        # Hand the article page to parse_item
        request = scrapy.Request(urll, callback=self.parse_item)
        request.meta['item'] = item
        yield request
    if next_page_url:
        # Recurse onto the next listing page
        yield scrapy.Request(urljoin(self.baseurl, next_page_url[0]),
                             callback=self.parse)
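# The same robustness fix applies across all of these callbacks: the newer
# selector API's .get() returns None on an empty match, so the pager check
# above can drop the list indexing entirely, e.g.:
def parse(self, response):
    ...
    next_page = response.xpath(
        '//td[@nowrap="nowrap"]/span[@class="nav"]/a[1]/@href').get()
    if next_page:
        yield scrapy.Request(urljoin(self.baseurl, next_page),
                             callback=self.parse)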