def parse(self, response):
    sel = Selector(response)
    item = CsdnItem()
    title = sel.xpath(
        '//div[@id="article_details"]/div/h1/span/a/text()').extract()
    article_url = str(response.url)
    time = sel.xpath(
        '//div[@id="article_details"]/div[2]/div/span[@class="link_postdate"]/text()'
    ).extract()
    readtimes = sel.xpath(
        '//div[@id="article_details"]/div[2]/div/span[@class="link_view"]/text()'
    ).extract()
    # extract() already returns str in Python 3, so the original
    # .encode('utf-8') calls (which would yield bytes and break .replace())
    # are dropped
    item['title'] = [n.replace("\r\n", "").strip() for n in title]
    item['time'] = time
    item['readtimes'] = readtimes
    yield item
    # get next url
    urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()
    for url in urls:
        print(url)
        url = "http://blog.csdn.net" + url
        print(url)
        yield Request(url, callback=self.parse)
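# Note: every snippet in this collection fills a CsdnItem, but none of them
# shows the item class itself. A minimal sketch of what the corresponding
# items.py presumably looks like (field names collected from the snippets;
# the exact set varies from project to project):
import scrapy

class CsdnItem(scrapy.Item):
    title = scrapy.Field()
    time = scrapy.Field()
    readtimes = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()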
def parse(self, response):
    item = CsdnItem()
    item['title'] = response.xpath("//a[@class='meta-title']/text()").extract()
    item['desc'] = response.xpath("//span[@class='excerpt']/p/text()").extract()
    item['link'] = response.xpath("//a[@class='meta-title']/@href").extract()
    item['time'] = response.xpath("normalize-space(//p/text())").extract()
    yield item
def parse(self, response):
    item = CsdnItem()
    item['title'] = response.xpath("//h2[@class='csdn-tracking-statistics']"
                                   "/a[@strategy='career']/text()").extract()
    item['desc'] = response.xpath("//div[@class='summary oneline']/text()").extract()
    item['link'] = response.xpath("//h2[@class='csdn-tracking-statistics']"
                                  "/a[@strategy='career']/@href").extract()
    item['time'] = response.xpath("//dd[@class='time']/text()").extract()
    yield item
def parse(self, response):
    item = CsdnItem()
    item['title'] = response.xpath(
        "//a[@class='archive-title']/text()").extract()
    item['desc'] = response.xpath(
        "//span[@class='excerpt']/p/text()").extract()
    item['link'] = response.xpath(
        "//a[@class='archive-title']/@href").extract()
    item['time'] = response.xpath("//dd[@class='time']/text()").extract()
    yield item
def parse(self, response): item = CsdnItem() item["name"] = response.xpath( '//h1[@class="title-article"]/text()').extract()[0] item["time"] = response.xpath( '//span[@class="time"]/text()').extract()[0] item["number"] = response.xpath( '//span[@class="read-count"]/text()').extract()[0] yield item
def parse_content(self, response):
    content = response.xpath('//div[@class="markdown_views"]').extract()
    item = CsdnItem()
    item['url'] = response.meta['url'][1:]
    item['jianjie'] = response.meta['jianjie']
    item['cete'] = response.meta['cete']
    item['date'] = response.meta['date']
    item['count'] = response.meta['count']
    item['author'] = response.meta['author']
    item['title'] = response.meta['title']
    item['content'] = content
    yield item
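# parse_content above takes everything except content from response.meta, so an
# earlier callback must have attached those values. A hypothetical listing-page
# callback that would feed it (the XPaths and field values are illustrative
# only, and `import scrapy` is assumed):
def parse_list(self, response):
    for row in response.xpath('//div[@class="article-item-box"]'):
        url = row.xpath('./h4/a/@href').get()
        meta = {
            'url': url,
            'jianjie': row.xpath('./p[@class="content"]/text()').get(),
            'cete': None, 'date': None, 'count': None, 'author': None,
            'title': row.xpath('./h4/a/text()').get(),
        }
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_content, meta=meta)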
def format_item(self, response: Response):
    title = response.xpath("//h1[@class='title-article']/text()")[0].get()
    body = response.xpath("//div[@id='article_content']")[0].get()
    taga = response.xpath("//a[@class='tag-link']/text()")
    tags = []
    for i in taga:
        tag = i.get()
        tags.append(tag)
    tag = "<spliter>".join(tags)
    yield CsdnItem(title=title, body=body, tag=tag)
    print("*" * 30 + "parse_item")
    for i in self._explore_all_href(response):
        yield i
def parse(self, response):
    item = CsdnItem()
    # Without /text() the result is selector objects holding the whole node:
    #   [<Selector ..., data='<h1>...</h1>'>]
    # With /text() you still get selector objects, just wrapping the node text:
    #   [<Selector ..., data='text'>]
    # extract() pulls the text out of the selector objects: ['text content']
    item["name"] = response.xpath(
        '//h1[@class="title-article"]/text()').extract()[0]
    # convert the matched object to a string
    item["time"] = response.xpath(
        '//span[@class="time"]/text()').extract()[0]
    item["count"] = response.xpath(
        '//span[@class="read-count"]/text()').extract()[0]
    yield item
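# A quick demonstration of the selector behaviour described in the comments
# above, runnable on its own against a made-up HTML fragment:
from scrapy.selector import Selector

sel = Selector(text='<h1 class="title-article">Hello</h1>')
print(sel.xpath('//h1[@class="title-article"]'))
# [<Selector ... data='<h1 class="title-article">Hello</h1>'>]
print(sel.xpath('//h1[@class="title-article"]/text()'))
# [<Selector ... data='Hello'>]
print(sel.xpath('//h1[@class="title-article"]/text()').extract())
# ['Hello']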
def parse_item(self, response):
    item = CsdnItem()
    item['question'] = response.xpath(
        '//div[@class="detail_title"]/h1/span/text()').extract()[0]
    content = response.xpath('//div[@class="detailed"]/table')[0].xpath(
        './/div[@class="post_body"]/text()').extract()
    item['content'] = "".join(content).strip().replace("\n", "").replace(" ", "")
    item['answer_num'] = response.xpath(
        '//span[@class="return_time"]/text()').extract()[0]
    item['question_url'] = response.url
    yield item
def parse(self, response):
    for i in response.xpath("//div[@class='content']/a/@href"):
        next_url = i.get()
        if next_url:
            yield scrapy.Request(next_url, self.parse)
    tags = []
    for i in response.xpath("//a[@class='tag-link']/text()"):
        # strip() already removes leading whitespace, so the original
        # trailing .lstrip() was redundant
        tag = i.get().strip()
        tags.append(tag)
    item = CsdnItem(
        tags=tags,
        url=response.url,
        content=response.xpath("//div[@id='content_views']").get())
    yield item
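# Caveat for the link-following loop above: scrapy.Request requires an absolute
# URL, so a relative @href raises ValueError (the first snippet in this
# collection works around that by prepending the domain by hand). Since
# Scrapy 1.4, response.follow resolves relative URLs itself and accepts the
# attribute selector directly; a sketch of the same loop using it:
def parse(self, response):
    for href in response.xpath("//div[@class='content']/a/@href"):
        yield response.follow(href, callback=self.parse)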
def parse(self, response):
    for sel in response.xpath('//*[@id="asideProfile"]'):
        # author = sel.xpath('div[1]/div[2]/div[1]/a/text()').extract()
        # fans = sel.xpath('div[2]/dl[2]/dt/span/text()').extract()
        # like = sel.xpath('div[2]/dl[3]/dt/span/text()').extract()
        # comment = sel.xpath('div[2]/dl[4]/dt/span/text()').extract()
        # print(author, fans, like, comment)
        item = CsdnItem()
        item['author'] = sel.xpath(
            'div[1]/div[2]/div[1]/a/span/text()').extract()  # .strip()
        item['fans'] = sel.xpath('div[2]/dl[2]/dt/span/text()').extract()
        item['like'] = sel.xpath('div[2]/dl[3]/dt/span/text()').extract()
        item['comment'] = sel.xpath(
            'div[2]/dl[4]/dt/span/text()').extract()
        yield item
def parse(self, response):
    # First grab the div tags with class="list_con" on the page, then their
    # child divs with class="title oneline"
    body = response.xpath(
        '//div[@class="list_con"]//div[@class="title oneline"]')
    for value in body:
        item = CsdnItem()
        try:
            item['title'] = value.xpath('./h2/a/text()')[0].extract().strip()
            item['href'] = value.xpath('./h2/a/@href')[0].extract()
        except Exception as e:
            print(e)
        else:
            print(item['title'] + ':' + item['href'])
            yield item
def parse_item(self, response):
    sel = Selector(response)
    item = CsdnItem()
    title = sel.xpath(
        '//div[@id="article_details"]/div/h1/span/a/text()').extract()
    article_url = str(response.url)
    time = sel.xpath(
        '//div[@id="article_details"]/div[2]/div/span[@class="link_postdate"]/text()'
    ).extract()
    readtimes = sel.xpath(
        '//div[@id="article_details"]/div[2]/div/span[@class="link_view"]/text()'
    ).extract()
    # as in the first snippet, the Python 2 .encode('utf-8') calls are dropped:
    # extract() returns str in Python 3 and bytes would break .replace() here
    item['title'] = [n.replace("\r\n", "").strip() for n in title]
    item['time'] = time
    item['readtimes'] = readtimes
    yield item
def parse_item(self, response):
    sel = response.selector
    posts = sel.xpath(
        '//*[@id="article_list"]/div[@class="list_item article_item"]')
    items = []
    for p in posts:
        item = CsdnItem()
        item['title'] = p.xpath(
            './/span[@class="link_title"]/a/text()').extract_first()
        item['pdate'] = p.xpath(
            './/span[@class="link_postdate"]/text()').extract_first()
        item['url'] = response.url
        item['description'] = p.xpath(
            './/*[@class="article_description"]/text()').extract_first()
        items.append(item)
    return items
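# extract_first(), used above (and aliased as .get() in newer Scrapy), returns
# None when nothing matches, whereas the extract()[0] pattern in several other
# snippets raises IndexError on a miss. A small self-contained illustration:
from scrapy.selector import Selector

sel = Selector(text='<p>no title here</p>')
print(sel.xpath('//span[@class="link_title"]/text()').extract_first())  # None
# sel.xpath('//span[@class="link_title"]/text()').extract()[0]  # IndexError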
def parse_item(self, response):
    name = response.xpath('//div[@class="article-intro"]/h1/text()').get()
    if response.xpath('//div[@class="article-intro"]/h1/span/text()').get():
        name += response.xpath(
            '//div[@class="article-intro"]/h1/span/text()').get()
    contents = response.xpath('//div[@class="article-intro"]//text()').getall()
    title = []
    title.append(name)
    if response.xpath('//div[@class="article-intro"]/h2/text()').get():
        title += response.xpath(
            '//div[@class="article-intro"]/h2/text()').getall()
    if response.xpath('//div[@class="article-intro"]/h3/text()').get():
        title += response.xpath(
            '//div[@class="article-intro"]/h3/text()').getall()
    print("===============")
    print(name)
    print(title)
    content_list = []
    for i in contents:
        # if content == "\r\n":
        #     continue
        if "\t" in i:
            continue
        if "\n" in i:
            continue
        # wrap headings in newlines so they stand apart from body text
        if i in title:
            content_list.append("\n")
        content_list.append(i.strip())
        if i in title:
            content_list.append("\n")
    content = " ".join(content_list)
    print(content)
    item = CsdnItem(name=name, content=content)
    print(item)
    yield item
def parseComent(self, response):
    title = response.xpath("//h1[@class='title-article']/text()")[0].get()
    tag = class_name  # class_name is presumably defined at module or class scope
    body = response.xpath("//div[@id='article_content']")[0].get()
    yield CsdnItem(title=title, tag=tag, body=body)
def parse(self, response):
    # wantnum = CsdnPipeline.wantnum
    # num = CsdnPipeline.num
    # print('the num is ', num)
    # print('the wantnum is ', wantnum)
    # print("parse_item>>>>>>")
    print('entering the spider')
    item = CsdnItem()
    sel = Selector(response)
    blog_url = str(response.url)
    blog_name = sel.xpath(
        '//div[@id="article_details"]/div/h1/span/a/text()').extract()
    item['blog_viewnum'] = sel.xpath(
        '//span[@class="link_view"]/text()').extract()[0]
    item['blog_time'] = sel.xpath(
        '//span[@class="link_postdate"]/text()').extract()[0]
    item['blog_author'] = sel.xpath(
        '//div[@id="blog_title"]/h2/a/text()').extract()[0]
    item['blog_comment'] = ''
    for eacht in sel.xpath('//span[@class="link_comments"]//text()').extract():
        item['blog_comment'] += eacht
    item['blog_title'] = blog_name[0].replace("\r\n", '').replace(" ", '')
    item['blog_content'] = ''
    alltext = sel.xpath('//div[@id="article_content"]//text()').extract()
    for i in range(len(alltext) - 1):
        onetext = alltext[i].replace("\r\n", '').replace(' ', '')
        if '$(function()' in onetext:
            print('found the inline script, skipping it')
            continue
        onetext = onetext.replace("'", "\\'")
        onetext = onetext.replace('"', '\\"')
        item['blog_content'] += onetext
    print(item['blog_content'])
    item['blog_picture'] = sel.xpath(
        '//div[@id="article_content"]//img/@src').extract()
    item['blog_url'] = blog_url
    yield item
    preurl = sel.xpath(
        '//ul[@class="article_next_prev"]/li[@class="prev_article"]/a/@href'
    ).extract()
    nexturl = sel.xpath(
        '//ul[@class="article_next_prev"]/li[@class="next_article"]/a/@href'
    ).extract()
    if self.num < int(self.wantnum):
        if self.direct == '1':
            if preurl:
                self.num += 1
                yield Request(url=preurl[0], callback=self.parse)
        elif self.direct == '2':
            if nexturl:
                self.num += 1
                yield Request(url=nexturl[0], callback=self.parse)
        else:
            print('the direction you entered is invalid')
            return
def parse(self, response):
    '''
    @Author: 孟红全
    @Time: 2019/4/22 10:21 am
    @Increment the article-tag index by 1.
    @k is a class attribute; inside a method it must be accessed as self.k:
        print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
        print(self.k)
    '''
    self.k = self.k + 1
    for i in range(pageCount):
        '''
        @Author: 孟红全
        @Time: 2019/4/21 3:11 pm
        @Debugging the for loop:
            print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
            print(i)
            print('//*[@id="feedlist_id"]/li[' + str(i) + ']')
        '''
        # iterate over each page under every tag
        selectors_feedlist = response.xpath('//*[@id="feedlist_id"]/li[' + str(i) + ']')
        if selectors_feedlist:
            '''
            @Author: 孟红全
            @Time: 2019/4/23 5:20 pm
            @Extract the data and clean it.
            '''
            # strip whitespace with an re regular expression
            try:
                title = selectors_feedlist.xpath('div/div[1]/h2/a/text()').extract_first(0)
                title = re.sub(r'\s', '', title)
                # alternatively, strip whitespace with strip():
                # title.strip()
                summary = selectors_feedlist.xpath('div/div[2]/text()').extract_first(0)
                summary = re.sub(r'\s', '', summary)
                # summary.strip()
                readCount = selectors_feedlist.xpath('div/dl/div[2]/dd[1]/a/span[2]/text()').extract_first(0)
                readCount = re.sub(r'\s', '', readCount)
                author = selectors_feedlist.xpath('div/dl/dd[1]/a/text()').extract_first(0)
                author = re.sub(r'\s', '', author)
                url = selectors_feedlist.xpath('div/div[1]/h2/a/@href').extract_first(0)
                # the original never assigned this re.sub() result, so the
                # cleaned URL was silently discarded
                url = re.sub(r'\s', '', url)
                date = selectors_feedlist.xpath('div/dl/dd[2]/text()').extract_first(0)
                date = re.sub(r'\s', '', date)
                # create the item object and fill in the fields for this entry
                item = CsdnItem()
                item['title'] = title
                item['summary'] = summary
                item['readCount'] = readCount
                item['author'] = author
                item['tag'] = tags[self.k]
                item['url'] = url
                item['date'] = date
            except:
                pass
            print(item)
            # hand the assembled item to the pipeline
            yield item
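# The re.sub(r'\s', '', ...) calls above delete *all* whitespace, including the
# spaces inside a string, which is stronger than the strip() alternative the
# comments mention. A quick comparison on a made-up string:
import re

s = '  hello world\r\n'
print(re.sub(r'\s', '', s))  # 'helloworld' -- every whitespace char removed
print(s.strip())             # 'hello world' -- only leading/trailing removed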
def parse(self, response):
    item = CsdnItem()
    item['title'] = response.css("h1.title-article::text").extract_first()
    item['readcount'] = response.css("span.read-count::text").extract()
    return item
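# The CSS selectors in this last snippet match the same nodes as the XPath
# variants used earlier (h1.title-article, span.read-count). A minimal complete
# spider wrapping this parse, with a hypothetical start URL:
import scrapy

class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    start_urls = ['https://blog.csdn.net/someuser/article/details/123']  # hypothetical

    def parse(self, response):
        yield {
            'title': response.css('h1.title-article::text').get(),
            'readcount': response.css('span.read-count::text').get(),
        }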