Example 1
 def parse(self, response):
     sel = Selector(response)
     item = CsdnItem()
     title = sel.xpath(
         '//div[@id="article_details"]/div/h1/span/a/text()').extract()
     article_url = str(response.url)
     time = sel.xpath(
         '//div[@id="article_details"]/div[2]/div/span[@class="link_postdate"]/text()'
     ).extract()
     readtimes = sel.xpath(
         '//div[@id="article_details"]/div[2]/div/span[@class="link_view"]/text()'
     ).extract()
     # under Python 3 the extracted strings are already text, so encode() is unnecessary
     item['title'] = [n.replace("\r\n", "").strip() for n in title]
     item['time'] = time
     item['readtimes'] = readtimes
     yield item
     # follow the link to the next article
     urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()
     for url in urls:
         url = "http://blog.csdn.net" + url
         print(url)
         yield Request(url, callback=self.parse)
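Every snippet below fills a CsdnItem whose definition is not shown; the snippets come from different projects, so the field sets differ. A minimal item sketch matching the fields used in Example 1 (hypothetical; the real items.py may declare more fields):

 import scrapy

 class CsdnItem(scrapy.Item):
     title = scrapy.Field()      # cleaned article title
     time = scrapy.Field()       # post-date text
     readtimes = scrapy.Field()  # view-count text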
Example 2
 def parse(self, response):
     item = CsdnItem()
     item['title'] = response.xpath("//a[@class='meta-title']/text()").extract()
     item['desc'] = response.xpath("//span[@class='excerpt']/p/text()").extract()
     item['link'] = response.xpath("//a[@class='meta-title']/@href").extract()
     item['time'] = response.xpath("normalize-space(//p/text())").extract()
     yield item
Example 3
 def parse(self, response):
     item = CsdnItem()
     item['title'] = response.xpath("//h2[@class='csdn-tracking-statistics']"
                                    "/a[@strategy='career']/text()").extract()
     item['desc'] = response.xpath("//div[@class='summary oneline']/text()").extract()
     item['link'] = response.xpath("//h2[@class='csdn-tracking-statistics']"
                                   "/a[@strategy='career']/@href").extract()
     item['time'] = response.xpath("//dd[@class='time']/text()").extract()
     yield item
Example 4
 def parse(self, response):
     item = CsdnItem()
     item['title'] = response.xpath(
         "//a[@class='archive-title']/text()").extract()
     item['desc'] = response.xpath(
         "//span[@class='excerpt']/p/text()").extract()
     item['link'] = response.xpath(
         "//a[@class='archive-title']/@href").extract()
     item['time'] = response.xpath("//dd[@class='time']/text()").extract()
     yield item
Example 5
    def parse(self, response):
        item = CsdnItem()

        item["name"] = response.xpath(
            '//h1[@class="title-article"]/text()').extract()[0]
        item["time"] = response.xpath(
            '//span[@class="time"]/text()').extract()[0]
        item["number"] = response.xpath(
            '//span[@class="read-count"]/text()').extract()[0]
        yield item
Example 6
 def parse_content(self, response):
     content = response.xpath('//div[@class="markdown_views"]').extract()
     item = CsdnItem()
     # the remaining fields were passed along from the previous request via meta
     item['url'] = response.meta['url'][1:]  # drop the leading character of the stored URL
     item['jianjie'] = response.meta['jianjie']
     item['cete'] = response.meta['cete']
     item['date'] = response.meta['date']
     item['count'] = response.meta['count']
     item['author'] = response.meta['author']
     item['title'] = response.meta['title']
     item['content'] = content
     yield item
Example 7
 def format_item(self, response: Response):
     title = response.xpath("//h1[@class='title-article']/text()")[0].get()
     body = response.xpath("//div[@id='article_content']")[0].get()
     taga = response.xpath("//a[@class='tag-link']/text()")
     tags = []
     for i in taga:
         tag = i.get()
         tags.append(tag)
     tag = "<spliter>".join(tags)
     yield CsdnItem(title=title, body=body, tag=tag)
     print("*" * 30 + "parse_item")
     for i in self._explore_all_href(response):
         yield i
Example 8
    def parse(self, response):
        item = CsdnItem()
        # Without /text(), xpath returns selector objects holding the whole node:
        #   [<Selector ..., data='<h1>...</h1>'>]
        # With /text(), it still returns selector objects, but for the node's text:
        #   [<Selector ..., data='text'>]
        # extract() pulls the plain text out of the selector objects: ['text content']
        item["name"] = response.xpath(
            '//h1[@class="title-article"]/text()').extract()[0]  # convert the matched selector to a string
        item["time"] = response.xpath(
            '//span[@class="time"]/text()').extract()[0]
        item["count"] = response.xpath(
            '//span[@class="read-count"]/text()').extract()[0]

        yield item
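The comments in Example 8 describe three stages; spelled out as a scrapy shell session against a hypothetical article page:

 # >>> response.xpath('//h1[@class="title-article"]')
 # [<Selector xpath='//h1[@class="title-article"]' data='<h1 class="title-article">Post t...'>]
 # >>> response.xpath('//h1[@class="title-article"]/text()')
 # [<Selector xpath='//h1[@class="title-article"]/text()' data='Post title'>]
 # >>> response.xpath('//h1[@class="title-article"]/text()').extract()
 # ['Post title']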
Example 9
    def parse_item(self, response):
        item = CsdnItem()
        item['question'] = response.xpath(
            '//div[@class="detail_title"]/h1/span/text()').extract()[0]
        content = response.xpath('//div[@class="detailed"]/table')[0].xpath(
            './/div[@class="post_body"]/text()').extract()
        item['content'] = "".join(content).strip().replace("\n", "").replace(
            " ", "")
        item['answer_num'] = response.xpath(
            '//span[@class="return_time"]/text()').extract()[0]
        item['question_url'] = response.url

        yield item
Example 10
 def parse(self, response):
     for i in response.xpath("//div[@class='content']/a/@href"):
         next_url = i.get()
         if next_url:
             yield scrapy.Request(next_url, self.parse)
     tags = []
     for i in response.xpath("//a[@class='tag-link']/text()"):
         tag = i.get().strip()  # strip() already trims both ends, so lstrip() was redundant
         tags.append(tag)
     item = CsdnItem(
         tags=tags,
         url=response.url,
         content=response.xpath("//div[@id='content_views']").get())
     yield item
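Example 10 uses .get()/.getall() where earlier snippets use .extract_first()/.extract(); the pairs are equivalent, the former being the newer spelling. Side by side (illustrative selectors):

 # first match as a string, or None if nothing matched
 title = response.xpath("//h1[@class='title-article']/text()").get()
 title = response.xpath("//h1[@class='title-article']/text()").extract_first()
 # all matches as a list of strings
 tags = response.xpath("//a[@class='tag-link']/text()").getall()
 tags = response.xpath("//a[@class='tag-link']/text()").extract()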
Example 11
 def parse(self, response):
     for sel in response.xpath('//*[@id="asideProfile"]'):
         # author = sel.xpath('div[1]/div[2]/div[1]/a/text()').extract()
         # fans = sel.xpath('div[2]/dl[2]/dt/span/text()').extract()
         # like = sel.xpath('div[2]/dl[3]/dt/span/text()').extract()
         # comment = sel.xpath('div[2]/dl[4]/dt/span/text()').extract()
         # print(author, fans, like, comment)
         item = CsdnItem()
         item['author'] = sel.xpath(
             'div[1]/div[2]/div[1]/a/span/text()').extract()  #.strip()
         item['fans'] = sel.xpath('div[2]/dl[2]/dt/span/text()').extract()
         item['like'] = sel.xpath('div[2]/dl[3]/dt/span/text()').extract()
         item['comment'] = sel.xpath(
             'div[2]/dl[4]/dt/span/text()').extract()
         yield item
Example 12
 def parse(self, response):
     # first select the div tags with class="list_con", then their child divs with class="title oneline"
     body = response.xpath(
         '//div[@class="list_con"]//div[@class="title oneline"]')
     for value in body:
         item = CsdnItem()
         try:
             item['title'] = value.xpath(
                 './h2/a/text()')[0].extract().strip()
             item['href'] = value.xpath('./h2/a/@href')[0].extract()
         except Exception as e:
             print(e)
         else:
             print(item['title'] + ':' + item['href'])
             # only yield items that were parsed successfully
             yield item
Example 13
 def parse_item(self, response):
     sel = Selector(response)
     item = CsdnItem()
     title = sel.xpath(
         '//div[@id="article_details"]/div/h1/span/a/text()').extract()
     article_url = str(response.url)
     time = sel.xpath(
         '//div[@id="article_details"]/div[2]/div/span[@class="link_postdate"]/text()'
     ).extract()
     readtimes = sel.xpath(
         '//div[@id="article_details"]/div[2]/div/span[@class="link_view"]/text()'
     ).extract()
     # under Python 3 the extracted strings are already text, so encode() is unnecessary
     item['title'] = [n.replace("\r\n", "").strip() for n in title]
     item['time'] = time
     item['readtimes'] = readtimes
     yield item
Example 14
    def parse_item(self, response):

        sel = response.selector
        posts = sel.xpath(
            '//*[@id="article_list"]/div[@class="list_item article_item"]')
        items = []

        for p in posts:
            item = CsdnItem()
            item['title'] = p.xpath(
                './/span[@class="link_title"]/a/text()').extract_first()
            item['pdate'] = p.xpath(
                './/span[@class="link_postdate"]/text()').extract_first()
            item['url'] = response.url
            item['description'] = p.xpath(
                './/*[@class="article_description"]/text()').extract_first()

            items.append(item)

        return items
Example 15
 def parse_item(self, response):
     name = response.xpath('//div[@class="article-intro"]/h1/text()').get()
     span_text = response.xpath(
         '//div[@class="article-intro"]/h1/span/text()').get()
     if span_text:
         name += span_text
     contents = response.xpath(
         '//div[@class="article-intro"]//text()').getall()
     title = []
     title.append(name)
     # getall() returns [] when nothing matches, so no existence check is needed
     title += response.xpath('//div[@class="article-intro"]/h2/text()').getall()
     title += response.xpath('//div[@class="article-intro"]/h3/text()').getall()
     print("===============")
     print(name)
     print(title)
     content_list = []
     for i in contents:
         # if content=="\r\n":
         #     continue
         if "\t" in i:
             continue
         if "\n" in i:
             continue
         if i in title:
             content_list.append("\n")
         content_list.append(i.strip())
         if i in title:
             content_list.append("\n")
     content = " ".join(content_list)
     print(content)
     item = CsdnItem(name=name, content=content)
     print(item)
     yield item
Example 16
 def parseComent(self, response):
     title = response.xpath("//h1[@class='title-article']/text()")[0].get()
     tag = class_name  # class_name is defined elsewhere in the original spider (not shown here)
     body = response.xpath("//div[@id='article_content']")[0].get()
     yield CsdnItem(title=title, tag=tag, body=body)
Example 17
    def parse(self, response):

        # wantnum = CsdnPipeline.wantnum
        # num = CsdnPipeline.num
        # print 'the num is ',num
        # print 'the wantnum is ',wantnum
        # print "parse_item>>>>>>"
        print('entering the spider')
        item = CsdnItem()
        sel = Selector(response)
        blog_url = str(response.url)
        blog_name = sel.xpath(
            '//div[@id="article_details"]/div/h1/span/a/text()').extract()
        # print u'',blog_name[0].replace("\r\n",'').replace(" ",''),''
        item['blog_viewnum'] = sel.xpath(
            '//span[@class="link_view"]/text()').extract()[0]
        item['blog_time'] = sel.xpath(
            '//span[@class="link_postdate"]/text()').extract()[0]
        item['blog_author'] = sel.xpath(
            '//div[@id="blog_title"]/h2/a/text()').extract()[0]
        item['blog_comment'] = ''
        # concatenate every text node of the comments link
        for eacht in sel.xpath('//span[@class="link_comments"]//text()').extract():
            item['blog_comment'] = item['blog_comment'] + eacht

        # print u'',item['blog_viewnum'],''
        item['blog_title'] = blog_name[0].replace("\r\n", '').replace(" ", '')

        item['blog_content'] = ''
        alltext = sel.xpath('//div[@id="article_content"]//text()').extract()

        # for eachtext in alltext:
        #     item['blog_content'] = item['blog_content'] + eachtext.replace("\r\n",'').replace(' ','')

        # note: range(len(alltext) - 1) skips the last text node
        for i in range(len(alltext) - 1):
            onetext = alltext[i].replace("\r\n", '').replace(' ', '')
            if '$(function()' in onetext:
                print('found the inline script, skipping it')
                continue
            # escape quotes so the text can later be embedded in a quoted string
            onetext = onetext.replace("'", "\\'")
            onetext = onetext.replace('"', '\\"')
            item['blog_content'] = item['blog_content'] + onetext

        print(item['blog_content'])

        item['blog_picture'] = []
        allimgurl = sel.xpath(
            '//div[@id="article_content"]//img/@src').extract()
        # print allimgurl
        # for eachurl in allimgurl:
        #     # print type(eachurl)
        #     item['blog_picture'] = item['blog_picture'].append(eachurl)
        item['blog_picture'] = allimgurl

        num = 0
        item['blog_url'] = blog_url  # already a str under Python 3, no encode() needed

        yield item

        preurl = sel.xpath(
            '//ul[@class="article_next_prev"]/li[@class="prev_article"]/a/@href'
        ).extract()
        #print preurl
        nexturl = sel.xpath(
            '//ul[@class="article_next_prev"]/li[@class="next_article"]/a/@href'
        ).extract()
        #print nexturl
        if self.num < int(self.wantnum):
            if self.direct == '1':
                if len(preurl) != 0:
                    self.num = self.num + 1
                    yield Request(url=preurl[0], callback=self.parse)
            elif self.direct == '2':
                if len(nexturl) != 0:
                    self.num = self.num + 1
                    yield Request(url=nexturl[0], callback=self.parse)
            else:
                print('the character you entered is not valid')
                return
Example 18
    def parse(self, response):
        '''
        @Author: 孟红全
        @Time:   2019/4/22 10:21 AM
        @Advance the article-tag index by 1.
        @k is a class attribute; inside a method it must be accessed as self.k
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
        print self.k
        '''

        self.k = self.k+1
        for i in range(pageCount):  # pageCount is defined elsewhere in the source module
            '''
            @Author: 孟红全
            @Time:   2019/4/21 3:11 PM
            @Debugging the for loop
            print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
            print i
            print '//*[@id="feedlist_id"]/li[' + str(i) + ']'
            '''



            # iterate over every page under each tag
            selectors_feedlist = response.xpath('//*[@id="feedlist_id"]/li[' + str(i) + ']')

            if selectors_feedlist:
                '''
                @Author: 孟红全
                @Time:   2019/4/23 5:20 PM
                @Extract and clean the data
                '''
                # strip all whitespace with an re regular expression
                try:
                    title = selectors_feedlist.xpath('div/div[1]/h2/a/text()').extract_first(0)
                    title = re.sub(r'\s', '', title)
                    # alternatively, trim edge whitespace with strip()
                    # title.strip()

                    summary = selectors_feedlist.xpath('div/div[2]/text()').extract_first(0)
                    summary = re.sub(r'\s', '', summary)
                    # alternatively, trim edge whitespace with strip()
                    # summary.strip()

                    readCount = selectors_feedlist.xpath('div/dl/div[2]/dd[1]/a/span[2]/text()').extract_first(0)
                    readCount = re.sub(r'\s', '', readCount)

                    author = selectors_feedlist.xpath('div/dl/dd[1]/a/text()').extract_first(0)
                    author = re.sub(r'\s', '', author)

                    url = selectors_feedlist.xpath('div/div[1]/h2/a/@href').extract_first(0)
                    url = re.sub(r'\s', '', url)  # the original discarded the result of re.sub

                    date = selectors_feedlist.xpath('div/dl/dd[2]/text()').extract_first(0)
                    date = re.sub(r'\s', '', date)

                    # create the item object
                    # fill in the fields extracted from this entry
                    item = CsdnItem()
                    item['title'] = title
                    item['summary'] = summary
                    item['readCount'] = readCount
                    item['author'] = author
                    item['tag'] = tags[self.k]
                    item['url'] = url
                    item['date'] = date
                except Exception:
                    # skip entries whose fields failed to extract
                    continue

                print(item)
                # hand the assembled item to the pipeline
                yield item
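On the cleaning step used throughout Example 18: re.sub(r'\s', '', text) removes all whitespace, including interior spaces and newlines, while text.strip() only trims the ends, which is presumably why the regex version was kept. For example:

 import re

 s = '  hello  world \n'
 print(re.sub(r'\s', '', s))  # 'helloworld'
 print(s.strip())             # 'hello  world'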
Example 19
 def parse(self, response):
     item = CsdnItem()
     item['title'] = response.css("h1.title-article::text").extract_first()
     item['readcount'] = response.css("span.read-count::text").extract()
     return item
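Any of these callbacks can be exercised outside a full Scrapy project by running the spider from a script with CrawlerProcess. A minimal sketch reusing the selectors from Example 19 (the start URL is a placeholder; point it at a real CSDN article):

 import scrapy
 from scrapy.crawler import CrawlerProcess

 class CsdnDemoSpider(scrapy.Spider):
     name = 'csdn_demo'
     start_urls = ['https://blog.csdn.net/']  # placeholder: replace with an article URL

     def parse(self, response):
         # same CSS selectors as Example 19, yielded as a plain dict
         yield {
             'title': response.css('h1.title-article::text').get(),
             'readcount': response.css('span.read-count::text').getall(),
         }

 if __name__ == '__main__':
     process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
     process.crawl(CsdnDemoSpider)
     process.start()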