Esempio n. 1
0
    def parse(self, response):
        """Extract one weibo page into a TextItem, then follow pagination."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.weibo(settings.SNWB_STORE, response.url)
        item_loader.add_value('path', store_path)
        self.load_text(response, item_loader)
        yield item_loader.load_item()

        # Keep crawling while a further page is available.
        follow_url = self.load_next(response)
        if follow_url:
            yield scrapy.Request(follow_url, dont_filter=True)
Esempio n. 2
0
 def parse_view(self, response):
     """Build a TextItem for a 163 'view' page: <h3> title, feed-text body."""
     item_loader = TextLoader(item=TextItem(), response=response)
     store_path = self.pathextractor.host(settings.WY163_STORE, response.url)
     item_loader.add_value('path', store_path)
     item_loader.add_xpath('title', '//h3/text()')
     item_loader.add_xpath('text', '//div[@class="feed-text"]/p/text()')
     return item_loader.load_item()
Esempio n. 3
0
    def parse_item(self, response):
        """Scrape an sbkk8 article: <h1> title plus paragraph text from the
        first container selector below that matches anything."""
        loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.SBKK8_STORE, response.url)
        loader.add_value('path', store_path)
        loader.add_xpath('title', '//h1/text()')

        # Page layouts vary; fall back through progressively looser selectors.
        selectors = (
            '//div[@id="f_article"]//p',
            '//div[@id="f_article"]/div',
            '//div[@id="f_article"]',
            '//div[@id="articleText"]//p',
        )
        paragraphs = []
        for selector in selectors:
            paragraphs = response.xpath(selector)
            if paragraphs:
                break

        for paragraph in paragraphs:
            fragments = paragraph.xpath('.//text()').extract()
            loader.add_value('text', ''.join(fragments))

        return loader.load_item()
Esempio n. 4
0
    def parse_spec(self, response):
        """Scrape a 163 special page: <h1> title, sub-headings and all <p> text."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.WY163_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//h1/text()')
        # Sub-headings are folded into the body text.
        item_loader.add_xpath('text', '//h2/text()')
        item_loader.add_xpath('text', '//h3/text()')

        for paragraph in response.xpath('//p'):
            fragments = paragraph.xpath('./text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        return item_loader.load_item()
Esempio n. 5
0
    def parse_item(self, response):
        """Scrape a baike entry: <h1> title, heading text and every
        div.para block as body text."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.baike(settings.BAIKE_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//h1/text()')
        # Section headings contribute to the body text as well.
        item_loader.add_xpath('text', '//h2//text()')
        item_loader.add_xpath('text', '//h3//text()')

        for block in response.xpath('//div[@class="para"]'):
            fragments = block.xpath('.//text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        return item_loader.load_item()
Esempio n. 6
0
    def parse_item(self, response):
        """Scrape a tianya BBS page: <h1> title plus post bodies."""
        item_loader = TextLoader(item=TextItem(), response=response)
        # NOTE: the tianya path extractor takes the whole response, not the URL.
        store_path = self.pathextractor.tianya(settings.TY_STORE, response)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//h1//text()')
        item_loader.add_xpath('text', '//div[@class="bbs-content"]/text()')
        return item_loader.load_item()
Esempio n. 7
0
    def parse_item(self, response):
        """Scrape a hao1111 article: headline and article body text."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.HAO1111_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//div[@class="article-title"]/h1/text()')
        item_loader.add_xpath('text', '//div[@class="article-content"]/text()')
        return item_loader.load_item()
Esempio n. 8
0
    def parse_item(self, response):
        """Scrape a tieba thread: main post bodies plus embedded comments.

        Comments are carried inside the raw page source as escaped strings
        (\\uXXXX sequences), so they are pulled out with the spider's
        precompiled regexes (``self.pat_fnd`` / ``self.pat_sub``) and
        unescaped manually.
        """
        loader = TextLoader(item=TextItem(), response=response)

        path = self.pathextractor.tieba(settings.TB_STORE, response)
        loader.add_value('path', path)
        # Tieba threads carry no usable title; store an empty one.
        loader.add_value('title', '')

        # Main post content.
        loader.add_xpath('text', '//div[contains(@class, "d_post_content j_d_post_content")]/text()')

        # Comment content, embedded as escaped strings in the raw HTML.
        comnt_list = self.pat_fnd.findall(response.body.decode('utf-8'))
        for comnt in comnt_list:
            text = self.pat_sub.sub('', comnt)
            # BUG FIX: ``str.decode`` does not exist on Python 3 (the body
            # was already decoded to str above).  Round-trip through
            # latin-1 so raw_unicode_escape can resolve the \uXXXX
            # sequences; works on both Python 2 and 3 for the ASCII-safe
            # escaped source text.
            text = text.encode('latin-1').decode('raw_unicode_escape')
            loader.add_value('text', text)

        return loader.load_item()
Esempio n. 9
0
    def parse_shtml(self, response):
        """Scrape a sohu article, handling both current and legacy layouts."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.SOHU_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//h1/text()')

        paragraphs = response.xpath('//div[@id="contentText"]//p')
        for paragraph in paragraphs:
            fragments = paragraph.xpath('.//text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        # Legacy pages keep the body under a different container.
        if not paragraphs:
            for node in response.xpath('//div[@id="sohu_content"]/p/text()'):
                item_loader.add_value('text', node.extract())

        return item_loader.load_item()
Esempio n. 10
0
 def parse_wx(self, response):
     """Scrape a ty2016 'wx' page: every <p> text node becomes body text.

     URLs that yield no text are appended to ``url.txt`` for later review.
     """
     loader = TextLoader(item=TextItem(), response=response)
     path = self.pathextractor.host(settings.TY2016_STORE, response.url)
     loader.add_value('path', path)
     # No reliable title element on these pages; store an empty one.
     loader.add_value('title', '')
     loader.add_xpath('text', '//p//text()')
     item = loader.load_item()

     if ('text' not in item) or (item['text'] == ''):
         # BUG FIX: the original opened the file in text mode and wrote
         # UTF-8 bytes, which raises TypeError on Python 3.  Binary append
         # mode keeps the encoded write portable across Python 2 and 3.
         with open('url.txt', 'ab') as url_file:
             url_file.write((response.url + '\n').encode('utf-8'))

     return item
Esempio n. 11
0
    def parse(self, response):
        """Extract one weibo page; re-login when the server denies service,
        otherwise follow the next page or fall back to the follow list."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.weibo(settings.SNWB_STORE, response.url)
        item_loader.add_value('path', store_path)
        self.load_text(response, item_loader)
        yield item_loader.load_item()

        if self.deny_serv(response):
            # Access denied: re-authenticate and retry this URL.
            yield self.snwblogin.login([response.url])
            return

        next_page = self.load_next(response)
        if next_page:
            log.msg('next page')
            yield scrapy.Request(next_page, dont_filter=True)
        else:
            follow = self.load_follow(response)
            log.msg('follow')
            log.msg(follow)
            yield scrapy.Request(follow, callback=self.parse_follow)
Esempio n. 12
0
    def parse_shtml(self, response):
        """Scrape a bjcg page: title from div.main_xl_bt, cleaned paragraph
        text from div.main_xl_center."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.BJCG_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//div[@class="main_xl_bt"]/text()')

        for paragraph in response.xpath('//div[@class="main_xl_center"]//p'):
            joined = ''.join(paragraph.xpath('.//text()').extract())
            # Strip boilerplate matched by the spider's text pattern.
            item_loader.add_value('text', self.pat_text.sub('', joined))

        return item_loader.load_item()
Esempio n. 13
0
    def parse_item(self, response):
        """Scrape a sina article; body paragraphs live under #artibody on
        current pages, #article on older ones."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.SINA_STORE, response.url)
        item_loader.add_value('path', store_path)
        # NOTE(review): the <h1> headline is loaded into 'text' rather than
        # 'title', unlike the sibling spiders — confirm this is intentional.
        item_loader.add_xpath('text', '//h1/text()')

        paragraphs = response.xpath('//div[@id="artibody"]//p')
        if not paragraphs:
            paragraphs = response.xpath('//div[@id="article"]//p')

        for paragraph in paragraphs:
            fragments = paragraph.xpath('.//text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        return item_loader.load_item()
Esempio n. 14
0
    def parse_item(self, response):
        """Scrape a baike entry: <h1> title, section heading text and every
        div.para block as body text."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.BK_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//h1/text()')

        # Section headings contribute to the body text as well.
        item_loader.add_xpath('text', '//h2/span[@class="title-text"]/text()')
        item_loader.add_xpath('text', '//h3/span[@class="title-text"]/text()')

        for block in response.xpath('//div[@class="para"]'):
            fragments = block.xpath('.//text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        return item_loader.load_item()
Esempio n. 15
0
    def parse_item(self, response):
        """Store a baidumusic page: path from the URL, text via getText."""
        item_loader = TextLoader(item=TextItem(), response=response)
        item_loader.add_value('path', self.pathextractor.baidumusic(response.url))
        item_loader.add_value('text', self.getText(response))
        return item_loader.load_item()
Esempio n. 16
0
    def parse_shtml(self, response):
        """Scrape a 163 article; body paragraphs sit under #endText on
        current pages, #text or #content on legacy ones."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.WY163_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//h1/text()')

        # Try each known container in order until one matches (legacy
        # example: http://news.163.com/05/0130/10/1BBB83S30001121Q.html).
        paragraphs = []
        for selector in ('//div[@id="endText"]/p',
                         '//div[@id="text"]/p',
                         '//div[@id="content"]/p'):
            paragraphs = response.xpath(selector)
            if paragraphs:
                break

        for paragraph in paragraphs:
            fragments = paragraph.xpath('.//text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        return item_loader.load_item()
Esempio n. 17
0
    def parse_shtml(self, response):
        """Scrape a QQ news article, handling the current layout and two
        legacy ones.

        Legacy layouts also carry the title in a different element, so the
        'title' field is replaced when falling back.
        """
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.QQ_STORE, response.url)
        item_loader.add_value('path', store_path)

        item_loader.add_xpath('title', '//h1/text()')
        paragraphs = response.xpath(
            '//div[@id="Cnt-Main-Article-QQ"]/p[@style="TEXT-INDENT: 2em"]')
        if not paragraphs:
            paragraphs = response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')

        # Legacy layouts: body and title live under different ids.
        if not paragraphs:
            item_loader.replace_xpath('title', '//div[@id="ArtTit"]/text()')
            paragraphs = response.xpath('//div[@id="ArtCnt"]//p')
        if not paragraphs:
            item_loader.replace_xpath('title', '//div[@id="ArticleTit"]/text()')
            paragraphs = response.xpath('//div[@id="ArticleCnt"]//p')

        for paragraph in paragraphs:
            # Skip paragraphs that only wrap inline scripts.
            if paragraph.xpath('./script'):
                continue
            fragments = paragraph.xpath('.//text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        return item_loader.load_item()
Esempio n. 18
0
    def parse_original(self, response):
        """Scrape a QQ 'original' feature page: intro, headings, body
        paragraphs and the closing summary all become body text."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.host(settings.QQ_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_xpath('title', '//h1/text()')
        item_loader.add_xpath('text', '//div[@class="daoyu"]//div[@class="intr"]/text()')
        item_loader.add_xpath('text', '//div[@id="articleContent"]/h2/text()')
        item_loader.add_xpath('text', '//div[@id="articleContent"]/h3/text()')
        item_loader.add_xpath('text', '//div[@id="articleContent"]/p/text()')
        item_loader.add_xpath('text', '//div[@class="jieyu"]//text()')
        return item_loader.load_item()
Esempio n. 19
0
    def parse_item(self, response):
        """Scrape a zhihu question page: question title/detail plus every
        answer body become body text; the 'title' field is left empty."""
        item_loader = TextLoader(item=TextItem(), response=response)
        store_path = self.pathextractor.zhihu(settings.ZH_STORE, response.url)
        item_loader.add_value('path', store_path)
        item_loader.add_value('title', '')

        item_loader.add_xpath('text', '//div[@id="zh-question-title"]//text()')
        item_loader.add_xpath('text', '//div[@id="zh-question-detail"]//text()')

        for answer in response.xpath('//div[@class="zm-editable-content clearfix"]'):
            fragments = answer.xpath('.//text()').extract()
            item_loader.add_value('text', ''.join(fragments))

        return item_loader.load_item()