コード例 #1
0
    def parse(self, response):
        """Parse a CoinDesk listing page.

        Persists one NewsItem per article element (featured articles use
        different selectors than regular ones), then follows numeric
        pagination until MAX_PAGES is reached.
        """
        for article in response.css('div.article'):
            featured = article.css('div.article-featured').extract_first()
            item = NewsItem()
            if featured:
                item['title'] = article.css(
                    'div.article-meta > h3::text').extract_first()
                item['description'] = 'some description'
                item['source'] = 'coindesk'
                item['image_url'] = article.css(
                    'div.subfeatured-picture > img::attr(src)').extract_first()
                item['featured'] = True
            else:
                item['title'] = article.css('h3 > a::text').extract_first()
                item['description'] = 'some description'
                item['source'] = 'coindesk'
                item['image_url'] = article.css(
                    'img.wp-post-image::attr(src)').extract_first()
                item['featured'] = False
            item.save()

        # Pagination: the URL is built locally so it is always truthy --
        # only the page cap actually limits the crawl.  (The original also
        # shadowed the builtin `next`.)
        self.curr_page += 1
        if self.curr_page <= MAX_PAGES:
            next_url = ('http://www.coindesk.com/page/'
                        + str(self.curr_page) + '/')
            yield scrapy.Request(url=next_url, callback=self.parse)
        print('finished')
コード例 #2
0
    def parse(self, response):
        """Parse a listing page of posts into NewsItems.

        Persists each item, then follows the "next" pagination link while
        the page counter stays under MAX_PAGES.
        """
        items = []

        for listing in response.xpath('//div[@data-context="listing"]'):
            item = NewsItem()
            item['use_in_report'] = False
            # Guard against a missing title node so the slice cannot raise
            # on None; cap at 254 chars (presumably a DB column limit --
            # TODO confirm against the model).
            title = listing.css('a.title::text').extract_first() or ''
            item['title'] = title[:254]
            item['description'] = ''

            image_url = listing.css('a.thumbnail img::attr(src)').extract_first()
            # Listing thumbnails are protocol-relative; prefix a scheme.
            item['image_url'] = 'https:' + image_url if image_url else ''

            domain = listing.css('span.domain a::text').extract_first()
            item['category'] = detect_category(domain)

            # Keep only the digits from e.g. "45 comments".  The original
            # used the character class '[ comments]' (strips those letters
            # one-by-one) and crashed with TypeError when the comments link
            # was absent.
            comments = listing.css('a.bylink::text').extract_first() or ''
            digits = re.sub(r'\D', '', comments)
            item['comments'] = int(digits) if digits else 0

            votes = listing.css('div.unvoted::text').extract_first()
            item['votes'] = get_vote_number(votes)

            item.save()
            items.append(item)

        # Pagination (renamed from `next`, which shadowed the builtin).
        self.curr_page += 1
        next_url = response.css('span.next-button a::attr(href)').extract_first()
        if next_url and self.curr_page < MAX_PAGES:
            yield scrapy.Request(url=next_url, callback=self.parse)
        print('finished ' + str(self.curr_page) + ' page')
コード例 #3
0
ファイル: gmanews_spider.py プロジェクト: makmac213/arkhive
    def parse_items(self, response):
        """Extract a GMA News story page into a saved NewsItem.

        Returns the saved item, or None (implicitly) when the page has no
        story title, e.g. a non-article URL.
        """
        title = response.xpath('//div[@class="story"]/div[@class="title"]\
                                /h1/text()').extract()
        if title:
            link = response.url
            title = title[0]
            created = response.xpath('//div[@class="story"]/div\
                        /span[@class="timestamp"]/text()').extract()[0]
            # Drop the trailing two characters before parsing -- presumably
            # an AM/PM-style suffix the format string does not cover.
            # TODO(review): confirm against a live timestamp.
            created = created[:-2]
            created = time.strptime(created, "%B %d, %Y %I:%M")
            content = response.xpath('//div[@class="story"]\
                        /div[@class="main"]/div[@class="text_body"]').extract(
            )
            tags = response.xpath('//div[@class="story"]\
                        /div[@class="main"]/div[@class="tags"]\
                        /a[@class="tag"]/text()').extract()

            item = NewsItem()
            item['link'] = link
            item['title'] = title
            item['created'] = strftime('%Y-%m-%d', created)
            item['content'] = content
            # De-duplicate tags; note set() does not preserve order.
            item['tags'] = list(set(tags))
            item.save()

            return item
コード例 #4
0
    def parse_page(self, response):
        """Scrape a Wired article page into a NewsItem via ItemLoader.

        Pages whose og:type is not "article" are skipped silently.
        """
        og_type = response.xpath('//meta[contains(@property, "og:type")]//@content'
                                 ).extract_first()

        # The original used `x in 'article'`, a substring test that raises
        # TypeError when the meta tag is absent (x is None) and matches ''
        # or any substring such as 'art'; compare for equality instead.
        if og_type == 'article':
            title = response.xpath(
                '//meta[contains(@property, "og:title")]//@content'
            ).extract_first()
            # Stable per-article id derived from the title.
            file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline',
                        '//meta[contains(@property, "og:title")]//@content')
            l.add_value('file_id', file_id)
            l.add_value('title', title)
            l.add_value('link', response.url)
            l.add_xpath(
                'description',
                '//meta[contains(@property, "og:description")]//@content')
            l.add_xpath('author',
                        '//meta[contains(@name, "author")]//@content')
            l.add_xpath('content',
                        '//div[contains(@class, "body")]//p//text()')
            l.add_xpath('pubDate', '//time//text()')
            l.add_value('source', 'wired')

            yield l.load_item()
        # (The original `else: next` evaluated the builtin and did nothing.)
コード例 #5
0
    def parse_page(self, response, title, description, pubDate, author):
        """Scrape a Reuters article page into a NewsItem via ItemLoader.

        Title, description and pubDate come from the feed/caller; the body
        and author are extracted from the page.  Non-article pages are
        skipped silently.
        """
        og_type = response.xpath('//meta[contains(@property, "og:type")]//@content'
                                 ).extract_first()

        # `x in 'article'` was a substring test (TypeError on a missing
        # meta tag); compare for equality instead.
        if og_type == 'article':
            # Stable per-article id derived from the title.
            file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline',
                        '//h1[contains(@class, "headline")]//text()')
            l.add_value('file_id', file_id)
            l.add_value('title', title)
            l.add_value('link', response.url)
            l.add_value('description', description)
            l.add_xpath('author',
                        '//meta[contains(@name, "Author")]//@content')
            # NOTE: the adjacent string literals previously concatenated to
            # "...orcontains(..." -- an invalid XPath; a space after "or"
            # is required.
            l.add_xpath(
                'content', '//div[contains(@class, "StandardArticleBody") or '
                'contains( @class , "Attribution_attribution")]/p//text()')
            l.add_xpath(
                'content',
                '//div[contains(@class, "StandardArticleBody")]/h3//text()')
            l.add_value('pubDate', pubDate)
            l.add_value('source', 'reuters')

            yield l.load_item()
        # (The original `else: next` evaluated the builtin and did nothing.)
コード例 #6
0
    def parse_page(self, response):
        """Scrape a Times UK article page into a NewsItem via ItemLoader.

        Non-article pages (og:type != "article") are skipped silently.
        """
        og_type = response.xpath('//meta[contains(@property, "og:type")]//@content'
                                 ).extract_first()

        # `x in 'article'` was a substring test (TypeError on a missing
        # meta tag); compare for equality instead.
        if og_type == 'article':
            link = response.url
            # Stable per-article id derived from the URL.
            file_id = hashlib.md5(link.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline',
                        '//meta[contains(@property, "og:title")]//@content')
            l.add_value('file_id', file_id)
            l.add_xpath('title', '//h1[contains(@role, "heading")]//text()')
            l.add_value('link', link)
            l.add_xpath(
                'description',
                '//meta[contains(@property, "description")]//@content')
            l.add_xpath('author',
                        '//meta[contains(@name, "author")]//@content')
            l.add_xpath('content',
                        '//article[contains(@role, "article")]//p//text()')
            l.add_xpath('pubDate', '//time/@datetime')
            l.add_value('source', 'thetimesUK')

            yield l.load_item()
        # (The original `else: next` evaluated the builtin and did nothing.)
コード例 #7
0
    def parse_page(self, response, title, author):
        """Scrape an Observer article page into a NewsItem via ItemLoader.

        The on-page entry title is preferred; the `title` argument is kept
        as a fallback when the page lacks one.  Non-article pages are
        skipped silently.
        """
        og_type = response.xpath('//meta[contains(@property, "og:type")]//@content'
                                 ).extract_first()

        # `x in 'article'` was a substring test (TypeError on a missing
        # meta tag); compare for equality instead.
        if og_type == 'article':
            page_title = response.xpath(
                '//h1[contains(@class, "entry-title")]//text()'
            ).extract_first()
            # Fall back to the caller-supplied title so .encode() below
            # cannot raise on None.
            title = page_title if page_title else title
            file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline',
                        '//h1[contains(@class, "entry-title")]//text()')
            l.add_value('file_id', file_id)
            l.add_value('title', title)
            l.add_value('link', response.url)
            l.add_xpath('description',
                        '//meta[contains(@name, "description")]//@content')
            l.add_xpath('author', '//a[contains(@rel, "author")]//text()')
            # NOTE: the adjacent string literals previously concatenated to
            # "...orcontains(..." -- an invalid XPath; a space after "or"
            # is required.
            l.add_xpath(
                'content', '//div[contains(@itemprop, "articleBody") or '
                'contains(@class, "post-content")]//p//text()')
            l.add_xpath('pubDate', '//article/@data-date')
            l.add_value('source', 'theobserver')

            yield l.load_item()
コード例 #8
0
    def parse_page(self, response):
        """Scrape a CNN article page into a NewsItem via ItemLoader.

        Non-article pages (og:type != "article") are skipped silently.
        """
        og_type = response.xpath('//meta[contains(@property, "og:type")]//@content'
                                 ).extract_first()

        # `x in 'article'` was a substring test (TypeError on a missing
        # meta tag); compare for equality instead.
        if og_type == 'article':
            # BUG FIX: `title` was referenced here without ever being
            # defined (NameError).  Derive it from the og:title meta tag.
            title = response.xpath(
                '//meta[contains(@property, "og:title")]//@content'
            ).extract_first() or ''
            file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline',
                        '//meta[contains(@property, "og:title")]//@content')
            l.add_value('file_id', file_id)
            # BUG FIX: title/description/pubDate used add_value() with an
            # XPath string, which would store the literal XPath text as the
            # field value; they must be add_xpath().
            l.add_xpath('title', '//meta[contains(@name, "title")]//@content')
            l.add_value('link', response.url)
            l.add_xpath('description',
                        '//meta[contains(@name, "description")]//@content')
            l.add_xpath('author',
                        '//meta[contains(@name, "author")]//@content')
            # BUG FIX: missing spaces after "or" made the concatenated
            # XPath read "...orcontains(..." -- invalid.
            l.add_xpath(
                'content', '//div[contains(@class, "zn-body__paragraph") or '
                'contains(@class, "BasicArticle") or '
                'contains(@class, "Paragraph__component")]//text()')
            l.add_xpath(
                'content',
                '//div[contains(@class, "zn-body__paragraph")]//h3//text()')
            l.add_xpath('pubDate',
                        '//meta[contains(@name, "pubdate")]//@content')
            l.add_value('source', 'cnn')

            yield l.load_item()
        # (The original `else: next` evaluated the builtin and did nothing.)
コード例 #9
0
    def parse_page(self, response, title, description, pubDate, author):
        """Scrape a BBC article page into a NewsItem via ItemLoader.

        Title, description and pubDate come from the feed/caller; body and
        author are extracted from the page.  Non-article pages are skipped
        silently.
        """
        og_type = response.xpath('//meta[contains(@property, "og:type")]//@content'
                                 ).extract_first()

        # `x in 'article'` was a substring test (TypeError on a missing
        # meta tag); compare for equality instead.
        if og_type == 'article':
            # Stable per-article id derived from the title.
            file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline',
                        '//meta[contains(@property, "og:title")]//@content')
            l.add_value('file_id', file_id)
            l.add_value('title', title)
            l.add_value('link', response.url)
            l.add_value('description', description)
            l.add_xpath(
                'author',
                '//meta[contains(@property, "article:author")]//@content')
            # BUG FIX: missing spaces after "or" made the concatenated
            # XPath read "...orcontains(..." -- invalid.
            l.add_xpath(
                'content', '//div[contains(@class, "story-body__inner") or '
                'contains(@id, "story-body") or '
                'contains(@class, "media__summary")]//p//text()')
            l.add_xpath(
                'content',
                '//div[contains(@class, "story-body__inner")]//h2//text()')
            l.add_value('pubDate', pubDate)
            l.add_value('source', 'bbc')

            yield l.load_item()
        # (The original `else: next` evaluated the builtin and did nothing.)
コード例 #10
0
ファイル: inquirer_spider.py プロジェクト: makmac213/arkhive
    def parse_items(self, response):
        """Extract an Inquirer article page into a saved NewsItem.

        Returns the saved item, or None (implicitly) when the page has no
        headline, e.g. a non-article URL.
        """
        title = response.xpath('//div[@class="al-headline"]/\
                                div[@class="container"]/h1').extract()
        if title:
            link = response.url

            title = strip_tags(title[0])

            # Parse the byline date.  The split/slice isolates the date
            # text; [:-4] presumably strips a trailing suffix -- TODO
            # confirm against a live byline.
            created = response.xpath('//h4[@class="byline"]').extract()[0]
            created = created.split('>')[-2].strip()[:-4]
            # Detect the ordinal suffix on the day ("1st,", "2nd," ...).
            # Default to '' so a suffix-less date raises a clear ValueError
            # from strptime instead of TypeError on None concatenation.
            ord_str = ''
            if 'st,' in created:
                ord_str = 'st'
            elif 'nd,' in created:
                ord_str = 'nd'
            elif 'rd,' in created:
                ord_str = 'rd'
            elif 'th,' in created:
                ord_str = 'th'
            created_format = '%H:%M %p | %A, %B %d' + ord_str + ', %Y'
            created = time.strptime(created, created_format)

            content = response.xpath('//div[@class="main-article"]').extract()

            item = NewsItem()
            item['link'] = link
            item['title'] = title
            item['created'] = strftime('%Y-%m-%d', created)
            item['content'] = content
            item.save()

            return item
コード例 #11
0
ファイル: inquirer_spider.py プロジェクト: makmac213/arkhive
    def parse_items(self, response):
        """Extract an Inquirer article page into a saved NewsItem.

        Returns the saved item, or None (implicitly) when the page has no
        headline, e.g. a non-article URL.
        """
        title = response.xpath('//div[@class="al-headline"]/\
                                div[@class="container"]/h1').extract()
        if title:
            link = response.url

            title = strip_tags(title[0])

            # Parse the byline date.  The split/slice isolates the date
            # text; [:-4] presumably strips a trailing suffix -- TODO
            # confirm against a live byline.
            created = response.xpath('//h4[@class="byline"]').extract()[0]
            created = created.split('>')[-2].strip()[:-4]
            # Detect the ordinal suffix on the day ("1st,", "2nd," ...).
            # Default to '' so a suffix-less date raises a clear ValueError
            # from strptime instead of TypeError on None concatenation.
            ord_str = ''
            if 'st,' in created:
                ord_str = 'st'
            elif 'nd,' in created:
                ord_str = 'nd'
            elif 'rd,' in created:
                ord_str = 'rd'
            elif 'th,' in created:
                ord_str = 'th'
            created_format = '%H:%M %p | %A, %B %d' + ord_str + ', %Y'
            created = time.strptime(created, created_format)

            content = response.xpath('//div[@class="main-article"]').extract()

            item = NewsItem()
            item['link'] = link
            item['title'] = title
            item['created'] = strftime('%Y-%m-%d', created)
            item['content'] = content
            item.save()

            return item
コード例 #12
0
    def parse_page(self, response, title, description, pubDate, author):
        """Scrape a NYTimes article page into a NewsItem via ItemLoader.

        Title, description, pubDate and author come from the feed/caller;
        only the headline and body are extracted from the page.
        Non-article pages are skipped silently.
        """
        og_type = response.xpath('//meta[contains(@property, "og:type")]//@content'
                                 ).extract_first()

        # `x in 'article'` was a substring test (TypeError on a missing
        # meta tag); compare for equality instead.
        if og_type == 'article':
            # Stable per-article id derived from the title.
            file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

            l = ItemLoader(item=NewsItem(), response=response)
            l.add_xpath('headline', '//*//h1//text()')
            l.add_value('file_id', file_id)
            l.add_value('title', title)
            l.add_value('link', response.url)
            l.add_value('description', description)
            l.add_value('author', author)
            l.add_xpath(
                'content',
                '//section[contains(@name, "articleBody")]//p//text()')
            l.add_value('pubDate', pubDate)
            l.add_value('source', 'nytimes')

            yield l.load_item()
        # (The original `else: next` evaluated the builtin and did nothing.)