Example #1
    def parse(self, response):
        # the headlines we need are spread across three different XPaths.
        head1 = response.xpath("//ul[@class='headlines medium']//div//h2[@class='title']//a")
        head2 = response.xpath("//ul[contains(@class, 'headlines large')]//div//h2[@class='title']//a")
        head3 = response.xpath("//ul[contains(@class, 'headlines small')]//div//h2[@class='title']//a")
        # for ease of use, all three lists will be combined into one large one
        # that we can iterate over.
        headlines = head1 + head2 + head3

        if headlines:
            for head in headlines:
                nsil = NewsScraperItemLoader(selector=head)
                anchor = nsil.get_xpath("./@href", Join())
                new_url = urljoin(response.url, anchor)

                yield Request(url=new_url, callback=self.parse_article)

            if 'news' in response.url:
                next_anchor = "/ajax.php?action=frontpage&page=%s&platform=&type=news&topic=" % self.page
            elif 'reviews' in response.url:
                next_anchor = "/ajax.php?action=frontpage&page=%s&platform=&type=reviews&topic=" % self.page
            else:
                # neither the news nor the reviews listing; nothing to paginate.
                return

            self.page += 1
            next_page = urljoin(response.url, next_anchor)

            yield Request(url=next_page, callback=self.parse, dont_filter=True)
Example #2
    def parse_index(self, response):
        # news and reviews for now.
        links = response.xpath("//div[@class='headline-wrap']//div[@class='headline']//h3/parent::a")

        # I'm hoping that this will fire when I expect it to.
        # that is to say, at the end of the run, when there are
        # no links left.
        if not links:
            raise CloseSpider("No results remaining.")

        for link in links:
            nsil = NewsScraperItemLoader(selector=link)
            rel = nsil.get_xpath(".//@href", Join())
            new_url = urljoin(response.url, rel)

            yield Request(url=new_url, callback=self.parse_article,
                          dont_filter=True)

        if 'reviews' in response.url:
            page_rel = "?page=%s" % self.reviews_page_num
            new_url = urljoin(response.url, page_rel)
            self.reviews_page_num += 1
            print "Sending request for %s" % new_url

            yield Request(url=new_url, callback=self.parse_index,
                          dont_filter=True)
        elif 'news' in response.url:
            page_rel = "?page=%s" % self.news_page_num
            new_url = urljoin(response.url, page_rel)
            self.news_page_num += 1
            print "Sending request for %s" % new_url

            yield Request(url=new_url, callback=self.parse_index,
                          dont_filter=True)
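
One note on the CloseSpider guard above: it fires as soon as a listing page comes back with no matching links, but Scrapy closes the spider gracefully rather than instantly, so requests already in flight are typically still processed. A minimal sketch of the idea, with a hypothetical helper name:

from scrapy.exceptions import CloseSpider


def stop_if_empty(links):
    # Raising CloseSpider inside a callback asks the engine to close the spider:
    # no new requests get scheduled, but requests already in flight still finish.
    if not links:
        raise CloseSpider("No results remaining.")
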
Example #3
    def parse(self, response): 
        sel = Selector(response)

        cats = sel.xpath("//ul[@id='main-menu']//li/a")

        for cat in cats:
            nsil = NewsScraperItemLoader(selector=cat)
            url = nsil.get_xpath(".//@href", Join())
            category = nsil.get_xpath(".//text()", Join())

            yield Request(url=url, callback=self.parse_articles,
                meta={'category': category})
Example #4
    def parse(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)
        link_xpath = ["//nav[@class='nav-links']//li/a[contains(., 'News')]/@href",
                      "//nav[@class='nav-links']//li/a[contains(., 'Reviews')]/@href" ]

        rels = nsil.get_xpath(link_xpath)

        for rel in rels:
            new_url = urljoin(response.url, rel)

            print(new_url)

            yield Request(url=new_url, callback=self.parse_index, dont_filter=True)
Example #5
    def parse(self, response):
        sel = Selector(response)
        headlines = sel.xpath("//h2[@itemprop='headline']/a")
        next_button = sel.xpath("//ul[@class='post-pagination']/li[@class='older']/a")

        for line in headlines:
            nsil = NewsScraperItemLoader(selector=line)
            yield Request(url=nsil.get_xpath(".//@href", Join()), 
                callback=self.parse_article)

        if next_button:
            nsil = NewsScraperItemLoader(selector=next_button)
            yield Request(url=urljoin(response.url, nsil.get_xpath(".//@href", Join())),
                callback=self.parse, dont_filter=True)
Example #6
    def parse(self, response):
        # grab all the menu links
        sel = Selector(response)
        # IIRC the response has a built-in selector now?
        # need to try that later.
        menu_links = sel.xpath("//div[@id='main-menu']/div[@id='sections']//a")

        for link in menu_links:
            nsil = NewsScraperItemLoader(selector=link)
            # so the default input/output processor doesn't 
            # fire when you use get_xpath.
            anchor = nsil.get_xpath(".//@href", Join()) 
            category_name = nsil.get_xpath(".//text()", NormalizedJoin())
            yield Request(url=urljoin(response.url, anchor), callback=self.parse_articles,
                meta={"category": category_name})
Example #7
    def parse(self, response):
        links = response.xpath("//div[@data-post-type='post']//h3/a")

        if links:
            for link in links:
                nsil = NewsScraperItemLoader(selector=link)
                new_url = nsil.get_xpath(".//@href", Join())

                yield Request(url=new_url, callback=self.parse_article)

            # pagination must be done this way since there is no way to tell
            # in advance how many pages of news there actually are.
            self.page_num += 1
            raw_url = "http://www.digitaltrends.com/page/%s" % self.page_num

            yield Request(url=raw_url, callback=self.parse, dont_filter=True)
Example #8
    def parse(self, response):
        links = response.xpath("//div[@class='post-inner']//h2/a")
        next_button = response.xpath("//div[@class='navigations']//a[contains(text(), 'Older')]")

        for link in links:
            nsil = NewsScraperItemLoader(selector=link)
            new_url = nsil.get_xpath(".//@href", Join())

            yield Request(url=new_url, callback=self.parse_article)

        if next_button:
            nsil = NewsScraperItemLoader(selector=next_button)
            new_url = nsil.get_xpath("//div[@class='navigations']//a[contains(text(), 'Older')]", Join())

            yield Request(url=new_url, callback=self.parse,
                          dont_filter=True)
Example #9
    def parse(self, response):
        sel = Selector(response)

        links = sel.xpath("//a[@class='read-more']")
        next_button = sel.xpath("//li[@class='next']//a")

        for link in links:
            nsil = NewsScraperItemLoader(selector=link)
            new_url = nsil.get_xpath(".//@href", Join())

            yield Request(url=new_url, callback=self.parse_article)

        if next_button:
            nsil = NewsScraperItemLoader(selector=next_button)
            new_url = urljoin(response.url, nsil.get_xpath(".//@href", Join()))

            yield Request(url=new_url, callback=self.parse, dont_filter=True)
Example #10
    def parse(self, response):
        links = response.xpath("//div[@class='blog_post']//h1//a")
        next_page = response.xpath("//div[@class='olderposts']//a")

        for link in links:
            nsil = NewsScraperItemLoader(selector=link)

            new_url = nsil.get_xpath(".//@href", Join())

            yield Request(url=new_url, callback=self.parse_article)

        if next_page:
            nsil = NewsScraperItemLoader(selector=next_page)
            new_url = nsil.get_xpath(".//@href", Join())

            yield Request(url=new_url, callback=self.parse,
                dont_filter=True)
Example #11
    def parse(self, response):
        articles = response.xpath("//article/a")
        next_button = response.xpath("//div[@class='load-articles']/button")

        for art in articles:
            nsil = NewsScraperItemLoader(selector=art)
            anchor = nsil.get_xpath(".//@href", Join())
            new_url = urljoin(response.url, anchor)

            yield Request(url=new_url, callback=self.parse_article)

        if next_button:
            nsil = NewsScraperItemLoader(selector=next_button)
            anchor = nsil.get_xpath(".//@data-next", Join())
            new_url = urljoin(response.url, anchor)

            yield Request(url=new_url, callback=self.parse_index,
                          dont_filter=True)
Example #12
    def parse_articles(self, response):
        sel = Selector(response)

        articles = sel.xpath("//div[@class='stories']//ul/li//a")
        older_posts = sel.xpath("//a[@class='btn-prev']")

        for art in articles:
            nsil = NewsScraperItemLoader(selector=art)
            url = nsil.get_xpath(".//@href", Join())

            yield Request(url=url, callback=self.parse_body,
                meta={'category': response.meta["category"]})

        if older_posts:
            nsil = NewsScraperItemLoader(selector=older_posts)
            url = nsil.get_xpath(".//@href", Join())

            yield Request(url=url, callback=self.parse, dont_filter=True)
Example #13
    def parse(self, response):
        stuff = response.xpath("//div[@class='post-info']//h4//a")
        next_page = response.xpath("//div[@class='paginate_right']//a")

        for post in stuff:
            nsil = NewsScraperItemLoader(selector=post)
            anchor = nsil.get_xpath(".//@href", Join())
            author = nsil.get_xpath("./../../div[@id='subtitle']/small/a/text()", Join())
            new_url = urljoin(response.url, anchor)

            yield Request(url=new_url, callback=self.parse_article,
                meta={'author': author})

        if next_page:
            nsil = NewsScraperItemLoader(selector=next_page)
            anchor = nsil.get_xpath(".//@href", Join())
            new_url = urljoin(response.url, anchor)

            yield Request(url=new_url, callback=self.parse,
                dont_filter=True)
Example #14
    def parse(self, response):
        review_links = response.xpath("//ul[@class='global_list reviews']//li//h2/a")
        # keep only the first 'Next Page' link; the pagination block usually
        # appears at both the top and the bottom of the page.
        next_button = response.xpath("//div[contains(@class, 'gr_pagination')]//a[@title='Next Page']")[:1]

        for link in review_links:
            nsil = NewsScraperItemLoader(selector=link)
            anchor = nsil.get_xpath(".//@href", Join())
            new_url = urljoin(response.url, anchor)

            yield Request(url=new_url, callback=self.parse_review,
                dont_filter=True, meta=response.meta)

        if next_button:
            nsil = NewsScraperItemLoader(selector=next_button)
            anchor = nsil.get_xpath(".//@href", Join())
            new_url = urljoin(response.url, anchor)

            yield Request(url=new_url, callback=self.parse,
                dont_filter=True, meta=response.meta)
Example #15
    def parse_articles(self, response):
        sel = Selector(response)
        # get articles, yield links to articles
        # get next page links, yield links to next pages.
        article_links = sel.xpath("//h1[@class='heading']/a")

        for link in article_links:
            nsil = NewsScraperItemLoader(selector=link)
            title = nsil.get_xpath("./text()")

            yield Request(url=nsil.get_xpath("./@href", Join()), 
                callback=self.parse_page,
                meta=response.meta)

        # this block of code checks to see if there is an 'older posts'
        # page. articles are listed newest first on arstechnica.
        older_posts = sel.xpath("//table//td[contains(@class,'older')]/a")
        if older_posts:
            nsil = NewsScraperItemLoader(selector=older_posts)
            anchor = nsil.get_xpath("./@href", Join())
            yield Request(url=urljoin(response.url, anchor), 
                callback=self.parse_articles,
                meta=response.meta,
                dont_filter=True)
Example #16
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_xpath("headline", "//h1//text()")
        nsil.add_xpath("author", "//h2//text()", re=r"By: (.*)")
        nsil.add_xpath("date_published", "//h3//text()", re=r"On: (.*)")
        nsil.add_xpath("body", "//div[@class='blog_post']//p", Declutter())
        nsil.add_value("publication", "TIGSource")

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #17
    def parse_page(self, response):
        sel = Selector(response)
        nsil = NewsScraperItemLoader(selector=sel)

        # data pre-processing
        author_raw = nsil.get_xpath("//p[contains(@itemprop, 'author')]//text()",
            NormalizedJoin())

        date_raw = nsil.get_xpath("//p[contains(@itemprop, 'author')]//text()",
            NormalizedJoin())

        author = "".join(AUTHOR_RE.findall(author_raw))
        date = "".join(DATE_RE.findall(date_raw))

        # article data first
        nsil.add_xpath("headline", "//header/h1[@class='heading']//text()", Join())
        nsil.add_value("publication", "Ars Technica")
        nsil.add_value("date_published", date)
        nsil.add_value("category", response.meta["category"])
        nsil.add_value("author", author)
        nsil.add_xpath("body", "//div[@itemprop='articleBody']//text()", NormalizedJoin())

        # metadata
        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
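
AUTHOR_RE and DATE_RE are module-level regexes that are not shown in this listing. Both findall calls run over the same joined byline text, so the patterns presumably pull the author name and the date out of a single byline string. A purely hypothetical sketch, assuming a "by Author - date" byline shape:

import re

# Hypothetical patterns; the real AUTHOR_RE / DATE_RE live at module level in
# the spider and are not included here.
AUTHOR_RE = re.compile(r"by\s+(.+?)\s+-", re.IGNORECASE)
DATE_RE = re.compile(r"-\s+(.+)$")
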
Example #18
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_xpath("headline", "//div[@class='headline']//h1[@class='title']//text()")
        nsil.add_xpath("date_published", "//div[@class='date']//text()")
        nsil.add_value("publication", "Killscreen Daily")
        nsil.add_xpath("author", "//div[@class='author']//a/text()")
        nsil.add_xpath("body", "//div[@class='article-content']//p", Declutter())

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #19
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_xpath("headline", "//header/h1/text()", NormalizedJoin())
        nsil.add_value("publication", "Eurogamer")
        nsil.add_xpath("author", "//p[@class='byline']//a[contains(@href, 'author')]/text()")
        nsil.add_xpath("date_published", "//p[@class='byline']//span[@itemprop='datePublished']")
        nsil.add_xpath("body", "//article/section/p", Declutter())

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #20
    def parse_article(self, response):
        sel = Selector(response)
        nsil = NewsScraperItemLoader(selector=sel)

        nsil.add_xpath("headline", "//h1[contains(@class, 'tweet-title')]//text()")
        nsil.add_value("publication", "TechCrunch")
        nsil.add_xpath("author", "//a[@rel='author']//text()", Join())
        nsil.add_xpath("body", "//div[contains(@class, 'article-entry')]//p//text()", Join())

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #21
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_xpath("headline", "//div[@class='post-inner']//h2//text()")
        nsil.add_xpath("author", "//div[@class='entry']//a[contains(@href, 'mailto')]//text()")
        nsil.add_xpath("date_published", "//div[@class='entry']//aside/p/text()[2]", re=r"on (.*)")
        nsil.add_value("publication", "RockPaperShotgun")
        nsil.add_xpath("body", "//div[@class='entry']/p", Declutter())

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #22
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_xpath("headline", "//header/h1[@class='title']//text()", NormalizedJoin())
        nsil.add_xpath("author", "//span[@class='vcard']//a/text()", NormalizedJoin())
        nsil.add_xpath("date_published", "//span[@class='value-title']/time//text()", NormalizedJoin())
        nsil.add_value("publication", "DigitalTrends")
        nsil.add_xpath("body", "//article[contains(@class, 'm-content')]//p", Declutter(), NormalizedJoin())

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #23
    def parse_body(self, response):
        sel = Selector(response)
        nsil = NewsScraperItemLoader(selector=sel)

        nsil.add_xpath("headline", "//div[@class='title']//h2/text()")
        nsil.add_value("publication", "ExtremeTech")
        nsil.add_xpath("date_published", "//div[@class='title']//span[contains(@class, 'by vcard')]//text()", re=r"on (.*)")
        nsil.add_xpath("author", "//div[@class='title']//span[contains(@class, 'by vcard')]//text()", re=r"(.*) on")
        nsil.add_xpath("body", "//div[@class='content']//p//text()")

        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)
        nsil.add_value("date_scraped", str(datetime.datetime.now()))

        yield nsil.load_item()
Example #24
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        # templates across PCG site are inconsistent.
        # this is what I've been able to figure out by
        # analyzing samples of the results pulled.
        body_xpath = ["//div[@class='body']//p/.",
                      "//div[@class='section-wrap']//div[@class='textcomponent']//p",
                      "//div[@class='gallery_desc']//p",]

        author_xpath = ["//h3[@class='author']//text()",
                        "//div[@class='review_header']//h3//text()"]

        date_published_xpath = ["//span[@class='localized byline']//text()",
                                "//div[@class='review_header']//span[@class='localized']//text()"]

        nsil.add_xpath("headline", "//h1//text()", NormalizedJoin())
        nsil.add_value("publication", "PC Gamer")
        nsil.add_value("author", self.select_first_xpath(author_xpath, nsil))
        nsil.add_value("body", self.select_first_xpath(body_xpath, nsil, Declutter()))
        nsil.add_value("date_published", self.select_first_xpath(date_published_xpath, nsil))

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
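
select_first_xpath() is a spider helper that does not appear in this listing. Judging by the calls above, it tries each XPath in the list in order and returns the first result that yields data, running any extra processors it is given. A minimal sketch of that assumed behaviour:

    # Hypothetical helper; a sketch of the behaviour implied by the calls above,
    # not the spider's actual implementation.
    def select_first_xpath(self, xpaths, loader, *processors):
        """Return data from the first XPath in the list that matches anything."""
        for xpath in xpaths:
            data = loader.get_xpath(xpath, *processors)
            if data:
                return data
        return None
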
Example #25
    def parse_article(self, response):
        sel = Selector(response)
        nsil = NewsScraperItemLoader(selector=sel)

        # article data
        nsil.add_xpath("headline", "//h1[@itemprop='headline']//text()", Join())
        nsil.add_value("publication", "Engadget")
        nsil.add_xpath("date_published", "//span[@class='timeago']/@datetime", Join())
        nsil.add_xpath("author", "//strong[@itemprop='author']//text()")
        nsil.add_xpath("body", "//p[@class='read-more']//preceding-sibling::p", Declutter())
        nsil.add_xpath("category", "//strong[contains(text(), 'ags')]//following-sibling::span/a[1]/text()", Join())

        nsil.add_xpath("source_article_name", "//strong[contains(text(), 'ource')]/following-sibling::a[1]/text()", Join())
        nsil.add_xpath("source_article_link", "//strong[contains(text(), 'ource')]/following-sibling::a[1]/@href", Join())

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #26
    def parse_review(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_xpath("headline", "//h1[@itemprop='name']//text()")
        nsil.add_xpath("author", "//div[@class='review_header']//div[@class='byline']/a[@rel='author']//text()")
        nsil.add_xpath("date_published", "//div[@class='review_header']//div[@class='byline']/text()", re=r"on (.*)")
        nsil.add_xpath("publication", "GamesRadar")
        nsil.add_xpath("body", "//div[contains(@class, 'grArticleBody_contents')]//p/text()")

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #27
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_value("author", response.meta['author'])
        nsil.add_value("publication", "Destructoid")
        nsil.add_xpath("headline", "//h3[@class='fancy-title']//text()")
        nsil.add_xpath("date_published", ["//h6[1]/b/text()", "//h6[1]/text()"], NormalizedJoin())
        nsil.add_xpath("body", "//div[@id='desktoppadding']//p", Declutter())

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
Example #28
    def parse_article(self, response):
        nsil = NewsScraperItemLoader(selector=response.selector)

        #   nsil.add_xpath("headline", "//h1[@class='headline']//text()")
        nsil.add_xpath("author", "//div[@class='author-description']//h5/a[contains(@href, 'profile')]//text()")
        nsil.add_xpath("date_published", "//span[@class='featureDate']//text()")
        nsil.add_value("publication", "GamesRadar")
        nsil.add_xpath("body", "//div[contains(@class, 'grArticleBody_contents')]//p/text()")

        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()