def parse(self, response):
    # The headlines have to be obtained via three different xpaths.
    head1 = response.xpath("//ul[@class='headlines medium']//div//h2[@class='title']//a")
    head2 = response.xpath("//ul[contains(@class, 'headlines large')]//div//h2[@class='title']//a")
    head3 = response.xpath("//ul[contains(@class, 'headlines small')]//div//h2[@class='title']//a")
    # For ease of use, all three lists are combined into one large list
    # that we can iterate over.
    headlines = head1 + head2 + head3
    if headlines:
        for head in headlines:
            nsil = NewsScraperItemLoader(selector=head)
            anchor = nsil.get_xpath("./@href", Join())
            new_url = urljoin(response.url, anchor)
            yield Request(url=new_url, callback=self.parse_article)
        if 'news' in response.url:
            next_anchor = "/ajax.php?action=frontpage&page=%s&platform=&type=news&topic=" % self.page
        elif 'reviews' in response.url:
            next_anchor = "/ajax.php?action=frontpage&page=%s&platform=&type=reviews&topic=" % self.page
        else:
            # Neither section matched; without this guard, next_anchor
            # would be unbound below.
            return
        self.page += 1
        next_page = urljoin(response.url, next_anchor)
        yield Request(url=next_page, callback=self.parse, dont_filter=True)
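# The three selections above could also be collapsed into a single query
# with the XPath union operator '|'; a sketch of the equivalent call:
#
#   headlines = response.xpath(
#       "//ul[@class='headlines medium']//div//h2[@class='title']//a"
#       " | //ul[contains(@class, 'headlines large')]//div//h2[@class='title']//a"
#       " | //ul[contains(@class, 'headlines small')]//div//h2[@class='title']//a")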
def parse_index(self, response):
    # News and reviews only, for now.
    links = response.xpath("//div[@class='headline-wrap']//div[@class='headline']//h3/parent::a")
    # This should fire at the end of the run, when there are no links left.
    if not links:
        raise CloseSpider("No results remaining.")
    for link in links:
        nsil = NewsScraperItemLoader(selector=link)
        rel = nsil.get_xpath(".//@href", Join())
        new_url = urljoin(response.url, rel)
        yield Request(url=new_url, callback=self.parse_article, dont_filter=True)
    if 'reviews' in response.url:
        page_rel = "?page=%s" % self.reviews_page_num
        self.reviews_page_num += 1
    elif 'news' in response.url:
        page_rel = "?page=%s" % self.news_page_num
        self.news_page_num += 1
    else:
        return
    new_url = urljoin(response.url, page_rel)
    print("Sending request for %s" % new_url)
    yield Request(url=new_url, callback=self.parse_index, dont_filter=True)
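# parse_index reads and increments self.news_page_num and
# self.reviews_page_num, which have to exist before the first call. A
# minimal sketch of the class attributes this assumes (the spider class
# name here is hypothetical, and starting at page 1 is an assumption):
import scrapy

class NewsIndexSpider(scrapy.Spider):  # hypothetical name
    name = "news_index"
    news_page_num = 1
    reviews_page_num = 1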
def parse(self, response):
    sel = Selector(response)
    cats = sel.xpath("//ul[@id='main-menu']//li/a")
    for cat in cats:
        nsil = NewsScraperItemLoader(selector=cat)
        url = nsil.get_xpath(".//@href", Join())
        category = nsil.get_xpath(".//text()", Join())
        yield Request(url=url, callback=self.parse_articles, meta={'category': category})
def parse(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    link_xpath = [
        "//nav[@class='nav-links']//li/a[contains(., 'News')]/@href",
        "//nav[@class='nav-links']//li/a[contains(., 'Reviews')]/@href",
    ]
    rels = nsil.get_xpath(link_xpath)
    for rel in rels:
        new_url = urljoin(response.url, rel)
        print(new_url)
        yield Request(url=new_url, callback=self.parse_index, dont_filter=True)
def parse(self, response):
    sel = Selector(response)
    headlines = sel.xpath("//h2[@itemprop='headline']/a")
    next_button = sel.xpath("//ul[@class='post-pagination']/li[@class='older']/a")
    for line in headlines:
        nsil = NewsScraperItemLoader(selector=line)
        yield Request(url=nsil.get_xpath(".//@href", Join()), callback=self.parse_article)
    if next_button:
        nsil = NewsScraperItemLoader(selector=next_button)
        yield Request(url=urljoin(response.url, nsil.get_xpath(".//@href", Join())),
                      callback=self.parse, dont_filter=True)
def parse(self, response):
    # Grab all the menu links.
    # IIRC the response has a built-in selector now? Need to try that later.
    sel = Selector(response)
    menu_links = sel.xpath("//div[@id='main-menu']/div[@id='sections']//a")
    for link in menu_links:
        nsil = NewsScraperItemLoader(selector=link)
        # Note that a field's default input/output processors don't fire
        # when you use get_xpath; only the processors passed here apply.
        anchor = nsil.get_xpath(".//@href", Join())
        category_name = nsil.get_xpath(".//text()", NormalizedJoin())
        yield Request(url=urljoin(response.url, anchor),
                      callback=self.parse_articles,
                      meta={"category": category_name})
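# Answering the comment above: Scrapy responses do expose a cached
# .selector attribute, and response.xpath(...) is a shortcut for it, so
# the explicit Selector(response) could be dropped:
#
#   menu_links = response.xpath("//div[@id='main-menu']/div[@id='sections']//a")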
def parse(self, response): links = response.xpath("//div[@data-post-type='post']//h3/a") if links: for link in links: nsil = NewsScraperItemLoader(selector=link) new_url = nsil.get_xpath(".//@href", Join()) yield Request(url=new_url, callback=self.parse_article) # pagination must be done this way since there is no way to tell # in advance how many pages of news there actually are. self.page_num += 1 raw_url = "http://www.digitaltrends.com/page/%s" % self.page_num yield Request(url=raw_url, callback=self.parse, dont_filter=True)
def parse(self, response): links = response.xpath("//div[@class='post-inner']//h2/a") next_button = None for link in links: nsil = NewsScraperItemLoader(selector=link) new_url = nsil.get_xpath(".//@href", Join()) yield Request(url=new_url, callback=self.parse_article) if next_button: nsil = NewsScraperItemLoader(selector=next_button) new_url = nsil.get_xpath("//div[@class='navigations']//a[contains(text(), 'Older')]", Join()) yield Request(url=new_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    sel = Selector(response)
    links = sel.xpath("//a[@class='read-more']")
    next_button = sel.xpath("//li[@class='next']//a")
    for link in links:
        nsil = NewsScraperItemLoader(selector=link)
        new_url = nsil.get_xpath(".//@href", Join())
        yield Request(url=new_url, callback=self.parse_article)
    if next_button:
        nsil = NewsScraperItemLoader(selector=next_button)
        new_url = urljoin(response.url, nsil.get_xpath(".//@href", Join()))
        yield Request(url=new_url, callback=self.parse, dont_filter=True)
def parse(self, response): links = response.xpath("//div[@class='blog_post']//h1//a") next_page = response.xpath("//div[@class='olderposts']//a") for link in links: nsil = NewsScraperItemLoader(selector=link) new_url = nsil.get_xpath(".//@href", Join()) yield Request(url=new_url, callback=self.parse_article) if next_page: nsil = NewsScraperItemLoader(selector=next_page) new_url = nsil.get_xpath(".//@href", Join()) yield Request(url=new_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    articles = response.xpath("//article/a")
    next_button = response.xpath("//div[@class='load-articles']/button")
    for art in articles:
        nsil = NewsScraperItemLoader(selector=art)
        anchor = nsil.get_xpath(".//@href", Join())
        new_url = urljoin(response.url, anchor)
        yield Request(url=new_url, callback=self.parse_article)
    if next_button:
        nsil = NewsScraperItemLoader(selector=next_button)
        anchor = nsil.get_xpath(".//@data-next", Join())
        new_url = urljoin(response.url, anchor)
        yield Request(url=new_url, callback=self.parse_index, dont_filter=True)
def parse_articles(self, response):
    sel = Selector(response)
    articles = sel.xpath("//div[@class='stories']//ul/li//a")
    older_posts = sel.xpath("//a[@class='btn-prev']")
    for art in articles:
        nsil = NewsScraperItemLoader(selector=art)
        url = nsil.get_xpath(".//@href", Join())
        yield Request(url=url, callback=self.parse_body,
                      meta={'category': response.meta["category"]})
    if older_posts:
        nsil = NewsScraperItemLoader(selector=older_posts)
        url = nsil.get_xpath(".//@href", Join())
        # Continue paging within the current category, keeping its meta.
        yield Request(url=url, callback=self.parse_articles,
                      meta={'category': response.meta["category"]},
                      dont_filter=True)
def parse(self, response):
    stuff = response.xpath("//div[@class='post-info']//h4//a")
    next_page = response.xpath("//div[@class='paginate_right']//a")
    for post in stuff:
        nsil = NewsScraperItemLoader(selector=post)
        anchor = nsil.get_xpath(".//@href", Join())
        # The author lives in a sibling subtree, two levels up from the link.
        author = nsil.get_xpath("./../../div[@id='subtitle']/small/a/text()", Join())
        new_url = urljoin(response.url, anchor)
        yield Request(url=new_url, callback=self.parse_article, meta={'author': author})
    if next_page:
        nsil = NewsScraperItemLoader(selector=next_page)
        anchor = nsil.get_xpath(".//@href", Join())
        new_url = urljoin(response.url, anchor)
        yield Request(url=new_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    review_links = response.xpath("//ul[@class='global_list reviews']//li//h2/a")
    # [:1] keeps only the first 'Next Page' link.
    next_button = response.xpath("//div[contains(@class, 'gr_pagination')]//a[@title='Next Page']")[:1]
    for link in review_links:
        nsil = NewsScraperItemLoader(selector=link)
        anchor = nsil.get_xpath(".//@href", Join())
        new_url = urljoin(response.url, anchor)
        yield Request(url=new_url, callback=self.parse_review, dont_filter=True, meta=response.meta)
    if next_button:
        nsil = NewsScraperItemLoader(selector=next_button)
        anchor = nsil.get_xpath(".//@href", Join())
        new_url = urljoin(response.url, anchor)
        yield Request(url=new_url, callback=self.parse, dont_filter=True, meta=response.meta)
def parse_articles(self, response):
    sel = Selector(response)
    # Get article links and yield requests for them, then do the same for
    # the next-page link.
    article_links = sel.xpath("//h1[@class='heading']/a")
    for link in article_links:
        nsil = NewsScraperItemLoader(selector=link)
        yield Request(url=nsil.get_xpath("./@href", Join()),
                      callback=self.parse_page, meta=response.meta)
    # Check whether there is an 'older posts' page; articles are listed
    # newest first on Ars Technica.
    older_posts = sel.xpath("//table//td[contains(@class,'older')]/a")
    if older_posts:
        nsil = NewsScraperItemLoader(selector=older_posts)
        anchor = nsil.get_xpath("./@href", Join())
        yield Request(url=urljoin(response.url, anchor),
                      callback=self.parse_articles, meta=response.meta, dont_filter=True)
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    nsil.add_xpath("headline", "//h1//text()")
    nsil.add_xpath("author", "//h2//text()", re=r"By: (.*)")
    nsil.add_xpath("date_published", "//h3//text()", re=r"On: (.*)")
    nsil.add_xpath("body", "//div[@class='blog_post']//p", Declutter())
    nsil.add_value("publication", "TIGSource")
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
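# Declutter and NormalizedJoin are custom loader processors used throughout
# these spiders; neither is defined in this section. A minimal sketch of
# what they might look like, assuming Declutter strips markup from the
# selected paragraph nodes and NormalizedJoin collapses runs of whitespace
# (both bodies are assumptions, not taken from the source):
from w3lib.html import remove_tags


class NormalizedJoin(object):
    """Join extracted strings, then collapse all internal whitespace."""

    def __init__(self, separator=u" "):
        self.separator = separator

    def __call__(self, values):
        return u" ".join(self.separator.join(values).split())


class Declutter(object):
    """Strip tags from serialized <p> nodes and join them into clean text."""

    def __call__(self, values):
        cleaned = [remove_tags(v).strip() for v in values]
        return u"\n".join(p for p in cleaned if p)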
def parse_page(self, response):
    sel = Selector(response)
    nsil = NewsScraperItemLoader(selector=sel)
    # Data pre-processing: the author and the date both live in the same
    # byline paragraph, so pull it once and split it apart with regexes.
    byline_raw = nsil.get_xpath("//p[contains(@itemprop, 'author')]//text()", NormalizedJoin())
    author = "".join(AUTHOR_RE.findall(byline_raw))
    date = "".join(DATE_RE.findall(byline_raw))
    # Article data first.
    nsil.add_xpath("headline", "//header/h1[@class='heading']//text()", Join())
    nsil.add_value("publication", "Ars Technica")
    nsil.add_value("date_published", date)
    nsil.add_value("category", response.meta["category"])
    nsil.add_value("author", author)
    nsil.add_xpath("body", "//div[@itemprop='articleBody']//text()", NormalizedJoin())
    # Metadata.
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
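# AUTHOR_RE and DATE_RE are module-level regexes that are not shown in this
# section. A plausible sketch, assuming the Ars Technica byline reads
# something like "by Author Name - Jan 1, 2015 4:00pm"; the exact patterns
# are an assumption, not taken from the source:
import re

AUTHOR_RE = re.compile(r"by\s+(.+?)\s+-", re.IGNORECASE)  # hypothetical
DATE_RE = re.compile(r"-\s+(.+)$")  # hypothetical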
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    nsil.add_xpath("headline", "//div[@class='headline']//h1[@class='title']//text()")
    nsil.add_xpath("date_published", "//div[@class='date']//text()")
    nsil.add_value("publication", "Killscreen Daily")
    nsil.add_xpath("author", "//div[@class='author']//a/text()")
    nsil.add_xpath("body", "//div[@class='article-content']//p", Declutter())
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    nsil.add_xpath("headline", "//header/h1/text()", NormalizedJoin())
    nsil.add_value("publication", "Eurogamer")
    nsil.add_xpath("author", "//p[@class='byline']//a[contains(@href, 'author')]/text()")
    nsil.add_xpath("date_published", "//p[@class='byline']//span[@itemprop='datePublished']//text()")
    nsil.add_xpath("body", "//article/section/p", Declutter())
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_article(self, response):
    sel = Selector(response)
    nsil = NewsScraperItemLoader(selector=sel)
    nsil.add_xpath("headline", "//h1[contains(@class, 'tweet-title')]//text()")
    nsil.add_value("publication", "TechCrunch")
    nsil.add_xpath("author", "//a[@rel='author']//text()", Join())
    nsil.add_xpath("body", "//div[contains(@class, 'article-entry')]//p//text()", Join())
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    nsil.add_xpath("headline", "//div[@class='post-inner']//h2//text()")
    nsil.add_xpath("author", "//div[@class='entry']//a[contains(@href, 'mailto')]//text()")
    nsil.add_xpath("date_published", "//div[@class='entry']//aside/p/text()[2]", re=r"on (.*)")
    nsil.add_value("publication", "RockPaperShotgun")
    nsil.add_xpath("body", "//div[@class='entry']/p", Declutter())
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    nsil.add_xpath("headline", "//header/h1[@class='title']//text()", NormalizedJoin())
    nsil.add_xpath("author", "//span[@class='vcard']//a/text()", NormalizedJoin())
    nsil.add_xpath("date_published", "//span[@class='value-title']/time//text()", NormalizedJoin())
    nsil.add_value("publication", "DigitalTrends")
    nsil.add_xpath("body", "//article[contains(@class, 'm-content')]//p", Declutter(), NormalizedJoin())
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_body(self, response):
    sel = Selector(response)
    nsil = NewsScraperItemLoader(selector=sel)
    nsil.add_xpath("headline", "//div[@class='title']//h2/text()")
    nsil.add_value("publication", "ExtremeTech")
    # The byline reads "Author on Date"; the two regexes split it apart.
    nsil.add_xpath("date_published", "//div[@class='title']//span[contains(@class, 'by vcard')]//text()", re=r"on (.*)")
    nsil.add_xpath("author", "//div[@class='title']//span[contains(@class, 'by vcard')]//text()", re=r"(.*) on")
    nsil.add_xpath("body", "//div[@class='content']//p//text()")
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    yield nsil.load_item()
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    # Templates across the PC Gamer site are inconsistent. These are the
    # variants I've been able to identify by analyzing samples of the
    # results pulled.
    body_xpath = [
        "//div[@class='body']//p",
        "//div[@class='section-wrap']//div[@class='textcomponent']//p",
        "//div[@class='gallery_desc']//p",
    ]
    author_xpath = [
        "//h3[@class='author']//text()",
        "//div[@class='review_header']//h3//text()",
    ]
    date_published_xpath = [
        "//span[@class='localized byline']//text()",
        "//div[@class='review_header']//span[@class='localized']//text()",
    ]
    nsil.add_xpath("headline", "//h1//text()", NormalizedJoin())
    nsil.add_value("publication", "PC Gamer")
    nsil.add_value("author", self.select_first_xpath(author_xpath, nsil))
    nsil.add_value("body", self.select_first_xpath(body_xpath, nsil, Declutter()))
    nsil.add_value("date_published", self.select_first_xpath(date_published_xpath, nsil))
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
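# select_first_xpath is referenced above but not defined in this section.
# A minimal sketch of what a method like it might look like on the spider,
# assuming it tries each candidate xpath in order and returns the first
# non-empty result (the signature is inferred from the call sites above;
# the body is an assumption):
def select_first_xpath(self, xpaths, loader, *processors):
    for xpath in xpaths:
        value = loader.get_xpath(xpath, *processors)
        if value:
            return value
    return None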
def parse_article(self, response):
    sel = Selector(response)
    nsil = NewsScraperItemLoader(selector=sel)
    # Article data.
    nsil.add_xpath("headline", "//h1[@itemprop='headline']//text()", Join())
    nsil.add_value("publication", "Engadget")
    nsil.add_xpath("date_published", "//span[@class='timeago']/@datetime", Join())
    nsil.add_xpath("author", "//strong[@itemprop='author']//text()")
    # The body is every paragraph preceding the 'read more' marker.
    nsil.add_xpath("body", "//p[@class='read-more']//preceding-sibling::p", Declutter())
    # 'ags'/'ource' match 'Tags'/'Source' regardless of capitalization.
    nsil.add_xpath("category", "//strong[contains(text(), 'ags')]//following-sibling::span/a[1]/text()", Join())
    nsil.add_xpath("source_article_name", "//strong[contains(text(), 'ource')]/following-sibling::a[1]/text()", Join())
    nsil.add_xpath("source_article_link", "//strong[contains(text(), 'ource')]/following-sibling::a[1]/@href", Join())
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_review(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    nsil.add_xpath("headline", "//h1[@itemprop='name']//text()")
    nsil.add_xpath("author", "//div[@class='review_header']//div[@class='byline']/a[@rel='author']//text()")
    nsil.add_xpath("date_published", "//div[@class='review_header']//div[@class='byline']/text()", re=r"on (.*)")
    nsil.add_value("publication", "GamesRadar")
    nsil.add_xpath("body", "//div[contains(@class, 'grArticleBody_contents')]//p/text()")
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    nsil.add_value("author", response.meta['author'])
    nsil.add_value("publication", "Destructoid")
    nsil.add_xpath("headline", "//h3[@class='fancy-title']//text()")
    nsil.add_xpath("date_published", ["//h6[1]/b/text()", "//h6[1]/text()"], NormalizedJoin())
    nsil.add_xpath("body", "//div[@id='desktoppadding']//p", Declutter())
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
def parse_article(self, response):
    nsil = NewsScraperItemLoader(selector=response.selector)
    # nsil.add_xpath("headline", "//h1[@class='headline']//text()")
    nsil.add_xpath("author", "//div[@class='author-description']//h5/a[contains(@href, 'profile')]//text()")
    nsil.add_xpath("date_published", "//span[@class='featureDate']//text()")
    nsil.add_value("publication", "GamesRadar")
    nsil.add_xpath("body", "//div[contains(@class, 'grArticleBody_contents')]//p/text()")
    nsil.add_value("date_scraped", str(datetime.datetime.now()))
    nsil.add_value("scraped_by", self.name)
    nsil.add_value("scraped_from", response.url)
    yield nsil.load_item()
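# NewsScraperItemLoader and the item it loads are shared by all of these
# spiders but are not defined in this section. A minimal sketch, assuming
# Scrapy 1.x import paths and one Field per key used above; the
# default_output_processor here is a guess, not taken from the source:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class NewsScraperItem(scrapy.Item):
    headline = scrapy.Field()
    author = scrapy.Field()
    date_published = scrapy.Field()
    body = scrapy.Field()
    publication = scrapy.Field()
    category = scrapy.Field()
    source_article_name = scrapy.Field()
    source_article_link = scrapy.Field()
    date_scraped = scrapy.Field()
    scraped_by = scrapy.Field()
    scraped_from = scrapy.Field()


class NewsScraperItemLoader(ItemLoader):
    default_item_class = NewsScraperItem
    default_output_processor = TakeFirst()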