def extract_comments(self, sel):
    """Return the merged text of every comment on the page.

    Each <section class="comment-content comment"> node yields one
    entry: the merged ./p/text() fragments of that comment.
    """
    # isinstance instead of ``type(...) ==``: also accepts Selector
    # subclasses and is the idiomatic type check.
    assert isinstance(sel, Selector)
    raw_comments = sel.xpath('//section[@class="comment-content comment"]')
    return [mergeListElements(comment.xpath('./p/text()').extract())
            for comment in raw_comments]
def parse_item(self, response):
    """Build a BlogItem from a Vox article page.

    The body is everything preceding the "Reference" heading; the
    references are everything following it.
    """
    sel = Selector(response)

    # Pull the raw header fragments first; the meta <p> holds the date,
    # its <strong> child the author, the sibling <h1> the headline.
    date_raw = mergeListElements(
        sel.xpath('/html/body/div[2]/div/div[2]/p/text()').extract())[0]
    author_raw = safepop(
        sel.xpath('/html/body/div[2]/div/div[2]/p/strong/text()').extract(), 0)
    headline_raw = safepop(
        sel.xpath('/html/body/div[2]/div/div[2]/h1/text()').extract(), 0)

    item = BlogItem()
    item['blog_name'] = "Vox"
    item['url'] = response.url
    item['releasedate'] = dparser.parse(date_raw, fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = self.extract_authors(author_raw)
    item['headline'] = headline_raw.strip()
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/preceding-sibling::*/text()').extract()),
        0)
    item['links'] = sel.xpath('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::*/text()').extract()
    item['references'] = self.extract_references(
        sel.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*'))
    item['comments'] = ""
    item['tags'] = ""
    item["teaser"] = safepop(
        mergeListElements(
            sel.xpath('//div[contains(@class, "teaser")]/descendant-or-self::*/text()').extract()),
        0).strip()

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Build a BlogItem from a Marginal Revolution post."""
    self.log(response.url, level=log.DEBUG)
    sel = Selector(response)

    item = BlogItem()
    item['blog_name'] = "MarginalRevolution"
    item['url'] = response.url

    # The headline meta paragraph carries the post date.
    headline_meta = mergeListElements(
        sel.xpath('//p[@class="headline_meta"]/text()').extract())
    item['releasedate'] = dparser.parse(headline_meta[0], fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(
        sel.xpath('//a[@rel="author"]/text()').extract(), 0)
    item['headline'] = safepop(
        sel.xpath('//h1[@class="entry-title"]/text()').extract(), 0)
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::text()').extract()),
        0)
    item['links'] = sel.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*/a/@href').extract()
    # Tags are the siblings following the author link in the footer.
    item['tags'] = sel.xpath('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()').extract()

    # Fields this layout does not provide.
    item['references'] = ""
    item['comments'] = ""
    item["teaser"] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Build a BlogItem from a post on Brad DeLong's blog (Typepad layout)."""
    self.log(response.url, level=log.DEBUG)
    sel = Selector(response)

    item = BlogItem()
    item['blog_name'] = "Brad DeLong's Grasping Reality..."
    item['url'] = response.url

    # The date lives in the <h2 class="date-header"> above the entry.
    date_header = mergeListElements(
        sel.xpath('//h2[@class="date-header"]/text()').extract())
    item['releasedate'] = dparser.parse(date_header[0], fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(
        sel.xpath('//a[@rel="author"]/text()').extract(), 0)
    item['headline'] = safepop(
        sel.xpath('//h3[@class="entry-header"]/text()').extract(), 0)
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//div[@class="entry-body"]/descendant-or-self::text()').extract()),
        0)
    item['links'] = sel.xpath('//div[@class="entry-body"]/descendant-or-self::a/@href').extract()
    item['tags'] = sel.xpath('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()').extract()

    # Fields this layout does not provide.
    item['references'] = ""
    item['comments'] = ""
    item["teaser"] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Build a BlogItem from an Economist's View post."""
    self.log(response.url, level=log.DEBUG)
    sel = Selector(response)

    item = BlogItem()
    item['blog_name'] = "Economist's View"
    item['url'] = response.url

    date_header = mergeListElements(
        sel.xpath('//h2[@class="date-header"]/text()').extract())
    item['releasedate'] = dparser.parse(date_header[0], fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(
        sel.xpath('//a[@rel="author"]/text()').extract(), 0)
    item['headline'] = safepop(
        sel.xpath('//h3[@class="entry-header"]/a/text()').extract(), 0)
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//div[@class="entry-body"]/descendant-or-self::text()').extract()),
        0)
    item['links'] = sel.xpath('//div[@class="entry-body"]/descendant-or-self::a/@href').extract()
    # The last two entries are always the "permalink" and "comments"
    # links, so they are sliced off.
    item['tags'] = sel.xpath('//p[@class="posted"]/a[@rel="author"]/following-sibling::*/text()').extract()[0:-2]

    # Fields this layout does not provide.
    item['references'] = ""
    item['comments'] = ""
    item["teaser"] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Build a BlogItem from a Zero Hedge article (print view)."""
    self.log(response.url, level=log.INFO)
    sel = Selector(response)

    item = BlogItem()
    item['blog_name'] = "Zero Hedge"
    item['url'] = response.url

    # The "submitted" span ends with the post date; its <a> child names
    # the author.
    submitted = sel.xpath('//span[@class="submitted"]/text()').extract()
    item['releasedate'] = dparser.parse(submitted[-1], fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(
        sel.xpath('//span[@class="submitted"]/a/text()').extract(), 0)
    item['headline'] = safepop(
        sel.xpath('//h1[@class="print-title"]/text()').extract(), 0)
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//div[@class="content"]/descendant-or-self::text()').extract()),
        0)
    item['links'] = self.extract_links(
        sel.xpath('//div[@class="print-links"]/p/text()').extract())
    item['tags'] = sel.xpath('//ul[@class="links"]/li/a/text()').extract()

    # Fields this layout does not provide.
    item['references'] = ""
    item['comments'] = ""
    item["teaser"] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Build a BlogItem from a post on The Big Picture.

    The body and links are everything preceding the parent of the
    category-tag links; comments are pulled from the comment metadata
    siblings.
    """
    self.log(response.url, level=log.DEBUG)
    sel = Selector(response)

    item = BlogItem()
    item['blog_name'] = "The BIG Picture"
    item['url'] = response.url
    item['releasedate'] = dparser.parse(
        sel.xpath('//p[@class="byline"]/text()').extract()[-1], fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(
        sel.xpath('//span[@class="author"]/text()').extract(), 0)
    item['headline'] = safepop(
        sel.xpath('//div[@class="headline"]/h2/a/text()').extract(), 0)
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/text()').extract()),
        0)
    item['links'] = sel.xpath('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/@href').extract()
    item['comments'] = sel.xpath('//div[@class="comment-meta commentmetadata"]/following-sibling::p/text()').extract()
    item['tags'] = sel.xpath('//a[@rel="category tag"]/text()').extract()

    # Fields this layout does not provide.
    item['references'] = ""
    item["teaser"] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Build a BlogItem from a naked capitalism post."""
    self.log(response.url, level=log.DEBUG)
    sel = Selector(response)

    item = BlogItem()
    item['blog_name'] = "naked capitalism"
    item['url'] = response.url
    item['releasedate'] = dparser.parse(
        safepop(sel.xpath('//time[@class="entry-date date updated"]/text()').extract(), 0),
        fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(
        sel.xpath('//a[@rel="author"]/text()').extract(), 0)
    item['headline'] = safepop(
        sel.xpath('//h1[@class="entry-title"]/descendant-or-self::text()').extract(), 0)
    # The printer-friendly content div holds the article text and links.
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//div[@class="pf-content"]/descendant-or-self::text()').extract()),
        0)
    item['links'] = sel.xpath('//div[@class="pf-content"]/descendant-or-self::a/@href').extract()
    item['comments'] = self.extract_comments(sel)
    item['tags'] = sel.xpath('//footer[@class="entry-meta"]/a/text()').extract()

    # Fields this layout does not provide.
    item['references'] = ""
    item["teaser"] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Build a BlogItem from an econbrowser post.

    Body and links are everything in the entry content preceding the
    AddToAny share widget.
    """
    self.log(response.url, level=log.DEBUG)
    sel = Selector(response)

    item = BlogItem()
    item['blog_name'] = "econbrowser"
    item['url'] = response.url
    item['releasedate'] = dparser.parse(
        safepop(sel.xpath('//time[@class="entry-date"]/@datetime').extract(), 0),
        fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(
        sel.xpath('//a[@rel="author"]/text()').extract(), 0)
    item['headline'] = safepop(
        sel.xpath('//h1[@class="entry-title"]/descendant-or-self::text()').extract(), 0)
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::text()').extract()),
        0)
    item['links'] = sel.xpath('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::a/@href').extract()
    item['comments'] = self.extract_comments(sel)

    # Fields this layout does not provide.
    item['references'] = ""
    item['tags'] = ""
    item["teaser"] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse(self, response):
    """Build a BlogItem from a Paul Krugman (NYT) blog post.

    The author is fixed; the canonical URL comes from the data-url
    attribute and the release date is normalized to a naive datetime.
    """
    sel = Selector(response)
    item = BlogItem()
    item['blog_name'] = "The Conscience of a Liberal"
    item['url'] = safepop(
        sel.xpath('//div[@data-url]/attribute::data-url').extract(), 0)
    # Strip tzinfo so the date compares cleanly with the other spiders'
    # naive datetimes.
    item['releasedate'] = dparser.parse(
        safepop(sel.xpath('//time/@datetime').extract(), 0)).replace(tzinfo=None)
    item['crawldate'] = datetime.datetime.now().isoformat()
    item['author'] = "Paul Krugman"
    item['headline'] = safepop(
        sel.xpath('//h1[@class="entry-title"]/text()').extract(), 0)
    item['body'] = safepop(
        toolbox.mergeListElements(
            sel.xpath('//p[@class="story-body-text"]/text() | //p[@class="story-body-text"]/a/text()').extract()),
        0)
    # extract() already returns a list — no need to copy it element by
    # element into a second list.
    item['links'] = sel.xpath("//p[@class='story-body-text']/a/attribute::href").extract()

    # Fields this layout does not provide.
    item['references'] = ""
    item['comments'] = ""
    item['tags'] = ""
    item['teaser'] = ""
    return item
def parse(self, response):
    """Build a BlogItem from a post on The Upshot (NYT)."""
    sel = Selector(response)
    item = BlogItem()
    item['blog_name'] = "The Upshot"
    item['url'] = response.url
    item['releasedate'] = sel.xpath('//p[@class="byline-dateline"]/time[@class="dateline"]/attribute::datetime').extract()
    item['crawldate'] = datetime.datetime.now().isoformat()
    item['author'] = sel.xpath('//span[@class="byline-author"]/text()').extract()
    item['headline'] = sel.xpath('//h1[@class="story-heading" and @itemprop="headline"]/text()').extract()
    item['body'] = safepop(
        mergeListElements(
            sel.xpath('//p[@class="story-body-text story-content"]/text() | //p[@class="story-body-text story-content"]/a/text()').extract()),
        0)
    # extract() already returns a list — no need to copy it element by
    # element into a second list.
    item['links'] = sel.xpath("//p[@class='story-body-text story-content']/a/attribute::href").extract()

    # Fields this layout does not provide.
    item['references'] = ""
    item['comments'] = ""
    item['tags'] = ""
    item['teaser'] = ""

    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def extract_references(self, selector):
    """Return one merged text string per node in *selector*.

    Each node's descendant text fragments are merged with
    mergeListElements.
    """
    return [mergeListElements(node.xpath(".//text()").extract())
            for node in selector]