def parse(self, response):
    """Parse a CoinDesk listing page, persist one NewsItem per article,
    then follow numeric pagination up to MAX_PAGES.
    """
    for article in response.css('div.article'):
        # Featured articles use a different DOM layout than regular ones.
        featured = article.css('div.article-featured').extract_first()
        item = NewsItem()
        # Fields identical in both layouts, hoisted out of the branches.
        item['description'] = 'some description'
        item['source'] = 'coindesk'
        if featured:
            item['title'] = article.css(
                'div.article-meta > h3::text').extract_first()
            item['image_url'] = article.css(
                'div.subfeatured-picture > img::attr(src)').extract_first()
            item['featured'] = True
        else:
            item['title'] = article.css('h3 > a::text').extract_first()
            item['image_url'] = article.css(
                'img.wp-post-image::attr(src)').extract_first()
            item['featured'] = False
        item.save()

    # Pagination: page URLs are sequential, so the next URL is built
    # directly rather than scraped.
    self.curr_page += 1
    next_url = 'http://www.coindesk.com/page/' + str(self.curr_page) + '/'
    # Renamed from `next` to avoid shadowing the builtin. The URL string is
    # always non-empty, so only the page limit needs checking.
    if self.curr_page <= MAX_PAGES:
        yield scrapy.Request(url=next_url, callback=self.parse)
    print('finished')
def parse(self, response):
    """Parse a reddit-style listing page into NewsItems, then follow the
    "next" pagination link up to MAX_PAGES.
    """
    items = []
    for listing in response.xpath('//div[@data-context="listing"]'):
        item = NewsItem()
        item['use_in_report'] = False

        # extract_first() returns None when a selector matches nothing;
        # guard every field so one malformed listing cannot crash the page.
        title = listing.css('a.title::text').extract_first()
        item['title'] = (title or '')[:254]  # truncate to fit the DB column
        item['description'] = ''

        image_url = listing.css('a.thumbnail img::attr(src)').extract_first()
        # Thumbnails are protocol-relative (//...); prefix the scheme.
        item['image_url'] = 'https:' + image_url if image_url else ''

        domain = listing.css('span.domain a::text').extract_first()
        item['category'] = detect_category(domain)

        # "1234 comments" -> 1234; default to 0 when absent or unparseable.
        # (The old re.sub('[ comments]', ...) char-class hack crashed on
        # None and on strings without digits.)
        comments = listing.css('a.bylink::text').extract_first() or ''
        match = re.search(r'\d+', comments)
        item['comments'] = int(match.group()) if match else 0

        votes = listing.css('div.unvoted::text').extract_first()
        item['votes'] = get_vote_number(votes)
        item.save()
        items.append(item)

    # Pagination
    self.curr_page += 1
    # `next_url` (not `next`) avoids shadowing the builtin.
    next_url = response.css('span.next-button a::attr(href)').extract_first()
    if next_url and self.curr_page < MAX_PAGES:
        yield scrapy.Request(url=next_url, callback=self.parse)
    print('finished ' + str(self.curr_page) + ' page')
def parse_items(self, response):
    """Parse a single story page into a NewsItem and persist it.

    Returns the saved item, or None when the page carries no story title
    (i.e. it is not an article page).
    """
    # NOTE: the original XPath strings used backslash line-continuation
    # inside the literal, embedding stray indentation whitespace into the
    # expression; adjacent-literal concatenation keeps them clean.
    title = response.xpath(
        '//div[@class="story"]/div[@class="title"]'
        '/h1/text()').extract()
    if not title:
        return None

    link = response.url
    created = response.xpath(
        '//div[@class="story"]/div'
        '/span[@class="timestamp"]/text()').extract()[0]
    # Drop the trailing two characters before parsing (the format string
    # has no AM/PM or timezone field).
    created = created[:-2]
    created = time.strptime(created, "%B %d, %Y %I:%M")

    content = response.xpath(
        '//div[@class="story"]'
        '/div[@class="main"]/div[@class="text_body"]').extract()
    tags = response.xpath(
        '//div[@class="story"]'
        '/div[@class="main"]/div[@class="tags"]'
        '/a[@class="tag"]/text()').extract()

    item = NewsItem()
    item['link'] = link
    item['title'] = title[0]
    item['created'] = strftime('%Y-%m-%d', created)
    item['content'] = content
    # De-duplicate tags; downstream does not rely on their order.
    item['tags'] = list(set(tags))
    item.save()
    return item
def parse_page(self, response):
    """Parse a Wired article page into a NewsItem via ItemLoader.

    Non-article pages (per the og:type meta tag) are skipped.
    """
    og_type = response.xpath(
        '//meta[contains(@property, "og:type")]//@content').extract_first()
    # Equality, not substring membership: the original `x in 'article'`
    # was True for '' and raised TypeError when the tag was missing (None).
    if og_type != 'article':
        return

    title = response.xpath(
        '//meta[contains(@property, "og:title")]//@content').extract_first()
    # Stable per-article id derived from the title.
    file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

    loader = ItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('headline',
                     '//meta[contains(@property, "og:title")]//@content')
    loader.add_value('file_id', file_id)
    loader.add_value('title', title)
    loader.add_value('link', response.url)
    loader.add_xpath(
        'description',
        '//meta[contains(@property, "og:description")]//@content')
    loader.add_xpath('author', '//meta[contains(@name, "author")]//@content')
    loader.add_xpath('content', '//div[contains(@class, "body")]//p//text()')
    loader.add_xpath('pubDate', '//time//text()')
    loader.add_value('source', 'wired')
    yield loader.load_item()
def parse_page(self, response, title, description, pubDate, author):
    """Parse a Reuters article page into a NewsItem.

    title/description/pubDate are forwarded from the feed entry that
    linked here; `author` is accepted for signature compatibility but the
    byline is re-extracted from the page meta instead.
    """
    og_type = response.xpath(
        '//meta[contains(@property, "og:type")]//@content').extract_first()
    # Equality, not substring membership: the original `x in 'article'`
    # was True for '' and raised TypeError on None.
    if og_type != 'article':
        return

    file_id = hashlib.md5(title.encode('utf-8')).hexdigest()
    loader = ItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('headline', '//h1[contains(@class, "headline")]//text()')
    loader.add_value('file_id', file_id)
    loader.add_value('title', title)
    loader.add_value('link', response.url)
    loader.add_value('description', description)
    loader.add_xpath('author', '//meta[contains(@name, "Author")]//@content')
    # Body paragraphs plus the attribution block.
    loader.add_xpath(
        'content',
        '//div[contains(@class, "StandardArticleBody") or'
        'contains( @class , "Attribution_attribution")]/p//text()')
    # Sub-headings inside the article body.
    loader.add_xpath(
        'content',
        '//div[contains(@class, "StandardArticleBody")]/h3//text()')
    loader.add_value('pubDate', pubDate)
    loader.add_value('source', 'reuters')
    yield loader.load_item()
def parse_page(self, response):
    """Parse a Times UK article page into a NewsItem."""
    og_type = response.xpath(
        '//meta[contains(@property, "og:type")]//@content').extract_first()
    # Equality, not substring membership: the original `x in 'article'`
    # was True for '' and raised TypeError on None.
    if og_type != 'article':
        return

    link = response.url
    # Articles are keyed by a hash of their URL.
    file_id = hashlib.md5(link.encode('utf-8')).hexdigest()

    loader = ItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('headline',
                     '//meta[contains(@property, "og:title")]//@content')
    loader.add_value('file_id', file_id)
    loader.add_xpath('title', '//h1[contains(@role, "heading")]//text()')
    loader.add_value('link', link)
    loader.add_xpath(
        'description',
        '//meta[contains(@property, "description")]//@content')
    loader.add_xpath('author', '//meta[contains(@name, "author")]//@content')
    loader.add_xpath('content',
                     '//article[contains(@role, "article")]//p//text()')
    loader.add_xpath('pubDate', '//time/@datetime')
    loader.add_value('source', 'thetimesUK')
    yield loader.load_item()
def parse_page(self, response, title, author):
    """Parse an Observer article page into a NewsItem.

    `title` and `author` are accepted for signature compatibility with the
    caller; the title is re-extracted from the page itself and the byline
    comes from the rel="author" link.
    """
    og_type = response.xpath(
        '//meta[contains(@property, "og:type")]//@content').extract_first()
    # Equality, not substring membership: the original `x in 'article'`
    # was True for '' and raised TypeError on None.
    if og_type != 'article':
        return

    title = response.xpath(
        '//h1[contains(@class, "entry-title")]//text()').extract_first()
    file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

    loader = ItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('headline',
                     '//h1[contains(@class, "entry-title")]//text()')
    loader.add_value('file_id', file_id)
    loader.add_value('title', title)
    loader.add_value('link', response.url)
    loader.add_xpath('description',
                     '//meta[contains(@name, "description")]//@content')
    loader.add_xpath('author', '//a[contains(@rel, "author")]//text()')
    loader.add_xpath(
        'content',
        '//div[contains(@itemprop, "articleBody") or'
        'contains(@class, "post-content")]//p//text()')
    loader.add_xpath('pubDate', '//article/@data-date')
    loader.add_value('source', 'theobserver')
    yield loader.load_item()
def parse_page(self, response):
    """Parse a CNN article page into a NewsItem."""
    og_type = response.xpath(
        '//meta[contains(@property, "og:type")]//@content').extract_first()
    # Equality, not substring membership: the original `x in 'article'`
    # was True for '' and raised TypeError on None.
    if og_type != 'article':
        return

    # BUG FIX: `title` was referenced without ever being assigned
    # (guaranteed NameError on article pages); extract it from the page
    # meta before hashing.
    title = response.xpath(
        '//meta[contains(@name, "title")]//@content').extract_first()
    file_id = hashlib.md5(title.encode('utf-8')).hexdigest()

    loader = ItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('headline',
                     '//meta[contains(@property, "og:title")]//@content')
    loader.add_value('file_id', file_id)
    # BUG FIX: title/description/pubDate used add_value with an XPath
    # string, which would have stored the literal selector text instead of
    # the extracted value; add_xpath is what was intended.
    loader.add_xpath('title', '//meta[contains(@name, "title")]//@content')
    loader.add_value('link', response.url)
    loader.add_xpath('description',
                     '//meta[contains(@name, "description")]//@content')
    loader.add_xpath('author', '//meta[contains(@name, "author")]//@content')
    loader.add_xpath(
        'content',
        '//div[contains(@class, "zn-body__paragraph") or '
        'contains(@class, "BasicArticle") or'
        'contains(@class, "Paragraph__component")]//text()')
    # Sub-headings inside the story body.
    loader.add_xpath(
        'content',
        '//div[contains(@class, "zn-body__paragraph")]//h3//text()')
    loader.add_xpath('pubDate',
                     '//meta[contains(@name, "pubdate")]//@content')
    loader.add_value('source', 'cnn')
    yield loader.load_item()
def parse_page(self, response, title, description, pubDate, author):
    """Parse a BBC article page into a NewsItem.

    title/description/pubDate are forwarded from the feed entry; `author`
    is accepted for signature compatibility but re-extracted from the
    article:author meta tag.
    """
    og_type = response.xpath(
        '//meta[contains(@property, "og:type")]//@content').extract_first()
    # Equality, not substring membership: the original `x in 'article'`
    # was True for '' and raised TypeError on None.
    if og_type != 'article':
        return

    file_id = hashlib.md5(title.encode('utf-8')).hexdigest()
    loader = ItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('headline',
                     '//meta[contains(@property, "og:title")]//@content')
    loader.add_value('file_id', file_id)
    loader.add_value('title', title)
    loader.add_value('link', response.url)
    loader.add_value('description', description)
    loader.add_xpath(
        'author',
        '//meta[contains(@property, "article:author")]//@content')
    # Paragraphs from the story body plus media summaries.
    loader.add_xpath(
        'content',
        '//div[contains(@class, "story-body__inner") or'
        'contains(@id, "story-body") or'
        'contains(@class, "media__summary")]//p//text()')
    # Sub-headings inside the story body.
    loader.add_xpath(
        'content',
        '//div[contains(@class, "story-body__inner")]//h2//text()')
    loader.add_value('pubDate', pubDate)
    loader.add_value('source', 'bbc')
    yield loader.load_item()
def parse_items(self, response):
    """Parse an Al Jazeera article page into a NewsItem and persist it.

    Returns the saved item, or None for pages without an article headline.
    """
    title = response.xpath(
        '//div[@class="al-headline"]/'
        'div[@class="container"]/h1').extract()
    if not title:
        return None

    link = response.url
    title = strip_tags(title[0])

    # The byline date is the text node after the last '>' with the final
    # four characters (timezone suffix) dropped.
    created = response.xpath('//h4[@class="byline"]').extract()[0]
    created = created.split('>')[-2].strip()[:-4]

    # strptime cannot parse ordinal day suffixes (1st/2nd/3rd/4th...), so
    # the detected suffix is embedded literally into the format string.
    ord_str = next(
        (s for s in ('st', 'nd', 'rd', 'th') if s + ',' in created), None)
    if ord_str is None:
        # Previously concatenating None crashed with an opaque TypeError;
        # fail fast with a descriptive error instead.
        raise ValueError('unrecognized byline date: %r' % created)
    created_format = '%H:%M %p | %A, %B %d' + ord_str + ', %Y'
    created = time.strptime(created, created_format)

    content = response.xpath('//div[@class="main-article"]').extract()

    item = NewsItem()
    item['link'] = link
    item['title'] = title
    item['created'] = strftime('%Y-%m-%d', created)
    item['content'] = content
    item.save()
    return item
def parse_items(self, response):
    """Parse an Al Jazeera article page into a NewsItem and persist it.

    Returns the saved item, or None when the page has no article headline.
    """
    title = response.xpath(
        '//div[@class="al-headline"]/'
        'div[@class="container"]/h1').extract()
    if not title:
        return None

    link = response.url
    title = strip_tags(title[0])

    # Byline date: text node after the final '>' with the trailing four
    # characters (timezone suffix) removed.
    created = response.xpath('//h4[@class="byline"]').extract()[0]
    created = created.split('>')[-2].strip()[:-4]

    # The day number carries an ordinal suffix (1st, 2nd, 3rd, 4th...)
    # that strptime cannot parse; embed the detected suffix literally into
    # the format string.
    ord_str = None
    for suffix in ('st', 'nd', 'rd', 'th'):
        if suffix + ',' in created:
            ord_str = suffix
            break
    if ord_str is None:
        # Previously concatenating None raised an opaque TypeError; raise
        # a descriptive error instead.
        raise ValueError('unrecognized byline date: %r' % created)
    created = time.strptime(
        created, '%H:%M %p | %A, %B %d' + ord_str + ', %Y')

    content = response.xpath('//div[@class="main-article"]').extract()

    item = NewsItem()
    item['link'] = link
    item['title'] = title
    item['created'] = strftime('%Y-%m-%d', created)
    item['content'] = content
    item.save()
    return item
def parse_page(self, response, title, description, pubDate, author):
    """Parse a NYTimes article page into a NewsItem.

    All metadata except the body text is forwarded from the feed entry
    that linked here.
    """
    og_type = response.xpath(
        '//meta[contains(@property, "og:type")]//@content').extract_first()
    # Equality, not substring membership: the original `x in 'article'`
    # was True for '' and raised TypeError on None.
    if og_type != 'article':
        return

    file_id = hashlib.md5(title.encode('utf-8')).hexdigest()
    loader = ItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('headline', '//*//h1//text()')
    loader.add_value('file_id', file_id)
    loader.add_value('title', title)
    loader.add_value('link', response.url)
    loader.add_value('description', description)
    loader.add_value('author', author)
    loader.add_xpath(
        'content',
        '//section[contains(@name, "articleBody")]//p//text()')
    loader.add_value('pubDate', pubDate)
    loader.add_value('source', 'nytimes')
    yield loader.load_item()