def parse(self, response):
    """Parse a Wired archive listing page and yield one ScrapyItem per entry.

    Fields filled: source, brief, url, title, tstamp (scrape time) and
    date (publication time), both as ISO 8601 strings.
    """
    # iterate entries
    for entry in response.css('div').css('li.archive-item-component'):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'wired'
        temp_string = entry.css('time::text').extract_first()
        item['brief'] = entry.css('a').css(
            'p.archive-item-component__desc::text').extract_first()
        # NOTE(review): href is stored as-is; sibling wired spiders prefix
        # 'https://www.wired.com' — confirm whether this one needs it too.
        item['url'] = entry.css('a::attr(href)').extract_first()
        item['title'] = entry.css('a').css('h2::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # transfer publication time into ISO 8601
        temp = timestring.Date(temp_string).date
        item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        yield item
def parse(self, response):
    """Parse an ESA search-results page and yield one ScrapyItem per result."""
    news = response.css('div.sr')
    titles = news.css('h4')
    briefs = news.css('p')
    # iterate title/brief pairs in lockstep rather than indexing by range(len(...)),
    # which also avoids an IndexError if the two lists differ in length
    for title, brief in zip(titles, briefs):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'esa'
        temp_string = brief.css('::text').extract_first()
        # only keep results whose first bold fragment is '...' —
        # presumably the excerpt marker on this page; confirm against the markup
        if brief.css('b::text').extract_first() != '...':
            continue
        item['brief'] = brief.css('::text').extract()[1]
        item['url'] = title.css('a::attr(href)').extract_first()
        item['title'] = title.css('a::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # transfer publication time into ISO 8601
        temp = timestring.Date(temp_string).date
        item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        yield item
def parse(self, response):
    """Yield a follow-up article request per search hit, then paginate."""
    for hit in response.css('div.listing-wide__inner'):
        item = ScrapyItem()
        item['source'] = 'scientificamerican'
        # date string is the text before the em-dash separator
        raw_date = hit.css('div.t_meta::text').extract_first().split(' — ')[0]
        item['brief'] = hit.css('p::text').extract_first()
        headline = hit.css('h2')
        item['url'] = headline.css('a::attr(href)').extract_first()
        item['title'] = headline.css('a::text').extract_first()
        # scrape timestamp, ISO 8601
        item['tstamp'] = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # publication date, ISO 8601
        item['date'] = timestring.Date(raw_date).date.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # fetch the full article; the item rides along in request meta
        article_req = scrapy.Request(item['url'], callback=self.parse_article)
        article_req.meta['item'] = item
        yield article_req
    # follow the pagination link, if any
    pager = response.css('div.pagination__right')
    if pager.css('a'):
        yield scrapy.Request(
            'https://www.scientificamerican.com/search/'
            + pager.css('a::attr(href)').extract_first())
def parse(self, response):
    """Parse a Wired archive page, follow each article, then paginate.

    Fix: article and pagination URLs are resolved with response.urljoin,
    which handles both site-relative and absolute hrefs, instead of blind
    string concatenation with the domain.
    """
    for entry in response.css('li.archive-item-component'):
        item = ScrapyItem()
        url_temp = entry.css('a::attr(href)').extract_first()
        item['url'] = response.urljoin(url_temp)
        item['source'] = 'wired'
        temp_string = entry.css('time::text').extract_first()
        item['brief'] = entry.css('a').css(
            'p.archive-item-component__desc::text').extract_first()
        item['title'] = entry.css('a').css('h2::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # transfer publication time into ISO 8601
        temp = timestring.Date(temp_string).date
        item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # request the article body; item rides along in request meta
        request = scrapy.Request(item['url'], callback=self.parse_article)
        request.meta['item'] = item
        yield request
    # go to next page if exists
    next_url = response.css('li.pagination-component__caret--right').css(
        'a::attr(href)').extract_first()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url))
def parse(self, response):
    """Parse a NYTimes listing page and yield one ScrapyItem per entry.

    Fix: the leftover debug print(item) is removed.
    """
    # iterate entries
    for entry in response.css('div.element2'):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'NYTimes'
        temp_string = entry.css('span.dateline::text').extract_first()
        # NOTE(review): extract_first() without '::text' stores the raw
        # <p class="summary"> HTML; sibling spiders use '::text' — confirm intent.
        item['brief'] = entry.css('p.summary').extract_first()
        item['url'] = entry.css('a::attr(href)').extract_first()
        item['title'] = entry.css('a::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # transfer publication time into ISO 8601
        temp = timestring.Date(temp_string).date
        item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        yield item
def parse(self, response):
    """Parse a Lockheed Martin press list and yield one ScrapyItem per entry."""
    # iterate entries
    for entry in response.css('li.wd_item'):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'lockheed_martin'
        temp_string = entry.css('div.wd_date::text').extract_first()
        item['brief'] = entry.css('div').css('p::text').extract_first()
        item['url'] = entry.css('a::attr(href)').extract_first()
        item['title'] = entry.css('div').css('a::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # transfer publication time into ISO 8601
        temp = timestring.Date(temp_string).date
        item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        yield item
def parse(self, response):
    # Two-phase crawl driven by class-level state on NewsSpider:
    # temp == False means this response is the search/listing page;
    # temp == True means it is an article page. j indexes the next
    # harvested article URL to visit.
    item = ScrapyItem()
    i = 0
    if NewsSpider.temp == False:
        if NewsSpider.j == 0:
            # First listing visit: store every result href into the
            # (presumably preallocated) numpy array NewsSpider.url_list
            # and emit a listing-level item per entry.
            for entry in response.css('div').css(
                    'li.archive-item-component'):
                np.put(NewsSpider.url_list, [i],
                       entry.css('a::attr(href)').extract_first())
                #url_list = np.append(entry.css('a::attr(href)').extract_first())
                url_temp = entry.css('a::attr(href)').extract_first()
                item['source'] = 'wired'
                temp_string = entry.css('time::text').extract_first()
                item['brief'] = entry.css('a').css(
                    'p.archive-item-component__desc::text').extract_first(
                    )
                item['url'] = entry.css('a::attr(href)').extract_first()
                item['title'] = entry.css('a').css(
                    'h2::text').extract_first()
                # scrape timestamp, ISO 8601
                now = datetime.datetime.now()
                now = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
                item['tstamp'] = now
                # transfer publication time into ISO 8601
                temp = timestring.Date(temp_string).date
                item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
                i = i + 1
                # NOTE(review): the single item instance created above is
                # reused across iterations — every yield re-emits the same
                # object with overwritten fields; confirm this is intended.
                yield item
        # Visit the j-th harvested article URL and switch to article mode.
        next_page = response.urljoin(NewsSpider.url_list.item(
            NewsSpider.j))
        NewsSpider.temp = True
        NewsSpider.j = NewsSpider.j + 1
        yield scrapy.Request(next_page, callback=self.parse)
    else:
        # Article page: collect every paragraph of the article body.
        item['article'] = response.css('article').css('div').css(
            'p::text').extract()
        yield item
        # Return to the fixed search page and reset the flag so the next
        # response is treated as a listing again.
        previous_page = 'https://www.wired.com/search/?page=1&q=rocket&size=1&sort=publishDate_tdt%20desc&types%5B0%5D=article'
        NewsSpider.temp = False
        yield scrapy.Request(previous_page, callback=self.parse)
def parse(self, response):
    """Parse a Roscosmos news list; dispatch per-article requests by year."""
    for row in response.css('div.newslist'):
        item = ScrapyItem()
        item['source'] = 'roscosmos'
        item['company'] = 'roscosmos'
        raw_date = row.css('div.date::text').extract_first()
        # the list page carries no abstract
        item['brief'] = 'none'
        item['url'] = row.css('a::attr(href)').extract_first()
        item['title'] = row.css('div').css('a.name::text').extract_first()
        # scrape timestamp, ISO 8601
        item['tstamp'] = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # publication date, ISO 8601
        parsed = timestring.Date(raw_date).date
        item['date'] = parsed.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # older articles use a different layout, so pick the callback by year
        article_url = response.urljoin(item['url'])
        callback = (self.parse_article_before2015
                    if parsed.year <= 2015
                    else self.parse_article_after2015)
        request = scrapy.Request(article_url, callback=callback)
        request.meta['item'] = item
        yield request
    # enqueue every further search page exactly once (guarded by class flag)
    if not NewsSpider.temp:
        NewsSpider.urls_list = response.css('div.text').css(
            'a::attr(href)').extract()
        NewsSpider.temp = True
        for href in NewsSpider.urls_list:
            page_url = response.urljoin(href)
            if page_url:
                yield scrapy.Request(page_url, callback=self.parse)
def parse(self, response):
    """Parse a Scientific American listing and yield one ScrapyItem per entry.

    Fixes, for consistency with every other spider in this project:
    'date' is converted to ISO 8601 via timestring (it was left as the raw
    page string) and 'tstamp' is stored as an ISO 8601 string (it was a
    datetime object, which is not serialization-friendly).
    """
    # iterate entries
    for entry in response.css('div.listing-wide__inner'):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'scientificamerican'
        # date string is the text before the em-dash separator
        raw_date = entry.css('div.t_meta::text').extract_first().split(
            ' — ')[0]
        item['date'] = timestring.Date(raw_date).date.strftime(
            "%Y-%m-%dT%H:%M:%S.%f%z")
        item['brief'] = entry.css('p::text').extract_first()
        item['url'] = entry.css('h2').css('a::attr(href)').extract_first()
        item['title'] = entry.css('h2').css('a::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        yield item
def parse(self, response):
    """Parse the Lockheed Martin press list; follow each article, and
    re-issue the query with an earlier 'end' date when the site's
    1000-result cap is hit."""
    entries = response.css('li.wd_item')
    # nothing to scrape on an empty page
    if not entries:
        return
    for entry in entries:
        item = ScrapyItem()
        item['source'] = 'lockheedmartin'
        item['company'] = 'lockheedmartin'
        # publication date -> ISO 8601
        raw_date = entry.css('div.wd_date::text').extract_first()
        item['date'] = timestring.Date(raw_date).date.strftime(
            "%Y-%m-%dT%H:%M:%S.%f%z")
        item['brief'] = entry.css('div').css('p::text').extract_first()
        item['url'] = entry.css('a::attr(href)').extract_first()
        item['title'] = entry.css('div').css('a::text').extract_first()
        # scrape timestamp, ISO 8601
        item['tstamp'] = datetime.datetime.now().strftime(
            "%Y-%m-%dT%H:%M:%S.%f%z")
        # fetch the article body; item rides along in request meta
        request = scrapy.Request(item['url'], self.parse_body)
        request.meta['item'] = item
        yield request
    # This website limits the articles shown to 1000; redo the request
    # changing the 'end' date if the count reaches 1000
    if len(response.css('li.wd_item')) == 1000:
        base = '&'.join(self.start_urls[0].split('&')[:-1])
        yield scrapy.Request(base + '&end=' + item['date'])
def parse(self, response):
    """Parse the ESA archive table; follow each article, then the next page.

    Fix: the old code ran print('\\n\\n' + nextpage_url + '\\n\\n') BEFORE
    checking nextpage_url for None, so the spider crashed with a TypeError
    on the last page. The debug print is removed.
    """
    for entry in response.css('#archt').css('tr'):
        # Declare the container item
        item = ScrapyItem()
        item['source'] = 'esa'
        item['company'] = 'esa'
        # Build the date string from three table cells
        tds = entry.css('td')
        temp_string = tds[1].css('::text').extract_first()
        temp_string += ' ' + tds[0].css('::text').extract_first()
        temp_string += ' ' + tds[2].css('::text').extract_first()
        # transfer time into ISO 8601
        temp = timestring.Date(temp_string).date
        item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # Extract the title
        item['title'] = tds[3].css('a::text').extract_first()
        # Extract the URL (table hrefs are site-relative)
        item['url'] = 'http://www.esa.int' + tds[3].css(
            'a::attr(href)').extract_first()
        # Save current time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # Proceed to retrieve the body and the abstract
        request = scrapy.Request(item['url'],
                                 callback=self.parse_body_brief)
        request.meta['item'] = item
        yield request
    # Go to next page if possible
    nextpage_url = response.css(
        'a[title="Next Page"]::attr(href)').extract_first()
    if nextpage_url:
        yield scrapy.Request('http://www.esa.int' + nextpage_url)
def parse(self, response):
    """Parse a SpaceNews listing; follow each article and the next page."""
    # iterate entries
    for article in response.css('div.launch-article'):
        # create information container
        item = ScrapyItem()
        item['source'] = 'spacenews'
        # publication date -> ISO 8601
        published = timestring.Date(
            article.css('time::text').extract_first()).date
        item['date'] = published.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        item['brief'] = article.css('p.post-excerpt::text').extract_first()
        headline = article.css('h2')
        item['url'] = headline.css('a::attr(href)').extract_first()
        item['title'] = headline.css('a::text').extract_first()
        # scrape timestamp, ISO 8601
        item['tstamp'] = datetime.datetime.now().strftime(
            "%Y-%m-%dT%H:%M:%S.%f%z")
        # fetch the full article; item rides along in request meta
        follow = scrapy.Request(item['url'], callback=self.parse_article)
        follow.meta['item'] = item
        yield follow
    # infinite-scroll pagination link, if present
    more = response.css('p.infinitescroll').css(
        'a::attr(href)').extract_first()
    if more:
        yield scrapy.Request(more)
def parse(self, response):
    """Parse an Arianespace listing page; follow each article, then paginate.

    Fix: the item was attached to the request twice (via the meta kwarg
    AND a redundant request.meta['item'] assignment); it is now passed once.
    """
    # iterate entries
    for entry in response.css('article.list-tpl__article'):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'arianespace'
        item['company'] = 'arianespace'
        temp_string = entry.css(
            'span.list-article__date::text').extract_first()
        item['url'] = entry.css(
            'a.list-article__title::attr(href)').extract_first()
        item['title'] = entry.css(
            'a.list-article__title::text').extract_first()
        # collapse \t\n runs and similar whitespace to single spaces
        item['title'] = ' '.join(item['title'].split())
        # get current time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # convert article time to ISO 8601
        temp = timestring.Date(temp_string).date
        item['date'] = temp.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        # get body and brief; item travels in request meta
        yield scrapy.Request(item['url'],
                             callback=self.get_brief_body,
                             meta={'item': item})
    # follow the right-arrow pagination button, if present
    paginate = response.css('div.paginate').css('a')
    for paginate_link in paginate:
        if paginate_link.css('button.paginate__button--right'):
            yield scrapy.Request(
                paginate_link.css('::attr(href)').extract_first())
def parse(self, response):
    """Parse a Thales listing page and yield one ScrapyItem per entry.

    Fixes: 'tstamp' is stored as an ISO 8601 string (it was a raw datetime
    object, inconsistent with the other spiders) and the leftover debug
    print(item) is removed.
    """
    # iterate entries
    for entry in response.css('div.big__list__item__info'):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'thales'
        # NOTE(review): value kept as-is (typo of 'NotAvailable') in case
        # downstream consumers match on the exact string — confirm.
        item['date'] = 'NotAvalaible'
        # NOTE(review): 'div.field__item even' selects an <even> element
        # inside div.field__item; 'div.field__item.even' was probably
        # intended — confirm against the page markup.
        item['brief'] = entry.css(
            'div.field__item even::text').extract_first()
        item['url'] = entry.css('a::attr(href)').extract_first()
        item['title'] = entry.css('a::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        yield item
def parse(self, response):
    """Parse the SpaceX news list; follow each article, then the next page."""
    for row in response.css('div.views-row'):
        # Declare the container item
        item = ScrapyItem()
        item['source'] = 'spacex'
        item['company'] = 'spacex'
        # publication date -> ISO 8601
        raw_date = row.css('div.date::text').extract_first()
        item['date'] = timestring.Date(raw_date).date.strftime(
            "%Y-%m-%dT%H:%M:%S.%f%z")
        headline = row.css('h2.title')
        item['title'] = headline.css('a::text').extract_first()
        # hrefs on this page are site-relative, hence the domain prefix
        item['url'] = 'http://www.spacex.com' + headline.css(
            'a::attr(href)').extract_first()
        item['brief'] = row.css('div.summary').css('p::text').extract_first()
        # scrape timestamp, ISO 8601
        item['tstamp'] = datetime.datetime.now().strftime(
            "%Y-%m-%dT%H:%M:%S.%f%z")
        # retrieve the body and the abstract from the article page
        follow = scrapy.Request(item['url'], callback=self.parse_body)
        follow.meta['item'] = item
        yield follow
    # Go to next page if possible
    next_href = response.css('li.pager-next').css(
        'a::attr(href)').extract_first()
    if next_href:
        yield scrapy.Request('http://www.spacex.com' + next_href)
def parse(self, response):
    """Parse a Washington Post feed page and yield one ScrapyItem per entry.

    Fixes: 'tstamp' is stored as an ISO 8601 string (it was a raw datetime
    object, inconsistent with the other spiders) and the leftover debug
    print(item) is removed.
    """
    # iterate entries
    for entry in response.css('div.pb-feed-headline.ng-scope'):
        # retrieve info for our current post
        item = ScrapyItem()
        item['source'] = 'WP'
        # NOTE(review): date kept as the page's display string; sibling
        # spiders convert to ISO 8601 via timestring — confirm desired format.
        item['date'] = entry.css(
            'span.pb-timestamp.ng-binding::text').extract_first()
        item['brief'] = entry.css(
            'div.pb-feed-description.ng-binding').extract_first()
        item['url'] = entry.css('a.ng-binding::attr(href)').extract_first()
        item['title'] = entry.css('a.ng-binding::text').extract_first()
        # record scrape time in ISO 8601
        now = datetime.datetime.now()
        item['tstamp'] = now.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
        yield item