Example #1
	def get_articles(self, year, month=None, day=None):
		different_date_formats = utils.get_all_date_formats(year, month, day)
		articles = []
		for date_format in different_date_formats:
			response = self.request_api(keyword=date_format, end_date=utils.get_the_date_before(year, month, day))
			if response:
				for article in response['response']['docs']:
					# escaping conditions
					if article.get('document_type') not in ('article', 'blog'):
						# it's not an article
						continue
					if article.get('web_url') in [_.url for _ in articles]:
						# this url is already added in the response
						continue
					a = Article(NewYorkTimes.__module__)
					a.url      = article.get('web_url')
					a.title    = article.get('headline')['main']
					a.source   = article.get('source') or "The New York Times"
					a.pub_date = datetime.datetime.strptime(article.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
					a.snippet  = article.get('snippet')
					# a.images   = TODO
					# scrape body from page
					a.body     = self.scrape_body_article(a.url)
					time.sleep(.11)  # pause briefly between article fetches
					articles.append(a)
		return articles
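The examples on this page lean on an Article container and two date helpers from utils that are never shown. The sketch below is inferred purely from the call sites; every field name, signature and return type in it is an assumption rather than the project's actual code.

import datetime

class Article:
    """Plain container for one search result; the fields mirror the attributes set in the examples."""
    def __init__(self, provider):
        self.provider = provider   # the examples pass the scraper's module name here
        self.url = None
        self.title = None
        self.source = None
        self.pub_date = None
        self.snippet = None
        self.body = None

def get_all_date_formats(year, month=None, day=None):
    """Assumed stand-in for utils.get_all_date_formats: a few textual spellings of the same date."""
    date = datetime.date(year, month or 1, day or 1)
    return [date.strftime(fmt) for fmt in ("%Y-%m-%d", "%d %B %Y", "%B %d, %Y")]

def get_the_date_before(year, month=None, day=None):
    """Assumed stand-in for utils.get_the_date_before: the day preceding the given date."""
    return datetime.date(year, month or 1, day or 1) - datetime.timedelta(days=1)

A real Article probably carries more fields (the commented-out a.images line hints at at least one), but nothing beyond the attributes visible above can be inferred from the snippets.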
Example #2
 def get_articles(self, year, month=None, day=None):
     different_date_formats = utils.get_all_date_formats(year, month, day)
     articles = []
     for date_format in different_date_formats:
         response = self.request_api(keyword=date_format,
                                     end_date=utils.get_the_date_before(
                                         year, month, day))
         if response:
             for article in response['response']['docs']:
                 # escaping conditions
                 if article.get('document_type') not in ('article', 'blog'):
                     # it's not an article
                     continue
                 if article.get('web_url') in [_.url for _ in articles]:
                     # this url is already added in the response
                     continue
                 a = Article(NewYorkTimes.__module__)
                 a.url = article.get('web_url')
                 a.title = article.get('headline')['main']
                 a.source = article.get('source') or "The New York Times"
                 a.pub_date = datetime.datetime.strptime(
                     article.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
                 a.snippet = article.get('snippet')
                 # a.images   = TODO
                 # scrape body from page
                 a.body = self.scrape_body_article(a.url)
                 time.sleep(.11)  # pause briefly between article fetches
                 articles.append(a)
     return articles
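Both New York Times examples call self.request_api(keyword=..., end_date=...), whose body is not part of this page. The sketch below shows what such a call against the NYT Article Search API could look like, written as a standalone function for clarity; the endpoint and parameter names follow the public API documentation, but the signature, the YYYYMMDD date formatting and the error handling are assumptions.

import requests

NYT_SEARCH_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

def request_api(api_key, keyword, end_date):
    """Query the NYT Article Search API; return the decoded JSON, or None on failure."""
    params = {
        "q": keyword,
        "end_date": end_date.strftime("%Y%m%d"),  # assumes end_date is a datetime.date
        "api-key": api_key,
    }
    try:
        resp = requests.get(NYT_SEARCH_URL, params=params, timeout=10)
        resp.raise_for_status()
        return resp.json()  # the examples above read response['response']['docs'] from this
    except requests.RequestException:
        return None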
Example #3
 def get_articles(self, year, month=None, day=None):
     different_date_formats = utils.get_all_date_formats(year, month, day)
     articles = []
     for date_format in different_date_formats:
         response = self.request_api(keyword=date_format,
                                     end_date=utils.get_the_date_before(
                                         year, month, day))
         if response:
             for article in response['response']['results']:
                 # escaping conditions
                 if article.get('webUrl') in [_.url for _ in articles]:
                     # this url is already added in the response
                     continue
                 a = Article(TheGuardian.__module__)
                 a.url = article.get('webUrl')
                 a.title = article.get('webTitle')
                 a.source = "The Guardian"
                 a.pub_date = datetime.datetime.strptime(
                     article.get('webPublicationDate'),
                     "%Y-%m-%dT%H:%M:%SZ")
                 a.snippet = (article.get('fields') or {}).get('trailText')  # 'fields' may be missing if show-fields was not requested
                 # a.images   = TODO
                 # scrape body from page
                 a.body = self.scrape_body_article(a.url)
                 time.sleep(.11)  # pause briefly between article fetches
                 if a.body:
                     articles.append(a)
                 else:
                     warning("no body for article %s" % a.__dict__)
     return articles
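Every example delegates the full text to self.scrape_body_article(url), which is also not shown. One possible implementation, sketched as a standalone function on top of requests and BeautifulSoup; the naive paragraph-join extraction is an assumption, and a real scraper would target each site's article container instead.

import requests
from bs4 import BeautifulSoup

def scrape_body_article(url):
    """Fetch the article page and return its visible paragraph text, or None on failure."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return None
    soup = BeautifulSoup(resp.text, "html.parser")
    # Naive extraction: join the text of all <p> elements on the page.
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
    return "\n".join(p for p in paragraphs if p) or None

Returning None on failure matters here: the Guardian examples log and drop articles whose body could not be fetched.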
Example #4
	def get_articles(self, year, month=None, day=None):
		different_date_formats = utils.get_all_date_formats(year, month, day)
		articles = []
		for date_format in different_date_formats:
			response = self.request_api(keyword=date_format, end_date=utils.get_the_date_before(year, month, day))
			if response:
				for article in response['response']['results']:
					# escaping conditions
					if article.get('webUrl') in [_.url for _ in articles]:
						# this url is already added in the response
						continue
					a = Article(TheGuardian.__module__)
					a.url      = article.get('webUrl')
					a.title    = article.get('webTitle')
					a.source   = "The Guardian"
					a.pub_date = datetime.datetime.strptime(article.get('webPublicationDate'), "%Y-%m-%dT%H:%M:%SZ")
					a.snippet  = (article.get('fields') or {}).get('trailText')  # 'fields' may be missing if show-fields was not requested
					# a.images   = TODO
					# scrape body from page
					a.body     = self.scrape_body_article(a.url)
					time.sleep(.11)  # pause briefly between article fetches
					if a.body:
						articles.append(a)
					else:
						warning("no body for article %s" % a.__dict__)
		return articles
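For the Guardian, request_api would target the Guardian content API instead. Another hedged sketch, again as a standalone function; show-fields=trailText is included because the examples read the trailText entry from the result's fields, but the parameters the original project actually sends are unknown.

import requests

GUARDIAN_SEARCH_URL = "https://content.guardianapis.com/search"

def request_api(api_key, keyword, end_date):
    """Query the Guardian content API; return the decoded JSON, or None on failure."""
    params = {
        "q": keyword,
        "to-date": end_date.isoformat(),  # assumes end_date is a datetime.date
        "show-fields": "trailText",       # so each result carries the snippet used above
        "api-key": api_key,
    }
    try:
        resp = requests.get(GUARDIAN_SEARCH_URL, params=params, timeout=10)
        resp.raise_for_status()
        return resp.json()  # the examples above read response['response']['results'] from this
    except requests.RequestException:
        return None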