Beispiel #1
0
	def test_get_articles(self):
		"""An article saved under a reference date is returned by get_articles."""
		from brokenpromises import Article
		ref_date = (2011, 11, 2)
		saved = Article(url="test")
		saved.add_ref_date(ref_date)
		self.storage.save_article(saved)
		fetched = self.storage.get_articles(ref_date)
		# the round-trip must yield real Article instances, not raw records
		assert type(fetched[0]) is Article, type(fetched)
		assert len(fetched) == 1
Beispiel #2
0
 def test_get_articles(self):
     """Round-trip check: save one article, fetch it back by its date."""
     from brokenpromises import Article
     when = (2011, 11, 2)
     stored = Article(url="test")
     stored.add_ref_date(when)
     self.storage.save_article(stored)
     results = self.storage.get_articles(when)
     # storage must rehydrate records into Article instances
     assert type(results[0]) is Article, type(results)
     assert len(results) == 1
Beispiel #3
0
 def get_articles(self, year, month=None, day=None):
     """Fetch Guardian articles whose text mentions the given date.

     One API request is issued per textual date format produced by
     utils.get_all_date_formats. Each result is turned into an Article
     and its body is scraped from the page; results without a scrapable
     body are skipped (a warning is emitted). Returns a list of Article.
     """
     different_date_formats = utils.get_all_date_formats(year, month, day)
     articles = []
     for format in different_date_formats:
         response = self.request_api(keyword=format,
                                     end_date=utils.get_the_date_before(
                                         year, month, day))
         if response:
             for article in response['response']['results']:
                 # escaping conditions
                 # BUG FIX: the Guardian payload uses camelCase 'webUrl'
                 # (see the a.url assignment below); the old 'web_url'
                 # lookup always returned None, so the duplicate-URL
                 # check never triggered.
                 if article.get('webUrl') in [_.url for _ in articles]:
                     # this url is already added in the response
                     continue
                 a = Article(TheGuardian.__module__)
                 a.url = article.get('webUrl')
                 a.title = article.get('webTitle')
                 a.source = "The Guardian"
                 a.pub_date = datetime.datetime.strptime(
                     article.get('webPublicationDate'),
                     "%Y-%m-%dT%H:%M:%SZ")
                 # 'fields' may be absent from the payload; guard against
                 # None before the chained .get()
                 a.snippet = (article.get('fields') or {}).get('trailText')
                 # a.images   = TODO
                 # scrape body from page
                 a.body = self.scrape_body_article(a.url)
                 time.sleep(.11)  # throttle: be polite to the remote server
                 if a.body:
                     articles.append(a)
                 else:
                     warning("no body for article %s" % (a.__dict__))
     return articles
Beispiel #4
0
 def get_articles(self, year, month=None, day=None):
     """Collect New York Times articles matching the given date.

     Issues one API request per textual date format, keeps only docs of
     type 'article' or 'blog', deduplicates by url, scrapes each body
     and returns the resulting list of Article objects.
     """
     collected = []
     seen_urls = set()
     for date_keyword in utils.get_all_date_formats(year, month, day):
         response = self.request_api(
             keyword=date_keyword,
             end_date=utils.get_the_date_before(year, month, day))
         if not response:
             continue
         for doc in response['response']['docs']:
             # keep only real editorial content
             if doc.get('document_type') not in ('article', 'blog'):
                 continue
             # skip urls already collected in an earlier response
             if doc.get('web_url') in seen_urls:
                 continue
             entry = Article(NewYorkTimes.__module__)
             entry.url = doc.get('web_url')
             entry.title = doc.get('headline')['main']
             entry.source = doc.get('source') or "The New York Times"
             entry.pub_date = datetime.datetime.strptime(
                 doc.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
             entry.snippet = doc.get('snippet')
             # entry.images = TODO
             # scrape body from page, throttling between requests
             entry.body = self.scrape_body_article(entry.url)
             time.sleep(.11)
             collected.append(entry)
             seen_urls.add(entry.url)
     return collected
Beispiel #5
0
	def get_articles(self, year, month=None, day=None):
		"""Return Article objects from the NYT API for the given date.

		Tries every textual date format, filters out non-article docs,
		deduplicates by url and scrapes each article body.
		"""
		articles = []
		for keyword in utils.get_all_date_formats(year, month, day):
			response = self.request_api(keyword=keyword, end_date=utils.get_the_date_before(year, month, day))
			if not response:
				continue
			for raw in response['response']['docs']:
				# only 'article' and 'blog' documents are of interest
				if raw.get('document_type') not in ('article', 'blog'):
					continue
				# drop results whose url was already collected
				already_seen = [existing.url for existing in articles]
				if raw.get('web_url') in already_seen:
					continue
				entry = Article(NewYorkTimes.__module__)
				entry.url      = raw.get('web_url')
				entry.title    = raw.get('headline')['main']
				entry.source   = raw.get('source') or "The New York Times"
				entry.pub_date = datetime.datetime.strptime(raw.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
				entry.snippet  = raw.get('snippet')
				# entry.images = TODO
				# scrape body from page; small sleep throttles the scraper
				entry.body     = self.scrape_body_article(entry.url)
				time.sleep(.11)
				articles.append(entry)
		return articles
	def get_articles(self, year, month=None, day=None):
		"""Fetch Guardian articles mentioning the given date.

		One API request is issued per textual date format; request
		failures are reported on stderr and that format is skipped.
		Only articles whose body could be scraped are returned.
		"""
		different_date_formats = utils.get_all_date_formats(year, month, day)
		articles = []
		for format in different_date_formats:
			try:
				response = self.request_api(keyword=format)
			except Exception as e:
				# TODO: replace with proper logging
				# BUG FIX: 'print >> sys.stderr, e' is Python-2-only
				# syntax (SyntaxError under Python 3); write to stderr
				# in a version-agnostic way instead.
				import sys
				sys.stderr.write("%s\n" % e)
				continue
			for article in response['response']['results']:
				# escaping conditions
				# BUG FIX: the Guardian payload uses camelCase 'webUrl'
				# (see a.url below); 'web_url' was always None, so the
				# duplicate-URL check never triggered.
				if article.get('webUrl') in [_.url for _ in articles]:
					# this url is already added in the response
					continue
				a = Article(TheGuardian.__module__)
				a.url      = article.get('webUrl')
				a.title    = article.get('webTitle')
				a.source   = "The Guardian"
				a.pub_date = datetime.datetime.strptime(article.get('webPublicationDate'), "%Y-%m-%dT%H:%M:%SZ")
				# 'fields' may be absent from the payload; guard against None
				a.snippet  = (article.get('fields') or {}).get('trailText')
				# a.images   = TODO
				# scrape body from page
				a.body     = self.scrape_body_article(a.url)
				if a.body:
					articles.append(a)
				else:
					# TODO: log articles skipped for lack of a body
					pass
		return articles
Beispiel #7
0
	def get_articles(self, year, month=None, day=None):
		"""Fetch Guardian articles whose text mentions the given date.

		Queries the API once per textual date format, builds an Article
		per result and scrapes its body; results without a scrapable
		body are skipped with a warning. Returns a list of Article.
		"""
		different_date_formats = utils.get_all_date_formats(year, month, day)
		articles = []
		for format in different_date_formats:
			response = self.request_api(keyword=format, end_date=utils.get_the_date_before(year, month, day))
			if response:
				for article in response['response']['results']:
					# escaping conditions
					# BUG FIX: the Guardian payload uses camelCase
					# 'webUrl' (see a.url below); 'web_url' was always
					# None, so duplicates were never skipped.
					if article.get('webUrl') in [_.url for _ in articles]:
						# this url is already added in the response
						continue
					a = Article(TheGuardian.__module__)
					a.url      = article.get('webUrl')
					a.title    = article.get('webTitle')
					a.source   = "The Guardian"
					a.pub_date = datetime.datetime.strptime(article.get('webPublicationDate'), "%Y-%m-%dT%H:%M:%SZ")
					# 'fields' may be absent from the payload; guard against None
					a.snippet  = (article.get('fields') or {}).get('trailText')
					# a.images   = TODO
					# scrape body from page
					a.body     = self.scrape_body_article(a.url)
					time.sleep(.11)  # throttle: be polite to the remote server
					if a.body:
						articles.append(a)
					else:
						warning("no body for article %s" % (a.__dict__))
		return articles
Beispiel #8
0
 def test_save_article(self):
     """save_article both inserts a new article and updates an existing one."""
     article = Article(url="test")

     def check(result, status):
         # common postconditions for both the insert and the update path
         assert status in (CODE_UPDATE, CODE_INSERT)
         assert self.storage.get_collection(
             Storage.COLLECTION_ARTICLES).count() > 0
         assert type(result) is Article, type(result)
         assert result._id

     # first save performs an insert
     check(*self.storage.save_article(article))
     # saving the same article again performs an update
     check(*self.storage.save_article(article))
Beispiel #9
0
    def save_article(self, article):
        """Insert or update *article*, keyed on its url.

        Accepts a single article or a list/tuple of articles (in which
        case a list of results is returned). Returns a tuple
        (article, CODE) where CODE is CODE_INSERT for a new record or
        CODE_UPDATE when an existing record was merged and updated.
        Raises AssertionError when the article has no url.
        """
        # isinstance also accepts list/tuple subclasses (type(...) in (...) did not)
        if isinstance(article, (list, tuple)):
            # list comprehension instead of map(): on Python 3 map() would
            # return a lazy iterator, silently deferring the saves
            return [self.save_article(a) for a in article]
        assert article.url, "article needs an url to be saved"
        articles_collection = self.get_collection(Storage.COLLECTION_ARTICLES)
        previous = articles_collection.find_one({"url": article.url})
        if not previous:
            articles_collection.insert(article.__dict__)
            return (article, CODE_INSERT)
        # merge so the incoming article's values override the stored ones.
        # BUG FIX: dict(a.items() + b.items()) is Python-2-only; dict_items
        # objects cannot be concatenated on Python 3.
        article_merged = dict(previous)
        article_merged.update(article.__dict__)
        articles_collection.update({'_id': previous['_id']}, article_merged)
        # NOTE: the old trailing 'return (article, CODE_ERROR)' was
        # unreachable dead code and has been removed.
        return (Article(**article_merged), CODE_UPDATE)
Beispiel #10
0
 def get_articles(self, date=None, limit=0, skip=0):
     """Fetch stored records for *date* and rehydrate them as Article objects."""
     records = self._get_articles(date, limit=limit, skip=skip)
     rehydrated = []
     for record in records:
         rehydrated.append(Article(**record))
     return rehydrated