Example #1
0
 def get_articles(self, year, month=None, day=None):
     """
     Fetch Guardian articles matching any textual format of the given date.

     Queries the API once per date format, deduplicates results by URL,
     scrapes each article body, and returns the collected Article list.
     Articles whose body cannot be scraped are logged and skipped.
     """
     different_date_formats = utils.get_all_date_formats(year, month, day)
     articles = []
     seen_urls = set()  # O(1) duplicate check instead of scanning `articles`
     for date_format in different_date_formats:
         response = self.request_api(keyword=date_format,
                                     end_date=utils.get_the_date_before(
                                         year, month, day))
         if not response:
             continue
         for article in response['response']['results']:
             url = article.get('webUrl')
             # escaping conditions: skip urls already collected.
             # BUGFIX: the original compared article.get('web_url'), which
             # is always None for the Guardian payload (key is 'webUrl'),
             # so duplicates were never filtered.
             if url in seen_urls:
                 continue
             a = Article(TheGuardian.__module__)
             a.url = url
             a.title = article.get('webTitle')
             a.source = "The Guardian"
             a.pub_date = datetime.datetime.strptime(
                 article.get('webPublicationDate'),
                 "%Y-%m-%dT%H:%M:%SZ")
             # guard: 'fields' may be absent from the payload
             fields = article.get('fields') or {}
             a.snippet = fields.get('trailText')
             # a.images   = TODO
             # scrape body from page
             a.body = self.scrape_body_article(a.url)
             time.sleep(.11)  # rate-limit the scraping requests
             if a.body:
                 seen_urls.add(url)
                 articles.append(a)
             else:
                 warning("no body for article %s" % (a.__dict__))
     return articles
Example #2
0
 def get_articles(self, year, month=None, day=None):
     """
     Fetch New York Times articles matching any textual format of the date.

     Queries the API once per date format, keeps only 'article'/'blog'
     documents, deduplicates by URL, scrapes each body, and returns the
     collected Article list.
     """
     different_date_formats = utils.get_all_date_formats(year, month, day)
     articles = []
     seen_urls = set()  # O(1) duplicate check instead of scanning `articles`
     for date_format in different_date_formats:
         response = self.request_api(keyword=date_format,
                                     end_date=utils.get_the_date_before(
                                         year, month, day))
         if not response:
             continue
         for article in response['response']['docs']:
             # escaping conditions
             if article.get('document_type') not in ('article', 'blog'):
                 # it's not an article
                 continue
             url = article.get('web_url')
             if url in seen_urls:
                 # this url is already added in the response
                 continue
             a = Article(NewYorkTimes.__module__)
             a.url = url
             # guard: 'headline' may be absent from the payload
             headline = article.get('headline') or {}
             a.title = headline.get('main')
             a.source = article.get('source') or "The New York Times"
             a.pub_date = datetime.datetime.strptime(
                 article.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
             a.snippet = article.get('snippet')
             # a.images   = TODO
             # scrape body from page
             a.body = self.scrape_body_article(a.url)
             time.sleep(.11)  # rate-limit the scraping requests
             seen_urls.add(url)
             articles.append(a)
     return articles
Example #3
0
 def test_get_articles(self):
     """Saving one article for a date then querying that date returns it."""
     from brokenpromises import Article
     date = (2011, 11, 2)
     a = Article(url="test")
     a.add_ref_date(date)
     self.storage.save_article(a)
     res = self.storage.get_articles(date)
     # assert the length BEFORE indexing: an empty result should fail
     # with a clear assertion, not an IndexError
     assert len(res) == 1, res
     assert isinstance(res[0], Article), type(res[0])
Example #4
0
 def test_save_article(self):
     """save_article satisfies the same contract on insert and on update."""
     a = Article(url="test")
     # First call inserts, second call updates the same url; the
     # post-conditions are identical, so assert them in one loop
     # instead of duplicating the block.
     for _phase in ("insert", "update"):
         res, code = self.storage.save_article(a)
         assert code in (CODE_UPDATE, CODE_INSERT), (_phase, code)
         assert self.storage.get_collection(
             Storage.COLLECTION_ARTICLES).count() > 0
         assert isinstance(res, Article), type(res)
         assert res._id
Example #5
0
    def save_article(self, article):
        """
        Save or update an article using its url.

        Accepts a single Article, or a list/tuple of Articles (each saved
        recursively). Returns an (article, CODE) tuple — CODE_INSERT for a
        new document, CODE_UPDATE for an existing one — or a list of such
        tuples for a sequence input.
        """
        if isinstance(article, (list, tuple)):
            # list() keeps the return type stable on Python 3, where
            # map() is lazy.
            return list(map(self.save_article, article))
        assert article.url, "article needs an url to be saved"
        articles_collection = self.get_collection(Storage.COLLECTION_ARTICLES)
        previous = articles_collection.find_one({"url": article.url})
        if not previous:
            articles_collection.insert(article.__dict__)
            return (article, CODE_INSERT)
        # Merge with the new values winning over the stored document.
        # BUGFIX: `previous.items() + article.__dict__.items()` was
        # Python-2-only — dict views don't concatenate on Python 3.
        article_merged = dict(previous)
        article_merged.update(article.__dict__)
        articles_collection.update({'_id': previous['_id']},
                                   article_merged)
        return (Article(**article_merged), CODE_UPDATE)
Example #6
0
 def get_articles(self, date=None, limit=0, skip=0):
     """Return the stored articles for *date*, wrapped as Article objects."""
     raw_documents = self._get_articles(date, limit=limit, skip=skip)
     wrapped = []
     for document in raw_documents:
         wrapped.append(Article(**document))
     return wrapped