def get_articles(self, year, month=None, day=None):
    """Fetch Guardian articles published around the given date.

    One API request is issued per date format produced by
    utils.get_all_date_formats; each 'results' entry is converted to an
    Article and its body is scraped from the article page.

    Args:
        year: target year.
        month: optional target month.
        day: optional target day.

    Returns:
        list of Article objects; entries whose body could not be
        scraped are logged and dropped.
    """
    different_date_formats = utils.get_all_date_formats(year, month, day)
    articles = []
    # loop-invariant: same end date for every format
    end_date = utils.get_the_date_before(year, month, day)
    for date_format in different_date_formats:  # renamed: don't shadow builtin `format`
        response = self.request_api(keyword=date_format, end_date=end_date)
        if not response:
            continue
        for article in response['response']['results']:
            # escaping conditions
            # BUG FIX: the Guardian payload uses camelCase 'webUrl'
            # (see assignments below); the old check read 'web_url',
            # which is always None, so duplicates were never skipped.
            if article.get('webUrl') in [_.url for _ in articles]:
                # this url is already added in the response
                continue
            a = Article(TheGuardian.__module__)
            a.url = article.get('webUrl')
            a.title = article.get('webTitle')
            a.source = "The Guardian"
            a.pub_date = datetime.datetime.strptime(
                article.get('webPublicationDate'), "%Y-%m-%dT%H:%M:%SZ")
            # 'fields' may be missing from the payload — don't crash on None
            a.snippet = (article.get('fields') or {}).get('trailText')
            # a.images = TODO
            # scrape body from page
            a.body = self.scrape_body_article(a.url)
            time.sleep(.11)  # throttle so we don't hammer the site
            if a.body:
                articles.append(a)
            else:
                warning("no body for article %s" % (a.__dict__))
    return articles
def get_articles(self, year, month=None, day=None):
    """Collect New York Times articles for the given date.

    Issues one API request per date format produced by
    utils.get_all_date_formats, keeps only real articles/blog posts,
    de-duplicates by url and returns the documents rehydrated as
    Article objects (body scraped from the article page).
    """
    articles = []
    for date_format in utils.get_all_date_formats(year, month, day):
        response = self.request_api(
            keyword=date_format,
            end_date=utils.get_the_date_before(year, month, day))
        if not response:
            continue
        for doc in response['response']['docs']:
            # keep only editorial content
            if doc.get('document_type') not in ('article', 'blog'):
                continue
            # skip urls already collected by a previous format
            if doc.get('web_url') in [entry.url for entry in articles]:
                continue
            a = Article(NewYorkTimes.__module__)
            a.url = doc.get('web_url')
            a.title = doc.get('headline')['main']
            a.source = doc.get('source') or "The New York Times"
            a.pub_date = datetime.datetime.strptime(
                doc.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
            a.snippet = doc.get('snippet')
            # a.images = TODO
            # scrape body from page
            a.body = self.scrape_body_article(a.url)
            time.sleep(.11)
            articles.append(a)
    return articles
def test_get_articles(self):
    """A saved article must be retrievable by its reference date."""
    from brokenpromises import Article
    date = (2011, 11, 2)
    article = Article(url="test")
    article.add_ref_date(date)
    self.storage.save_article(article)
    fetched = self.storage.get_articles(date)
    assert type(fetched[0]) is Article, type(fetched)
    assert len(fetched) == 1
def test_save_article(self):
    """save_article must behave the same on insert and on update."""
    article = Article(url="test")
    # first pass inserts the document, second pass updates the same url;
    # the assertions are identical for both code paths
    for _phase in ("insert", "update"):
        saved, code = self.storage.save_article(article)
        assert code in (CODE_UPDATE, CODE_INSERT)
        assert self.storage.get_collection(
            Storage.COLLECTION_ARTICLES).count() > 0
        assert type(saved) is Article, type(saved)
        assert saved._id
def save_article(self, article):
    """
    Save or update an article using its url as the key.

    Args:
        article: an Article, or a list/tuple of Articles (each is
            saved recursively).

    Returns:
        (article, code) where code is CODE_INSERT for a new document
        or CODE_UPDATE for an existing one; for a list/tuple input, a
        list of such tuples.
    """
    if isinstance(article, (list, tuple)):
        # recurse per element; a list comprehension keeps the Python 2
        # behaviour (map() returned a list) on Python 3 where map() is lazy
        return [self.save_article(one) for one in article]
    assert article.url, "article needs an url to be saved"
    articles_collection = self.get_collection(Storage.COLLECTION_ARTICLES)
    previous = articles_collection.find_one({"url": article.url})
    if not previous:
        articles_collection.insert(article.__dict__)
        return (article, CODE_INSERT)
    # BUG FIX: `previous.items() + article.__dict__.items()` is Python-2-only
    # (dict views don't support `+` on Python 3). dict() + update() is
    # portable and keeps the same precedence: the incoming article's
    # fields override the stored document's.
    article_merged = dict(previous)
    article_merged.update(article.__dict__)
    articles_collection.update({'_id': previous['_id']}, article_merged)
    return (Article(**article_merged), CODE_UPDATE)
    # NOTE: the original trailing `return (article, CODE_ERROR)` was
    # unreachable (both branches above return) and has been removed.
def get_articles(self, date=None, limit=0, skip=0):
    """Load stored articles and rehydrate each raw document as an Article."""
    raw_documents = self._get_articles(date, limit=limit, skip=skip)
    return [Article(**document) for document in raw_documents]