def get_articles(self, year, month=None, day=None):
    """Fetch New York Times articles published on the given date.

    Queries the search API once per known textual date format, keeps only
    article/blog documents, deduplicates by URL, scrapes each body, and
    returns the collected ``Article`` objects.

    Args:
        year: publication year.
        month: optional publication month.
        day: optional publication day.

    Returns:
        list: ``Article`` instances with url, title, source, pub_date,
        snippet and scraped body populated.
    """
    date_formats = utils.get_all_date_formats(year, month, day)
    # loop-invariant: compute the end date once instead of per format
    end_date = utils.get_the_date_before(year, month, day)
    articles = []
    seen_urls = set()  # O(1) dedup instead of rescanning `articles` per item
    for date_format in date_formats:
        response = self.request_api(keyword=date_format, end_date=end_date)
        if not response:
            continue
        for doc in response['response']['docs']:
            # keep only real articles / blog posts
            if doc.get('document_type') not in ('article', 'blog'):
                continue
            url = doc.get('web_url')
            if url in seen_urls:
                # this url is already added in the response
                continue
            a = Article(NewYorkTimes.__module__)
            a.url = url
            # bug fix: `get('headline')['main']` raised TypeError when the
            # headline field was missing from the API document
            headline = doc.get('headline') or {}
            a.title = headline.get('main')
            a.source = doc.get('source') or "The New York Times"
            a.pub_date = datetime.datetime.strptime(
                doc.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
            a.snippet = doc.get('snippet')
            # a.images = TODO
            # scrape body from page; brief pause to throttle requests
            a.body = self.scrape_body_article(a.url)
            time.sleep(.11)
            seen_urls.add(url)
            articles.append(a)
    return articles
def get_articles(self, year, month=None, day=None):
    """Fetch Guardian articles published on the given date.

    Queries the content API once per known textual date format,
    deduplicates by URL, scrapes each article body, and keeps only
    articles whose body could be retrieved.

    Args:
        year: publication year.
        month: optional publication month.
        day: optional publication day.

    Returns:
        list: ``Article`` instances with url, title, source, pub_date,
        snippet and scraped body populated.
    """
    date_formats = utils.get_all_date_formats(year, month, day)
    articles = []
    seen_urls = set()  # O(1) dedup instead of rescanning `articles` per item
    for date_format in date_formats:
        try:
            response = self.request_api(keyword=date_format)
        except Exception as e:
            # TODO: replace with proper logging
            # bug fix: `print >> sys.stderr, e` is Python-2-only syntax
            import sys
            sys.stderr.write("%s\n" % e)
            continue
        if not response:
            # guard: a falsy response previously crashed on subscripting
            continue
        for result in response['response']['results']:
            # bug fix: dedup checked 'web_url', a key the Guardian API
            # never returns (it uses camelCase 'webUrl'), so it never fired
            url = result.get('webUrl')
            if url in seen_urls:
                continue
            a = Article(TheGuardian.__module__)
            a.url = url
            a.title = result.get('webTitle')
            a.source = "The Guardian"
            a.pub_date = datetime.datetime.strptime(
                result.get('webPublicationDate'), "%Y-%m-%dT%H:%M:%SZ")
            # 'fields' is only present when requested; guard against None
            a.snippet = (result.get('fields') or {}).get('trailText')
            # a.images = TODO
            # scrape body from page
            a.body = self.scrape_body_article(a.url)
            if a.body:
                seen_urls.add(url)
                articles.append(a)
            # else: TODO log articles whose body could not be scraped
    return articles
def get_articles(self, year, month=None, day=None):
    """Fetch New York Times articles published on the given date.

    Queries the search API once per known textual date format, keeps only
    article/blog documents, deduplicates by URL, scrapes each body, and
    returns the collected ``Article`` objects.

    Args:
        year: publication year.
        month: optional publication month.
        day: optional publication day.

    Returns:
        list: ``Article`` instances with url, title, source, pub_date,
        snippet and scraped body populated.
    """
    date_formats = utils.get_all_date_formats(year, month, day)
    # loop-invariant: compute the end date once instead of per format
    end_date = utils.get_the_date_before(year, month, day)
    articles = []
    seen_urls = set()  # O(1) dedup instead of rescanning `articles` per item
    for date_format in date_formats:
        response = self.request_api(keyword=date_format, end_date=end_date)
        if not response:
            continue
        for doc in response['response']['docs']:
            # keep only real articles / blog posts
            if doc.get('document_type') not in ('article', 'blog'):
                continue
            url = doc.get('web_url')
            if url in seen_urls:
                # this url is already added in the response
                continue
            a = Article(NewYorkTimes.__module__)
            a.url = url
            # bug fix: `get('headline')['main']` raised TypeError when the
            # headline field was missing from the API document
            headline = doc.get('headline') or {}
            a.title = headline.get('main')
            a.source = doc.get('source') or "The New York Times"
            a.pub_date = datetime.datetime.strptime(
                doc.get('pub_date'), "%Y-%m-%dT%H:%M:%SZ")
            a.snippet = doc.get('snippet')
            # a.images = TODO
            # scrape body from page; brief pause to throttle requests
            a.body = self.scrape_body_article(a.url)
            time.sleep(.11)
            seen_urls.add(url)
            articles.append(a)
    return articles
def get_articles(self, year, month=None, day=None):
    """Fetch Guardian articles published on the given date.

    Queries the content API once per known textual date format,
    deduplicates by URL, scrapes each article body, and keeps only
    articles whose body could be retrieved.

    Args:
        year: publication year.
        month: optional publication month.
        day: optional publication day.

    Returns:
        list: ``Article`` instances with url, title, source, pub_date,
        snippet and scraped body populated.
    """
    date_formats = utils.get_all_date_formats(year, month, day)
    # loop-invariant: compute the end date once instead of per format
    end_date = utils.get_the_date_before(year, month, day)
    articles = []
    seen_urls = set()  # O(1) dedup instead of rescanning `articles` per item
    for date_format in date_formats:
        response = self.request_api(keyword=date_format, end_date=end_date)
        if not response:
            continue
        for result in response['response']['results']:
            # bug fix: dedup checked 'web_url', a key the Guardian API
            # never returns (it uses camelCase 'webUrl'), so it never fired
            url = result.get('webUrl')
            if url in seen_urls:
                continue
            a = Article(TheGuardian.__module__)
            a.url = url
            a.title = result.get('webTitle')
            a.source = "The Guardian"
            a.pub_date = datetime.datetime.strptime(
                result.get('webPublicationDate'), "%Y-%m-%dT%H:%M:%SZ")
            # 'fields' is only present when requested; guard against None
            a.snippet = (result.get('fields') or {}).get('trailText')
            # a.images = TODO
            # scrape body from page; brief pause to throttle requests
            a.body = self.scrape_body_article(a.url)
            time.sleep(.11)
            if a.body:
                seen_urls.add(url)
                articles.append(a)
            else:
                warning("no body for article %s" % (a.__dict__))
    return articles
def get_articles(self, year, month=None, day=None):
    """Fetch Guardian articles published on the given date.

    Queries the content API once per known textual date format,
    deduplicates by URL, scrapes each article body, and keeps only
    articles whose body could be retrieved.

    Args:
        year: publication year.
        month: optional publication month.
        day: optional publication day.

    Returns:
        list: ``Article`` instances with url, title, source, pub_date,
        snippet and scraped body populated.
    """
    date_formats = utils.get_all_date_formats(year, month, day)
    # loop-invariant: compute the end date once instead of per format
    end_date = utils.get_the_date_before(year, month, day)
    articles = []
    seen_urls = set()  # O(1) dedup instead of rescanning `articles` per item
    for date_format in date_formats:
        response = self.request_api(keyword=date_format, end_date=end_date)
        if not response:
            continue
        for result in response['response']['results']:
            # bug fix: dedup checked 'web_url', a key the Guardian API
            # never returns (it uses camelCase 'webUrl'), so it never fired
            url = result.get('webUrl')
            if url in seen_urls:
                continue
            a = Article(TheGuardian.__module__)
            a.url = url
            a.title = result.get('webTitle')
            a.source = "The Guardian"
            a.pub_date = datetime.datetime.strptime(
                result.get('webPublicationDate'), "%Y-%m-%dT%H:%M:%SZ")
            # 'fields' is only present when requested; guard against None
            a.snippet = (result.get('fields') or {}).get('trailText')
            # a.images = TODO
            # scrape body from page; brief pause to throttle requests
            a.body = self.scrape_body_article(a.url)
            time.sleep(.11)
            if a.body:
                seen_urls.add(url)
                articles.append(a)
            else:
                warning("no body for article %s" % (a.__dict__))
    return articles