import json
import os
from datetime import datetime

# Project-specific helpers (get_subdirectories, get_latest_day, get_latest_hour,
# make_date_from_string, get_provider_dump, ArticleData, ...) are assumed to be
# defined or imported elsewhere in this module.


def get_latest_fetched_articles(db_root):
    providers = get_subdirectories(db_root)

    last_articles = {}
    last_errors = {}

    # TODO: fix this fallback value (used when no batch is found)
    fetched_date = datetime.today().date()

    for p in providers:
        provider_dir = os.path.join(db_root, p)
        all_days = get_subdirectories(provider_dir)
        last_day = get_latest_day(all_days)

        last_day_dir = os.path.join(provider_dir, last_day)
        all_hours = get_subdirectories(last_day_dir)
        last_hour = get_latest_hour(all_hours)

        fetched_date = make_date_from_string(last_day, last_hour)

        filename = os.path.join(last_day_dir, last_hour, 'articles.json')
        dump = get_provider_dump(filename)

        articles, errors = [], []
        for article in dump['articles']:
            articles.append(ArticleData.from_json(article))
        for error in dump['errors']:
            errors.append(error)

        last_articles[p] = articles
        last_errors[p] = errors

    return fetched_date, last_articles, last_errors
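# Usage sketch (not part of the original code): a hypothetical helper showing how
# the three values returned by get_latest_fetched_articles() can be consumed.
def example_print_latest_fetch(db_root):
    fetched_date, last_articles, last_errors = get_latest_fetched_articles(db_root)
    for provider in sorted(last_articles):
        print('{0}: {1} articles, {2} errors (last batch fetched {3})'.format(
            provider, len(last_articles[provider]), len(last_errors[provider]), fetched_date))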
def get_articles_from_batch(db_root, source_name, date_string, batch_time):
    json_file = os.path.join(db_root, source_name, date_string, batch_time, 'articles.json')
    with open(json_file, 'r') as f:
        json_content = json.load(f)
    articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
    return articles
def get_articles_per_batch(db_root, source_name, date_string):
    path = os.path.join(db_root, source_name, date_string)
    all_batch_times = get_subdirectories(path)

    all_batches = []
    for batch_time in all_batch_times:
        json_file = os.path.join(path, batch_time, 'articles.json')
        with open(json_file, 'r') as f:
            json_content = json.load(f)
        articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
        all_batches.append((batch_time, articles))

    all_batches.sort(key=lambda x: x[0])
    return all_batches
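# Usage sketch (illustration only): get_articles_per_batch() returns a list of
# (batch_time, articles) tuples sorted by batch time. All arguments simply mirror
# the function above; no specific values are assumed.
def example_count_articles_per_batch(db_root, source_name, date_string):
    for batch_time, articles in get_articles_per_batch(db_root, source_name, date_string):
        print('{0}: {1} articles'.format(batch_time, len(articles)))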
# The following functions are methods of a provider/source class (note the 'self'
# parameter); ARTICLES_FILENAME, REPROCESSED_DIR_PREFIX and NonExistentBatchError
# are assumed to be defined elsewhere.
def get_batch_content(self, date_string, batch_time_string):
    """ Returns the data saved for a specific batch. """
    batch_dir = os.path.join(self.directory, date_string, batch_time_string)
    if os.path.exists(batch_dir):
        json_filepath = os.path.join(batch_dir, ARTICLES_FILENAME)
        with open(json_filepath, 'r') as f:
            json_content = json.load(f)
        articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
        articles.sort(key=lambda art: art.url)
        n_errors = len(json_content['errors'])
        return articles, n_errors
    else:
        raise NonExistentBatchError(self.name, date_string, batch_time_string)
def get_batch_articles(self, date_string, batch_time_string):
    """ Returns the articles saved during the initial fetch of a specific batch.

    This function does not return the articles which might have been
    reprocessed after a (manual) error handling session. You should use
    the get_reprocessed_batch_articles() function for that.

    The function returns a list of ArticleData instances, sorted by
    article url.
    """
    batch_dir = os.path.join(self.directory, date_string, batch_time_string)
    if os.path.exists(batch_dir):
        json_filepath = os.path.join(batch_dir, ARTICLES_FILENAME)
        with open(json_filepath, 'r') as f:
            json_content = json.load(f)
        articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
        articles.sort(key=lambda art: art.url)
        return articles
    else:
        raise NonExistentBatchError(self.name, date_string, batch_time_string)
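# Usage sketch (illustration only): 'provider' is assumed to be an instance of the
# class these methods belong to. get_batch_articles() only covers the initial fetch;
# articles saved by later error-handling sessions come from
# get_reprocessed_batch_articles() below.
def example_print_batch_urls(provider, date_string, batch_time_string):
    for article in provider.get_batch_articles(date_string, batch_time_string):
        print(article.url)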
def get_reprocessed_batch_articles(self, date_string, batch_time_string):
    """ Returns the articles fetched during error handling sessions for a
    specific batch, as a list of ((date_string, hour_string), articles) tuples.
    """
    batch_dir = os.path.join(self.directory, date_string, batch_time_string)
    if os.path.exists(batch_dir):
        reprocessed_articles = list()
        reprocessed_dirs = [i for i in utils.get_subdirectories(batch_dir)
                            if i.startswith(REPROCESSED_DIR_PREFIX)]
        for reprocessed_data_dir in reprocessed_dirs:
            # Directory names are expected to look like '<prefix>_<date>_<time>'
            reprocessed_date, reprocessed_time = reprocessed_data_dir.split("_")[1:]
            json_filepath = os.path.join(batch_dir, reprocessed_data_dir, ARTICLES_FILENAME)
            with open(json_filepath, 'r') as f:
                json_content = json.load(f)
            articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
            articles.sort(key=lambda art: art.url)
            reprocessed_articles.append(((reprocessed_date, reprocessed_time), articles))
        return reprocessed_articles
    else:
        raise NonExistentBatchError(self.name, date_string, batch_time_string)
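# Usage sketch (illustration only): each entry returned by
# get_reprocessed_batch_articles() pairs the (date, time) of an error-handling
# session with the articles it produced. 'provider' is assumed to be an instance
# of the class these methods belong to.
def example_print_reprocessed_counts(provider, date_string, batch_time_string):
    for (reprocessed_date, reprocessed_time), articles in \
            provider.get_reprocessed_batch_articles(date_string, batch_time_string):
        print('{0} {1}: {2} reprocessed articles'.format(
            reprocessed_date, reprocessed_time, len(articles)))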