def test_last_update_date_ignores_invalid_updates(self):
    """Check that malformed update ids do not affect the extracted date.

    Every scenario pairs a list of update ids with the date that is expected
    back when 'ArxivIterativeCollect' is used as the prefix.
    """
    scenarios = [
        # Day of month (99) is not a real date.
        (
            [
                'ArxivIterativeCollect_2019-03-14',
                'ArxivIterativeCollect_2019-03-13',
                'ArxivIterativeCollect_2019-04-99',
            ],
            datetime.datetime(2019, 3, 14),
        ),
        # Dots instead of dashes, and an id that only carries a year.
        (
            [
                'ArxivIterativeCollect_2019.03.14',
                'ArxivIterativeCollect_2019-03-13',
                'ArxivIterativeCollect_2019',
            ],
            datetime.datetime(2019, 3, 13),
        ),
        # Newest date belongs to a longer, different prefix.
        (
            [
                'ArxivIterativeCollect_2019-03-14',
                'ArxivIterativeCollect_2019-03-13',
                'ArxivIterativeCollection_2019-04-09',
            ],
            datetime.datetime(2019, 3, 14),
        ),
        # Newest date has extra text in front of the prefix.
        (
            [
                'ArxivIterativeCollect_2019-03-13',
                'badArxivIterativeCollect_2019-03-14',
                'ArxivIterativeCollect_2018-04-01',
            ],
            datetime.datetime(2019, 3, 13),
        ),
    ]

    for update_ids, expected in scenarios:
        found = extract_last_update_date('ArxivIterativeCollect', update_ids)
        assert found == expected
def test_last_update_date_extracts_latest_date(self):
    """Check that the most recent date among valid update ids is returned.

    The newest entry is placed first, last, and duplicated across the three
    scenarios, so the result must not depend on position or on repeats.
    """
    scenarios = (
        (
            datetime.datetime(2019, 3, 14),
            ['ArxivIterativeCollect_2019-03-14',
             'ArxivIterativeCollect_2019-03-13',
             'ArxivIterativeCollect_2001-01-01'],
        ),
        (
            datetime.datetime(2020, 1, 1),
            ['ArxivIterativeCollect_2018-03-14',
             'ArxivIterativeCollect_2019-03-13',
             'ArxivIterativeCollect_2020-01-01'],
        ),
        (
            datetime.datetime(2001, 2, 3),
            ['ArxivIterativeCollect_2001-01-14',
             'ArxivIterativeCollect_2001-02-03',
             'ArxivIterativeCollect_2001-02-03'],
        ),
    )

    for expected, update_ids in scenarios:
        found = extract_last_update_date('ArxivIterativeCollect', update_ids)
        assert found == expected
def requires(self):
    """Yield the iterative collection task, resolving its start date first.

    Creates the MySQL engine for the 'dev' database (when ``self.test`` is
    truthy) or the 'production' database and stores it on ``self.engine``.

    If ``self.articles_from_date`` is None, the ``update_id`` values in
    ``luigi_table_updates`` that start with ``UPDATE_PREFIX`` are read and
    passed to ``extract_last_update_date``; the resulting date is stored on
    ``self.articles_from_date`` formatted as ``%Y-%m-%d``.

    Yields:
        CollectNewTask: built from this task's parameters and the resolved
            ``articles_from_date``.

    Raises:
        ValueError: if ``extract_last_update_date`` cannot determine a date
            from the previous updates.
    """
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

    if self.articles_from_date is None:
        logging.info("Extracting latest update date from database")
        query = text("SELECT update_id FROM luigi_table_updates "
                     f"WHERE update_id LIKE '{UPDATE_PREFIX}%'")
        with db_session(self.engine) as session:
            rows = session.execute(query).fetchall()
        # Each row is a 1-tuple; unwrap it to the bare update id.
        previous_updates = [update_id for (update_id, ) in rows]

        try:
            latest_update = extract_last_update_date(UPDATE_PREFIX,
                                                     previous_updates)
        except ValueError as err:
            # Chain explicitly so the underlying failure stays visible in
            # the traceback next to the actionable message.
            raise ValueError(
                "Date for iterative data collection could not be determined. "
                "Set the date manually with --articles-from-date"
            ) from err

        # Format via the instance: this does not depend on whether the
        # module binds `datetime` to the module or to the class.
        self.articles_from_date = latest_update.strftime('%Y-%m-%d')

    logging.info(f"Updating arxiv data from date: {self.articles_from_date}")

    yield CollectNewTask(date=self.date,
                         _routine_id=self._routine_id,
                         db_config_path=self.db_config_path,
                         db_config_env=self.db_config_env,
                         test=self.test,
                         insert_batch_size=self.insert_batch_size,
                         articles_from_date=self.articles_from_date)
def test_last_update_date_raises_valueerror_if_none_found(self):
    """An empty list of update ids must make the extraction raise ValueError."""
    with pytest.raises(ValueError):
        extract_last_update_date('ArxivIterativeCollect', [])