Beispiel #1
0
    def test_last_update_date_ignores_invalid_updates(self):
        """Update ids that do not parse must not affect the extracted date."""
        prefix = 'ArxivIterativeCollect'
        scenarios = [
            # the 99th of April is not a real calendar day
            (['ArxivIterativeCollect_2019-03-14',
              'ArxivIterativeCollect_2019-03-13',
              'ArxivIterativeCollect_2019-04-99'],
             datetime.datetime(2019, 3, 14)),
            # dot-separated date and a bare year are both rejected
            (['ArxivIterativeCollect_2019.03.14',
              'ArxivIterativeCollect_2019-03-13',
              'ArxivIterativeCollect_2019'],
             datetime.datetime(2019, 3, 13)),
            # a longer task name that merely starts with the prefix
            (['ArxivIterativeCollect_2019-03-14',
              'ArxivIterativeCollect_2019-03-13',
              'ArxivIterativeCollection_2019-04-09'],
             datetime.datetime(2019, 3, 14)),
            # extra characters in front of the prefix
            (['ArxivIterativeCollect_2019-03-13',
              'badArxivIterativeCollect_2019-03-14',
              'ArxivIterativeCollect_2018-04-01'],
             datetime.datetime(2019, 3, 13)),
        ]

        for update_ids, expected in scenarios:
            found = extract_last_update_date(prefix, update_ids)
            assert found == expected
Beispiel #2
0
    def test_last_update_date_extracts_latest_date(self):
        """The most recent date wins regardless of position or duplicates."""
        prefix = 'ArxivIterativeCollect'
        scenarios = [
            # latest entry listed first
            (['ArxivIterativeCollect_2019-03-14',
              'ArxivIterativeCollect_2019-03-13',
              'ArxivIterativeCollect_2001-01-01'],
             datetime.datetime(2019, 3, 14)),
            # latest entry listed last
            (['ArxivIterativeCollect_2018-03-14',
              'ArxivIterativeCollect_2019-03-13',
              'ArxivIterativeCollect_2020-01-01'],
             datetime.datetime(2020, 1, 1)),
            # latest entry appears twice
            (['ArxivIterativeCollect_2001-01-14',
              'ArxivIterativeCollect_2001-02-03',
              'ArxivIterativeCollect_2001-02-03'],
             datetime.datetime(2001, 2, 3)),
        ]

        for update_ids, expected in scenarios:
            found = extract_last_update_date(prefix, update_ids)
            assert found == expected
Beispiel #3
0
    def requires(self):
        """
        Determine the start date for the update and yield the collection task.

        Builds the MySQL engine for the 'dev' database when ``self.test`` is
        truthy, otherwise for 'production', and stores it on ``self.engine``.
        When ``self.articles_from_date`` is None, the update ids starting
        with ``UPDATE_PREFIX`` are read from ``luigi_table_updates``, passed
        to ``extract_last_update_date`` and the resulting date is stored on
        ``self.articles_from_date`` formatted as ``%Y-%m-%d``.

        Yields:
            CollectNewTask: configured from this task's parameters and the
                resolved ``articles_from_date``.

        Raises:
            ValueError: if ``extract_last_update_date`` raises ValueError for
                the stored update ids; the original error is attached as the
                cause. Because this is a generator, it surfaces on iteration.
        """
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        if self.articles_from_date is None:
            logging.info("Extracting latest update date from database")
            query = text("SELECT update_id FROM luigi_table_updates "
                         f"WHERE update_id LIKE '{UPDATE_PREFIX}%'")
            with db_session(self.engine) as session:
                rows = session.execute(query).fetchall()
            # each row is a 1-tuple; keep only the update_id string
            previous_updates = [update_id for (update_id, ) in rows]
            try:
                latest_update = extract_last_update_date(
                    UPDATE_PREFIX, previous_updates)
            except ValueError as err:
                # chain explicitly so the traceback shows the direct cause
                raise ValueError(
                    "Date for iterative data collection could not be "
                    "determined. Set the date manually with "
                    "--articles-from-date"
                ) from err
            self.articles_from_date = latest_update.strftime('%Y-%m-%d')

        logging.info(
            f"Updating arxiv data from date: {self.articles_from_date}")

        yield CollectNewTask(date=self.date,
                             _routine_id=self._routine_id,
                             db_config_path=self.db_config_path,
                             db_config_env=self.db_config_env,
                             test=self.test,
                             insert_batch_size=self.insert_batch_size,
                             articles_from_date=self.articles_from_date)
Beispiel #4
0
    def test_last_update_date_raises_valueerror_if_none_found(self):
        """An empty list of update ids must raise ValueError."""
        no_updates = []
        with pytest.raises(ValueError):
            extract_last_update_date('ArxivIterativeCollect', no_updates)