def test_main_unsuccessful(self):
        """Check that an upload failure propagates and the database is still closed.

        The upload/archive step is replaced by a mock that raises. The test
        expects `main` to raise, the upload mock to have received the
        collected file list, and the database mock to have been connected and
        then disconnected with a truthy first positional argument.
        """

        # TODO: use an actual argparse object for the args instead of a MagicMock
        cli_options = {
            'log_file': None,
            'data_dir': 'data',
            'is_wip_override': False,
            'not_wip_override': False,
            'specific_issue_date': False,
        }
        args = MagicMock(**cli_options)

        # database double whose row count is always zero
        mock_database = MagicMock()
        mock_database.count_all_rows.return_value = 0

        def fake_database_impl():
            return mock_database

        # one file is "collected", and uploading it raises
        mock_collect_files = MagicMock(return_value=[("a", False)])
        mock_upload_archive = MagicMock(side_effect=Exception('testing'))

        injected = {
            'database_impl': fake_database_impl,
            'collect_files_impl': mock_collect_files,
            'upload_archive_impl': mock_upload_archive,
        }
        with self.assertRaises(Exception):
            main(args, **injected)

        # the upload step was reached with the collected files
        self.assertTrue(mock_upload_archive.called)
        upload_positional = mock_upload_archive.call_args[0]
        self.assertEqual(upload_positional[0], [("a", False)])

        # the connection was opened and closed despite the failure
        self.assertTrue(mock_database.connect.called)
        self.assertTrue(mock_database.disconnect.called)
        disconnect_positional = mock_database.disconnect.call_args[0]
        self.assertTrue(disconnect_positional[0])
    def test_main_early_exit(self):
        """Check that nothing is uploaded when no files are collected.

        The file collector mock returns an empty list. The test expects the
        collector to have been called with the data directory, and neither the
        upload step nor the database connect/disconnect to have been invoked.
        """

        # TODO: use an actual argparse object for the args instead of a MagicMock
        cli_options = {
            'log_file': None,
            'data_dir': 'data',
            'is_wip_override': False,
            'not_wip_override': False,
            'specific_issue_date': False,
        }
        args = MagicMock(**cli_options)

        # database double whose row count is always zero
        mock_database = MagicMock()
        mock_database.count_all_rows.return_value = 0

        def fake_database_impl():
            return mock_database

        # the receiving directory yields no files at all
        mock_collect_files = MagicMock(return_value=[])
        mock_upload_archive = MagicMock()

        injected = {
            'database_impl': fake_database_impl,
            'collect_files_impl': mock_collect_files,
            'upload_archive_impl': mock_upload_archive,
        }
        main(args, **injected)

        # files were looked for in the configured data directory
        self.assertTrue(mock_collect_files.called)
        collect_positional = mock_collect_files.call_args[0]
        self.assertEqual(collect_positional[0], 'data')

        # with nothing to do, neither the upload nor the database is touched
        self.assertFalse(mock_upload_archive.called)
        self.assertFalse(mock_database.connect.called)
        self.assertFalse(mock_database.disconnect.called)
Beispiel #3
0
    def test_main_successful(self):
        """Check the happy path: files are collected, uploaded, and the db closed.

        The collector mock returns one file and the upload mock succeeds. The
        test expects the collector to have been called with the data
        directory, the upload mock to have received the collected list, and the
        database mock to have been connected and then disconnected with a
        truthy first positional argument.
        """

        # TODO: use an actual argparse object for the args instead of a MagicMock
        cli_options = {
            'data_dir': 'data',
            'is_wip_override': False,
            'not_wip_override': False,
            'specific_issue_date': False,
        }
        args = MagicMock(**cli_options)

        # database double whose row count is always zero
        mock_database = MagicMock()
        mock_database.count_all_rows.return_value = 0

        def fake_database_impl():
            return mock_database

        # one file is "collected" and the upload step succeeds
        mock_collect_files = MagicMock(return_value=[("a", False)])
        mock_upload_archive = MagicMock()

        injected = {
            'database_impl': fake_database_impl,
            'collect_files_impl': mock_collect_files,
            'upload_archive_impl': mock_upload_archive,
        }
        main(args, **injected)

        # files were looked for in the configured data directory
        self.assertTrue(mock_collect_files.called)
        collect_positional = mock_collect_files.call_args[0]
        self.assertEqual(collect_positional[0], 'data')

        # the collected files were handed to the upload step
        self.assertTrue(mock_upload_archive.called)
        upload_positional = mock_upload_archive.call_args[0]
        self.assertEqual(upload_positional[0], [("a", False)])

        # the connection was opened and then closed
        self.assertTrue(mock_database.connect.called)
        self.assertTrue(mock_database.disconnect.called)
        disconnect_positional = mock_database.disconnect.call_args[0]
        self.assertTrue(disconnect_positional[0])
    def test_uploading(self):
        """Write CSVs, run the uploader, then check the API, database and archive.

        Six files are dropped into the receiving directory of a fake source,
        `main` is run once over them, and the test then verifies:
        the API responses for four signals, the timestamp/direction columns of
        every row in the `covidcast` table, and the archive locations of the
        processed files.
        """

        # show the complete diff when an equality assertion fails
        self.maxDiff = None

        # receiving directory for a fake data source
        data_dir = 'covid/data'
        source_receiving_dir = data_dir + '/receiving/src-name'
        os.makedirs(source_receiving_dir, exist_ok=True)

        csv_header = 'geo_id,val,se,sample_size\n'

        def write_receiving_file(file_name, lines):
            """Create `file_name` in the receiving directory from `lines`."""
            with open(source_receiving_dir + file_name, 'w') as handle:
                for line in lines:
                    handle.write(line)

        # (file name, file contents) pairs, written in this order
        fixture_files = [
            # valid
            ('/20200419_state_test.csv',
             [csv_header, 'ca,1,0.1,10\n', 'tx,2,0.2,20\n', 'fl,3,0.3,30\n']),
            # valid wip
            ('/20200419_state_wip_prototype.csv',
             [csv_header, 'me,10,0.01,100\n', 'nd,20,0.02,200\n',
              'wa,30,0.03,300\n']),
            # valid: long signal name that the API check below expects to succeed
            ('/20200419_state_wip_really_long_name_that_will_be_accepted.csv',
             [csv_header, 'pa,100,5.4,624\n']),
            # invalid: signal name so long that no results are expected below
            ('/20200419_state_wip_really_long_name_that_will_get_truncated_lorem_ipsum_dolor_sit_amet.csv',
             [csv_header, 'pa,100,5.4,624\n']),
            # invalid: unexpected header row
            ('/20200420_state_test.csv', ['this,header,is,wrong\n']),
            # invalid: file name does not follow the naming pattern
            ('/hello.csv', ['file name is wrong\n']),
        ]
        for file_name, lines in fixture_files:
            write_receiving_file(file_name, lines)

        # upload CSVs
        # TODO: use an actual argparse object for the args instead of a MagicMock
        cli_options = {
            'data_dir': data_dir,
            'is_wip_override': False,
            'not_wip_override': False,
            'specific_issue_date': False,
        }
        args = MagicMock(**cli_options)
        main(args)

        def fetch(signal):
            """Query the API for all state rows of `signal` on 2020-04-19."""
            return Epidata.covidcast('src-name', signal, 'day', 'state',
                                     20200419, '*')

        def expected_row(geo_value, value, stderr, sample_size, signal):
            """Build one expected API row (issue and lag are added later)."""
            return {
                'time_value': 20200419,
                'geo_value': geo_value,
                'value': value,
                'stderr': stderr,
                'sample_size': sample_size,
                'direction': None,
                'signal': signal,
            }

        # request CSV data from the API
        response = fetch('test')

        # rows are expected to carry today's date as their issue
        expected_issue_day = date.today()
        expected_issue = expected_issue_day.strftime("%Y%m%d")

        def apply_lag(expected_epidata):
            """Add the expected `issue` and `lag` fields to each row in place."""
            for row in expected_epidata:
                row['issue'] = int(expected_issue)
                # split the YYYYMMDD integer into its date components
                year, month_and_day = divmod(row['time_value'], 10000)
                month, day = divmod(month_and_day, 100)
                reference_day = date(year=year, month=month, day=day)
                row['lag'] = (expected_issue_day - reference_day).days
            return expected_epidata

        # verify data matches the CSV
        # NB these are ordered by geo_value
        expected_test_rows = apply_lag([
            expected_row('ca', 1, 0.1, 10, 'test'),
            expected_row('fl', 3, 0.3, 30, 'test'),
            expected_row('tx', 2, 0.2, 20, 'test'),
        ])
        self.assertEqual(response, {
            'result': 1,
            'epidata': expected_test_rows,
            'message': 'success',
        })

        # request CSV data from the API on WIP signal
        response = fetch('wip_prototype')

        # verify data matches the CSV
        # NB these are ordered by geo_value
        expected_wip_rows = apply_lag([
            expected_row('me', 10, 0.01, 100, 'wip_prototype'),
            expected_row('nd', 20, 0.02, 200, 'wip_prototype'),
            expected_row('wa', 30, 0.03, 300, 'wip_prototype'),
        ])
        self.assertEqual(response, {
            'result': 1,
            'epidata': expected_wip_rows,
            'message': 'success',
        })

        # request CSV data from the API on the signal with name length 32<x<64
        response = fetch('wip_really_long_name_that_will_be_accepted')

        # verify data matches the CSV
        expected_long_rows = apply_lag([
            expected_row('pa', 100, 5.4, 624,
                         'wip_really_long_name_that_will_be_accepted'),
        ])
        self.assertEqual(response, {
            'result': 1,
            'message': 'success',
            'epidata': expected_long_rows,
        })

        # request CSV data from the API on the long-named signal
        response = fetch(
            'wip_really_long_name_that_will_get_truncated_lorem_ipsum_dolor_s')

        # if the CSV failed correctly there should be no results
        self.assertEqual(response, {
            'result': -2,
            'message': 'no results',
        })

        # verify timestamps and default values are reasonable
        self.cur.execute(
            'select value_updated_timestamp, direction_updated_timestamp, direction from covidcast'
        )
        for value_ts, direction_ts, direction in self.cur:
            self.assertGreater(value_ts, 0)
            self.assertEqual(direction_ts, 0)
            self.assertIsNone(direction)

        # verify that the CSVs were archived; os.stat raises if a path is absent
        archived_paths = [
            f'/archive/successful/src-name/20200419_state_{sig}.csv.gz'
            for sig in ["test", "wip_prototype"]
        ]
        archived_paths.append('/archive/failed/src-name/20200420_state_test.csv')
        archived_paths.append('/archive/failed/unknown/hello.csv')
        for archived in archived_paths:
            self.assertIsNotNone(os.stat(data_dir + archived))
    def test_uploading(self):
        """Scan, parse, upload, archive, serve, and fetch a covidcast signal.

        Each subtest writes one CSV into the receiving directory, runs `main`
        over it, and checks the API response and/or the archive location.
        Every subtest ends by calling `tearDown()` and `setUp()` so that the
        next one starts from whatever state those fixtures establish.

        Relies on `self.apply_lag` and `self.verify_timestamps_and_defaults`,
        which are defined elsewhere on the test class.
        """

        # print full diff if something unexpected comes out
        self.maxDiff = None

        # make some fake data files
        data_dir = 'covid/data'
        source_receiving_dir = data_dir + '/receiving/src-name'
        log_file_directory = "/var/log/"
        os.makedirs(source_receiving_dir, exist_ok=True)
        os.makedirs(log_file_directory, exist_ok=True)
        # TODO: use an actual argparse object for the args instead of a MagicMock
        args = MagicMock(log_file=log_file_directory + "output.log",
                         data_dir=data_dir,
                         is_wip_override=False,
                         not_wip_override=False,
                         specific_issue_date=False)
        # maps CSV column names to the field names used in API responses;
        # columns not listed here (e.g. sample_size) keep their name
        uploader_column_rename = {
            "geo_id": "geo_value",
            "val": "value",
            "se": "stderr",
            "missing_val": "missing_value",
            "missing_se": "missing_stderr"
        }

        with self.subTest("Valid CSV with correct missing columns"):
            values = pd.DataFrame({
                "geo_id": ["ca", "fl", "tx"],
                "val": [1.0, 2.0, 3.0],
                "se": [0.1, 0.2, 0.3],
                "sample_size": [10.0, 20.0, 30.0],
                "missing_val": [Nans.NOT_MISSING] * 3,
                "missing_se": [Nans.NOT_MISSING] * 3,
                "missing_sample_size": [Nans.NOT_MISSING] * 3
            })
            signal_name = "test"
            values.to_csv(source_receiving_dir +
                          f'/20200419_state_{signal_name}.csv',
                          index=False)

            # upload CSVs
            main(args)
            response = Epidata.covidcast('src-name', signal_name, 'day',
                                         'state', 20200419, '*')

            # expected rows: the CSV values plus the constant per-row fields,
            # with columns renamed to the API field names
            expected_values = pd.concat(
                [
                    values,
                    pd.DataFrame({
                        "time_value": [20200419] * 3,
                        "signal": [signal_name] * 3,
                        "direction": [None] * 3
                    })
                ],
                axis=1).rename(columns=uploader_column_rename).to_dict(
                    orient="records")
            expected_response = {
                'result': 1,
                'epidata': self.apply_lag(expected_values),
                'message': 'success'
            }

            self.assertEqual(response, expected_response)
            self.verify_timestamps_and_defaults()

            # Verify that files were archived (os.stat raises if the path is absent)
            path = data_dir + f'/archive/successful/src-name/20200419_state_{signal_name}.csv.gz'
            self.assertIsNotNone(os.stat(path))

            self.tearDown()
            self.setUp()

        with self.subTest(
                "Valid CSV with no missing columns should set intelligent defaults"
        ):
            # dtype=object keeps the None entries as None instead of NaN
            values = pd.DataFrame(
                {
                    "geo_id": ["ca", "fl", "tx"],
                    "val": [None, 2.0, 3.0],
                    "se": [0.1, None, 0.3],
                    "sample_size": [10.0, 20.0, None]
                },
                dtype=object)
            signal_name = "test_no_missing_cols"
            values.to_csv(source_receiving_dir +
                          f'/20200419_state_{signal_name}.csv',
                          index=False)

            # upload CSVs
            main(args)
            response = Epidata.covidcast('src-name', signal_name, 'day',
                                         'state', 20200419, '*')

            # the CSV has no missing_* columns, so the expected codes are
            # spelled out here: OTHER wherever the CSV value is None
            expected_values = pd.concat(
                [
                    values,
                    pd.DataFrame({
                        "time_value": [20200419] * 3,
                        "signal": [signal_name] * 3,
                        "direction": [None] * 3,
                        "missing_value": [Nans.OTHER] + [Nans.NOT_MISSING] * 2,
                        "missing_stderr":
                        [Nans.NOT_MISSING, Nans.OTHER, Nans.NOT_MISSING],
                        "missing_sample_size":
                        [Nans.NOT_MISSING] * 2 + [Nans.OTHER]
                    })
                ],
                axis=1).rename(columns=uploader_column_rename).to_dict(
                    orient="records")
            expected_response = {
                'result': 1,
                'epidata': self.apply_lag(expected_values),
                'message': 'success'
            }

            self.assertEqual(response, expected_response)
            self.verify_timestamps_and_defaults()

            self.tearDown()
            self.setUp()

        with self.subTest("Invalid, missing with an inf value"):
            # NOTE(review): the missing-indicator columns below use the
            # post-rename names (missing_value / missing_stderr) rather than the
            # CSV names (missing_val / missing_se) used by the other subtests --
            # confirm whether that is intentional.
            values = pd.DataFrame({
                "geo_id": ["tx"],
                "val": [np.inf],
                "se": [0.3],
                "sample_size": [None],
                "missing_value": [Nans.OTHER],
                "missing_stderr": [Nans.NOT_MISSING],
                "missing_sample_size": [Nans.NOT_MISSING]
            })
            signal_name = "test_with_inf"
            values.to_csv(source_receiving_dir +
                          f'/20200419_state_{signal_name}.csv',
                          index=False)

            # upload CSVs
            main(args)
            response = Epidata.covidcast('src-name', signal_name, 'day',
                                         'state', 20200419, '*')

            # the file is expected to be rejected, so nothing is served
            expected_response = {'result': -2, 'message': 'no results'}

            self.assertEqual(response, expected_response)
            self.verify_timestamps_and_defaults()
            self.tearDown()
            self.setUp()

        with self.subTest(
                "Valid, missing with incorrect missing codes, fixed by acquisition"
        ):
            # the codes written here contradict the data: val is None but
            # marked NOT_MISSING, sample_size is present but marked OTHER
            values = pd.DataFrame({
                "geo_id": ["tx"],
                "val": [None],
                "se": [0.3],
                "sample_size": [30.0],
                "missing_val": [Nans.NOT_MISSING],
                "missing_se": [Nans.NOT_MISSING],
                "missing_sample_size": [Nans.OTHER]
            }).replace({np.nan: None})
            signal_name = "test_incorrect_missing_codes"
            values.to_csv(source_receiving_dir +
                          f'/20200419_state_{signal_name}.csv',
                          index=False)

            # upload CSVs
            main(args)
            response = Epidata.covidcast('src-name', signal_name, 'day',
                                         'state', 20200419, '*')

            expected_values_df = pd.concat(
                [
                    values,
                    pd.DataFrame({
                        "time_value": [20200419],
                        "signal": [signal_name],
                        "direction": [None]
                    })
                ],
                axis=1).rename(columns=uploader_column_rename)
            # The served row is expected to carry the corrected codes. Assign
            # with a single .loc call: chained `df[col].iloc[0] = ...` writes
            # through an intermediate Series and is not guaranteed to update
            # the frame itself.
            first_row = expected_values_df.index[0]
            expected_values_df.loc[first_row, "missing_value"] = Nans.OTHER
            expected_values_df.loc[first_row,
                                   "missing_sample_size"] = Nans.NOT_MISSING
            expected_values = expected_values_df.to_dict(orient="records")
            expected_response = {
                'result': 1,
                'epidata': self.apply_lag(expected_values),
                'message': 'success'
            }

            self.assertEqual(response, expected_response)
            self.verify_timestamps_and_defaults()

            self.tearDown()
            self.setUp()

        with self.subTest("Valid wip"):
            values = pd.DataFrame({
                "geo_id": ["me", "nd", "wa"],
                "val": [10.0, 20.0, 30.0],
                "se": [0.01, 0.02, 0.03],
                "sample_size": [100.0, 200.0, 300.0],
                "missing_val": [Nans.NOT_MISSING] * 3,
                "missing_se": [Nans.NOT_MISSING] * 3,
                "missing_sample_size": [Nans.NOT_MISSING] * 3
            })
            signal_name = "wip_prototype"
            values.to_csv(source_receiving_dir +
                          f'/20200419_state_{signal_name}.csv',
                          index=False)

            # upload CSVs
            main(args)
            response = Epidata.covidcast('src-name', signal_name, 'day',
                                         'state', 20200419, '*')

            expected_values = pd.concat(
                [
                    values,
                    pd.DataFrame({
                        "time_value": [20200419] * 3,
                        "signal": [signal_name] * 3,
                        "direction": [None] * 3
                    })
                ],
                axis=1).rename(columns=uploader_column_rename).to_dict(
                    orient="records")
            expected_response = {
                'result': 1,
                'epidata': self.apply_lag(expected_values),
                'message': 'success'
            }

            self.assertEqual(response, expected_response)
            self.verify_timestamps_and_defaults()

            # Verify that files were archived (os.stat raises if the path is absent)
            path = data_dir + f'/archive/successful/src-name/20200419_state_{signal_name}.csv.gz'
            self.assertIsNotNone(os.stat(path))

            self.tearDown()
            self.setUp()

        with self.subTest("Valid signal with name length 32<x<64"):
            values = pd.DataFrame({
                "geo_id": ["pa"],
                "val": [100.0],
                "se": [5.4],
                "sample_size": [624.0],
                "missing_val": [Nans.NOT_MISSING],
                "missing_se": [Nans.NOT_MISSING],
                "missing_sample_size": [Nans.NOT_MISSING]
            })
            signal_name = "wip_really_long_name_that_will_be_accepted"
            values.to_csv(source_receiving_dir +
                          f'/20200419_state_{signal_name}.csv',
                          index=False)

            # upload CSVs
            main(args)
            response = Epidata.covidcast('src-name', signal_name, 'day',
                                         'state', 20200419, '*')

            expected_values = pd.concat(
                [
                    values,
                    pd.DataFrame({
                        "time_value": [20200419],
                        "signal": [signal_name],
                        "direction": [None]
                    })
                ],
                axis=1).rename(columns=uploader_column_rename).to_dict(
                    orient="records")
            expected_response = {
                'result': 1,
                'epidata': self.apply_lag(expected_values),
                'message': 'success'
            }

            self.assertEqual(response, expected_response)
            self.verify_timestamps_and_defaults()

            self.tearDown()
            self.setUp()

        with self.subTest("Invalid signal with a too-long name"):
            values = pd.DataFrame({
                "geo_id": ["pa"],
                "val": [100.0],
                "se": [5.4],
                "sample_size": [624.0],
                "missing_val": [Nans.NOT_MISSING],
                "missing_se": [Nans.NOT_MISSING],
                "missing_sample_size": [Nans.NOT_MISSING]
            })
            signal_name = "wip_really_long_name_that_will_get_truncated_lorem_ipsum_dolor_sit_amet"
            values.to_csv(source_receiving_dir +
                          f'/20200419_state_{signal_name}.csv',
                          index=False)

            # upload CSVs
            main(args)
            response = Epidata.covidcast('src-name', signal_name, 'day',
                                         'state', 20200419, '*')

            # the file is expected to be rejected, so nothing is served
            expected_response = {'result': -2, 'message': 'no results'}

            self.assertEqual(response, expected_response)
            self.verify_timestamps_and_defaults()

            self.tearDown()
            self.setUp()

        with self.subTest("Invalid file with a wrong header"):
            with open(source_receiving_dir + '/20200420_state_test.csv',
                      'w') as f:
                f.write('this,header,is,wrong\n')

            main(args)

            # the rejected file must end up under the source's failed archive
            path = data_dir + '/archive/failed/src-name/20200420_state_test.csv'
            self.assertIsNotNone(os.stat(path))

            self.tearDown()
            self.setUp()

        with self.subTest("Invalid file with a wrong name"):
            with open(source_receiving_dir + '/hello.csv', 'w') as f:
                f.write('file name is wrong\n')

            main(args)

            # a file with a wrong name is archived under "unknown"
            path = data_dir + '/archive/failed/unknown/hello.csv'
            self.assertIsNotNone(os.stat(path))

            self.tearDown()
            self.setUp()