def _make_docs_project(user):
    """
    Build the documentation example project owned by `user`: creates a project from
    docs-project.json, loads its ground truth from docs-ground-truth.csv, and loads one forecast
    from docs-predictions.json.

    Returns a 4-tuple: (project, time_zero, forecast_model, forecast).
    """
    # replace any project left over from a previous run
    previous = Project.objects.filter(name=DOCS_PROJECT_NAME).first()
    if previous is not None:
        click.echo("* deleting previous project: {}".format(previous))
        delete_project_iteratively(previous)

    project = create_project_from_json(Path('forecast_app/tests/projects/docs-project.json'), user)  # atomic
    project.name = DOCS_PROJECT_NAME
    project.save()

    truth_csv_path = Path('forecast_app/tests/truth_data/docs-ground-truth.csv')
    load_truth_data(project, truth_csv_path, file_name='docs-ground-truth.csv')

    forecast_model = ForecastModel.objects.create(project=project, name='docs forecast model',
                                                  abbreviation='docs_mod')
    time_zero = project.timezeros.filter(timezero_date=datetime.date(2011, 10, 2)).first()
    forecast = Forecast.objects.create(forecast_model=forecast_model, source='docs-predictions.json',
                                       time_zero=time_zero, notes="a small prediction file")
    with open('forecast_app/tests/predictions/docs-predictions.json') as json_fp:
        load_predictions_from_json_io_dict(forecast, json.load(json_fp), is_validate_cats=False)  # atomic
        cache_forecast_metadata(forecast)  # atomic

    return project, time_zero, forecast_model, forecast
Example #2
0
    def test_load_truth_data_versions(self):
        """
        Loading a 100%-duplicate truth file must be rejected; loading a non-duplicate file adds a
        second batch of oracle forecasts and truth rows.
        """
        _, _, po_user, _, _, _, _, _ = get_or_create_super_po_mo_users(is_create_super=True)
        # _make_docs_project() loads the first batch: docs-ground-truth.csv
        project, time_zero, forecast_model, forecast = _make_docs_project(po_user)

        oracle = oracle_model_for_project(project)
        # one oracle forecast per timezero: 2011-10-02, 2011-10-09, 2011-10-16
        self.assertEqual(3, oracle.forecasts.count())
        self.assertEqual(14, truth_data_qs(project).count())
        self.assertTrue(is_truth_data_loaded(project))

        # re-loading the identical file is rejected
        with self.assertRaisesRegex(RuntimeError, 'cannot load 100% duplicate data'):
            load_truth_data(project,
                            Path('forecast_app/tests/truth_data/docs-ground-truth.csv'),
                            file_name='docs-ground-truth.csv')

        # a non-duplicate file loads cleanly, doubling both forecasts and truth rows
        load_truth_data(project,
                        Path('forecast_app/tests/truth_data/docs-ground-truth-non-dup.csv'),
                        file_name='docs-ground-truth-non-dup.csv')
        self.assertEqual(3 * 2, oracle.forecasts.count())
        self.assertEqual(14 * 2, truth_data_qs(project).count())
Example #3
0
    def test_load_truth_data(self):
        """
        Tests load_truth_data() basics: loaded row counts, skipping of csv rows that reference
        non-existent timezeros/units/targets (the resulting subsets are allowed), and
        get_truth_data_preview() output.
        """
        # truths-ok.csv loads five rows into self.project
        load_truth_data(self.project,
                        Path('forecast_app/tests/truth_data/truths-ok.csv'),
                        is_convert_na_none=True)
        self.assertEqual(5, truth_data_qs(self.project).count())
        self.assertTrue(is_truth_data_loaded(self.project))

        # csv references non-existent TimeZero in Project: the bad timezero 2017-01-02 is skipped by
        # _read_truth_data_rows(), but the remaining data that's loaded (the three 2017-01-01 rows) is therefore a
        # subset. this raised 'new data is a subset of previous' prior to this issue:
        # [support truth "diff" uploads #319](https://github.com/reichlab/forecast-repository/issues/319), but now
        # subsets are allowed.
        load_truth_data(
            self.project,
            Path('forecast_app/tests/truth_data/truths-bad-timezero.csv'),
            'truths-bad-timezero.csv',
            is_convert_na_none=True)

        # csv references non-existent unit in Project: the bad unit is skipped, again resulting in a subset. again,
        # subsets are now allowed
        load_truth_data(
            self.project,
            Path('forecast_app/tests/truth_data/truths-bad-location.csv'),
            'truths-bad-location.csv',
            is_convert_na_none=True)

        # csv references non-existent target in Project: the bad target is skipped. subset is allowed
        load_truth_data(
            self.project,
            Path('forecast_app/tests/truth_data/truths-bad-target.csv'),
            'truths-bad-target.csv',
            is_convert_na_none=True)

        # a fresh project with CDC units/targets starts empty...
        project2 = Project.objects.create()
        make_cdc_units_and_targets(project2)
        self.assertEqual(0, truth_data_qs(project2).count())
        self.assertFalse(is_truth_data_loaded(project2))

        # ...and once its matching TimeZero exists, all five truths-ok.csv rows load
        TimeZero.objects.create(project=project2,
                                timezero_date=datetime.date(2017, 1, 1))
        load_truth_data(project2,
                        Path('forecast_app/tests/truth_data/truths-ok.csv'),
                        is_convert_na_none=True)
        self.assertEqual(5, truth_data_qs(project2).count())

        # test get_truth_data_preview()
        exp_truth_preview = [
            (datetime.date(2017, 1, 1), 'US National', '1 wk ahead', 0.73102),
            (datetime.date(2017, 1, 1), 'US National', '2 wk ahead', 0.688338),
            (datetime.date(2017, 1, 1), 'US National', '3 wk ahead', 0.732049),
            (datetime.date(2017, 1, 1), 'US National', '4 wk ahead', 0.911641),
            (datetime.date(2017, 1,
                           1), 'US National', 'Season onset', '2017-11-20')
        ]
        self.assertEqual(sorted(exp_truth_preview),
                         sorted(get_truth_data_preview(project2)))
Example #4
0
    def test_truth_batches(self):
        """
        Tests the truth "batch" helpers: truth_batches(), truth_batch_forecasts(),
        truth_batch_summary_table(), and truth_delete_batch() - including that deleting a batch
        with newer versions is rejected.
        """
        _, _, po_user, _, _, _, _, _ = get_or_create_super_po_mo_users(
            is_create_super=True)
        project, time_zero, forecast_model, forecast = _make_docs_project(
            po_user)  # loads batch: docs-ground-truth.csv

        # add a second batch
        load_truth_data(
            project,
            Path(
                'forecast_app/tests/truth_data/docs-ground-truth-non-dup.csv'),
            file_name='docs-ground-truth-non-dup.csv')
        oracle_model = oracle_model_for_project(project)
        first_forecast = oracle_model.forecasts.first()
        last_forecast = oracle_model.forecasts.last()

        # test truth_batches() and truth_batch_forecasts() for each batch.
        # each batch is identified by a (source, issued_at) pair
        batches = truth_batches(project)
        self.assertEqual(2, len(batches))
        self.assertEqual(first_forecast.source, batches[0][0])
        self.assertEqual(first_forecast.issued_at, batches[0][1])
        self.assertEqual(last_forecast.source, batches[1][0])
        self.assertEqual(last_forecast.issued_at, batches[1][1])

        # each batch holds one forecast per timezero (3).
        # NB: the inner loop variable shadows the outer `forecast` from _make_docs_project()
        for source, issued_at in batches:
            forecasts = truth_batch_forecasts(project, source, issued_at)
            self.assertEqual(3, len(forecasts))
            for forecast in forecasts:
                self.assertEqual(source, forecast.source)
                self.assertEqual(issued_at, forecast.issued_at)

        # test truth_batch_summary_table(). NB: utctimetuple() makes sqlite comparisons work
        exp_table = [(source, issued_at.utctimetuple(),
                      len(truth_batch_forecasts(project, source, issued_at)))
                     for source, issued_at in batches]
        act_table = [(source, issued_at.utctimetuple(), num_forecasts)
                     for source, issued_at, num_forecasts in
                     truth_batch_summary_table(project)]
        self.assertEqual(exp_table, act_table)

        # finally, test deleting a batch. try deleting the first, which should fail due to version rules.
        # transaction.atomic() somehow avoids the second `truth_delete_batch()` call getting the error:
        # django.db.transaction.TransactionManagementError: An error occurred in the current transaction. You can't execute queries until the end of the 'atomic' block.
        with transaction.atomic():
            with self.assertRaisesRegex(
                    RuntimeError,
                    'you cannot delete a forecast that has any newer versions'
            ):
                truth_delete_batch(project, batches[0][0], batches[0][1])

        # delete second batch - should not fail
        truth_delete_batch(project, batches[1][0], batches[1][1])
        batches = truth_batches(project)
        self.assertEqual(1, len(batches))
        self.assertEqual(first_forecast.source, batches[0][0])
        self.assertEqual(first_forecast.issued_at, batches[0][1])
Example #5
0
    def test_query_truth_for_project_null_rows(self):
        """
        Tests query_truth_for_project() against truth data containing null values.

        NOTE(review): `exp_rows = [-1]` below is a placeholder (see the 'todo xx'), so this test
        cannot pass as written - the actual expected query rows still need to be filled in.
        """
        _, _, po_user, _, _, _, _, _ = get_or_create_super_po_mo_users(
            is_create_super=True)
        project = create_project_from_json(
            Path('forecast_app/tests/projects/docs-project.json'), po_user)
        load_truth_data(
            project,
            Path(
                'forecast_app/tests/truth_data/docs-ground-truth-null-value.csv'
            ),
            is_convert_na_none=True)

        exp_rows = [-1]  # todo xx: placeholder - replace with the actual expected rows
        act_rows = list(query_truth_for_project(project, {}))
        self.assertEqual(sorted(exp_rows), sorted(act_rows))
Example #6
0
 def test_load_truth_data_null_rows(self):
     """
     Tests that loading docs-ground-truth-null-value.csv produces the expected 14 truth rows,
     including rows where every value column is None, spread across the value_i/value_f/value_t/
     value_d/value_b columns by target type.
     """
     _, _, po_user, _, _, _, _, _ = get_or_create_super_po_mo_users(
         is_create_super=True)
     project = create_project_from_json(
         Path('forecast_app/tests/projects/docs-project.json'), po_user)
     load_truth_data(
         project,
         Path(
             'forecast_app/tests/truth_data/docs-ground-truth-null-value.csv'
         ),
         is_convert_na_none=True)
     # expected tuples: (timezero_date, unit, target, value_i, value_f, value_t, value_d, value_b)
     exp_rows = [
         (datetime.date(2011, 10, 2), 'location1', 'Season peak week', None,
          None, None, datetime.date(2019, 12, 15), None),
         (datetime.date(2011, 10, 2), 'location1', 'above baseline', None,
          None, None, None, True),
         (datetime.date(2011, 10, 2), 'location1', 'season severity', None,
          None, 'moderate', None, None),
         (datetime.date(2011, 10, 2), 'location1', 'cases next week', None,
          None, None, None, None),  # all None
         (datetime.date(2011, 10, 2), 'location1', 'pct next week', None,
          None, None, None, None),  # all None
         (datetime.date(2011, 10, 9), 'location2', 'Season peak week', None,
          None, None, datetime.date(2019, 12, 29), None),
         (datetime.date(2011, 10, 9), 'location2', 'above baseline', None,
          None, None, None, True),
         (datetime.date(2011, 10, 9), 'location2', 'season severity', None,
          None, 'severe', None, None),
         (datetime.date(2011, 10, 9), 'location2', 'cases next week', 3,
          None, None, None, None),
         (datetime.date(2011, 10, 9), 'location2', 'pct next week', None,
          99.9, None, None, None),
         (datetime.date(2011, 10, 16), 'location1', 'Season peak week',
          None, None, None, datetime.date(2019, 12, 22), None),
         (datetime.date(2011, 10, 16), 'location1', 'above baseline', None,
          None, None, None, False),
         (datetime.date(2011, 10, 16), 'location1', 'cases next week', 0,
          None, None, None, None),
         (datetime.date(2011, 10, 16), 'location1', 'pct next week', None,
          0.0, None, None, None)
     ]
     act_rows = truth_data_qs(project) \
         .values_list('pred_ele__forecast__time_zero__timezero_date',
                      'pred_ele__unit__name', 'pred_ele__target__name',
                      'value_i', 'value_f', 'value_t', 'value_d', 'value_b')
     self.assertEqual(sorted(exp_rows), sorted(act_rows))
def fill_cdc_project(project, mo_user, is_public):
    """
    Populate `project` with CDC units/targets, two TimeZeros, ground truth, and two
    ForecastModels owned by `mo_user` - the first with one loaded forecast, the second empty.
    `is_public` only affects the model names.
    """
    project.description = "description"
    project.home_url = "http://example.com/"
    project.core_data = "http://example.com/"

    # make the Units and Targets via cdc-project.json (recall it has no timezeros)
    make_cdc_units_and_targets(project)

    # make two TimeZeros - one for ground truth, and one for the forecast's data:
    # EW1-KoTsarima-2017-01-17-small.csv -> pymmwr.date_to_mmwr_week(datetime.date(2017, 1, 17))  # EW01 2017
    #   -> {'year': 2017, 'week': 3, 'day': 3}
    time_zero1 = TimeZero.objects.create(project=project, timezero_date=datetime.date(2017, 1, 17),
                                         data_version_date=None)
    TimeZero.objects.create(project=project, timezero_date=datetime.date(2017, 1, 24),
                            data_version_date=None)

    # load ground truth
    load_truth_data(project, Path('forecast_app/tests/truth_data/2017-01-17-truths.csv'), is_convert_na_none=True,
                    file_name='2017-01-17-truths.csv')

    visibility = "public" if is_public else "private"

    # first model gets one small forecast loaded into it
    click.echo("creating ForecastModel")
    forecast_model1 = ForecastModel.objects.create(project=project,
                                                   name=f'Test ForecastModel1 ({visibility})',
                                                   abbreviation='model1_abbrev',
                                                   team_name='ForecastModel1 team',
                                                   description="a ForecastModel for testing",
                                                   home_url='http://example.com',
                                                   owner=mo_user)

    csv_file_path = Path('forecast_app/tests/EW1-KoTsarima-2017-01-17-small.csv')  # EW01 2017
    click.echo("* loading forecast into forecast_model={}, csv_file_path={}".format(forecast_model1, csv_file_path))
    start_time = timeit.default_timer()
    forecast1 = load_cdc_csv_forecast_file(2016, forecast_model1, csv_file_path, time_zero1)
    click.echo("  loaded forecast={}. {}".format(forecast1, timeit.default_timer() - start_time))

    # second model has no forecasts and keeps the default ('') team_name
    ForecastModel.objects.create(project=project,
                                 name=f'Test ForecastModel2 ({visibility})',
                                 abbreviation='model2_abbrev',
                                 description="a second ForecastModel for testing",
                                 home_url='http://example.com',
                                 owner=mo_user)
Example #8
0
    def test_load_truth_data_partial_dup(self):
        """
        Tests that a truth file that only partially duplicates an existing batch loads cleanly as
        a second batch (only 100% duplicates are rejected - see test_load_truth_data_versions).
        """
        _, _, po_user, _, _, _, _, _ = get_or_create_super_po_mo_users(
            is_create_super=True)
        project, time_zero, forecast_model, forecast = _make_docs_project(
            po_user)  # loads batch: docs-ground-truth.csv

        # NB: no try/except wrapper here: an unexpected exception already fails the test with the
        # full traceback, which is more informative than catching it and calling self.fail()
        load_truth_data(
            project,
            Path(
                'forecast_app/tests/truth_data/docs-ground-truth-partial-dup.csv'
            ),
            file_name='docs-ground-truth-partial-dup.csv')
        batches = truth_batches(project)
        self.assertEqual(2, len(batches))
Example #9
0
    def test_load_truth_data_dups(self):
        """
        Tests loading the same truth file twice.

        NOTE(review): both `-1` expected counts below are placeholders and need real values.
        docs-ground-truth-null-value.csv appears to load 14 rows elsewhere in this suite (see
        test_load_truth_data_null_rows) - confirm. Also confirm whether the second, 100%-duplicate
        load is expected to raise RuntimeError (as in test_load_truth_data_versions) rather than
        succeed; if it raises, the final assertion is unreachable as written.
        """
        _, _, po_user, _, _, _, _, _ = get_or_create_super_po_mo_users(
            is_create_super=True)
        project = create_project_from_json(
            Path('forecast_app/tests/projects/docs-project.json'), po_user)
        load_truth_data(
            project,
            Path(
                'forecast_app/tests/truth_data/docs-ground-truth-null-value.csv'
            ),
            is_convert_na_none=True)
        self.assertEqual(-1, truth_data_qs(project).count())  # NOTE(review): -1 is a placeholder

        load_truth_data(
            project,
            Path(
                'forecast_app/tests/truth_data/docs-ground-truth-null-value.csv'
            ),
            is_convert_na_none=True)
        self.assertEqual(-1, truth_data_qs(project).count())  # NOTE(review): -1 is a placeholder
Example #10
0
    def test_load_truth_data_diff(self):
        """
        Verify that loading a truth "diff" (a file updating only a subset of rows) is allowed,
        i.e., that this forecast version rule is relaxed for truth (issue
        [support truth "diff" uploads #319](https://github.com/reichlab/forecast-repository/issues/319) ):
            3. New forecast versions cannot imply any retracted prediction elements in existing
            versions, i.e., you cannot load data that's a subset of the previous forecast's data.
        """
        _, _, po_user, _, _, _, _, _ = get_or_create_super_po_mo_users(is_create_super=True)
        # _make_docs_project() loads docs-ground-truth.csv
        project, time_zero, forecast_model, forecast = _make_docs_project(po_user)

        oracle = oracle_model_for_project(project)
        # one oracle forecast per timezero: 2011-10-02, 2011-10-09, 2011-10-16
        self.assertEqual(3, oracle.forecasts.count())
        self.assertEqual(14, truth_data_qs(project).count())

        # the diff updates only the five location2 rows -> one new oracle forecast, five new rows
        load_truth_data(project,
                        Path('forecast_app/tests/truth_data/docs-ground-truth-diff.csv'),
                        file_name='docs-ground-truth-diff.csv')
        self.assertEqual(3 + 1, oracle.forecasts.count())
        self.assertEqual(14 + 5, truth_data_qs(project).count())
def make_thai_moph_project_app(data_dir, truths_csv_file):
    """
    Deletes and creates a database with one project, one group, and two classes of users. Hard-coded for 2017-2018
    season. Then loads models from the Impetus project. Note: The input files to this program are the output from a
    spamd export script located the dengue-data repo ( https://github.com/reichlab/dengue-data/blob/master/misc/cdc-csv-export.R )
    and are committed to https://epimodeling.springloops.io/project/156725/svn/source/browse/-/trunk%2Farchives%2Fdengue-reports%2Fdata-summaries
    They currently must be processed (currently by hand) via these rough steps:

        1. download template
        2. correct template header from 'bin_end_not_incl' to 'bin_end_notincl'
        3. delete files where first date (data_version_date) was before 0525
        4. for files with duplicate second dates (timezeros), keep the one with the most recent first date (data_version_date)
    """
    start_time = timeit.default_timer()
    data_dir = Path(data_dir)
    click.echo(f"* make_thai_moph_project_app(): data_dir={data_dir}, truths_csv_file={truths_csv_file}")

    # replace any existing project with the same name
    project = Project.objects.filter(name=THAI_PROJECT_NAME).first()
    if project:
        click.echo("* Deleting existing project: {}".format(project))
        delete_project_iteratively(project)

    # create the Project (and Users if necessary), including loading the template and creating Targets
    po_user, _, mo_user, _, _, _ = get_or_create_super_po_mo_users(is_create_super=False)

    # !is_validate to bypass Impetus non-uniform bins: [0, 1), [1, 10), [10, 20), ..., [1990, 2000):
    # NOTE(review): no is_validate argument is actually passed here - confirm whether the comment
    # above is stale or a flag is missing
    project = create_project_from_json(Path('forecast_app/tests/projects/thai-project.json'), po_user)
    project.model_owners.add(mo_user)
    project.save()
    click.echo("* Created project: {}".format(project))

    # make the model
    forecast_model = make_model(project, mo_user)
    click.echo("* created model: {}".format(forecast_model))

    # create TimeZeros. NB: we skip existing TimeZeros in case we are loading new forecasts. for is_season_start and
    # season_name we use year transitions: the first 2017 we encounter -> start of that year, etc.
    seen_years = []  # indicates a year has been processed. used to determine season starts
    for cdc_csv_file, timezero_date, _, data_version_date in cdc_csv_components_from_data_dir(data_dir):
        timezero_year = timezero_date.year
        is_season_start = timezero_year not in seen_years
        if is_season_start:
            seen_years.append(timezero_year)

        found_time_zero = project.time_zero_for_timezero_date(timezero_date)
        if found_time_zero:
            click.echo(f"s (TimeZero exists)\t{cdc_csv_file}\t")  # 's' from load_cdc_csv_forecasts_from_dir()
            continue

        # NOTE(review): dates are passed as str here, relying on Django coercing them for
        # DateField; elsewhere (e.g. fill_cdc_project) date objects are passed - confirm intended
        TimeZero.objects.create(project=project,
                                timezero_date=str(timezero_date),
                                data_version_date=str(data_version_date) if data_version_date else None,
                                is_season_start=(True if is_season_start else False),
                                season_name=(str(timezero_year) if is_season_start else None))
    click.echo("- created TimeZeros: {}".format(project.timezeros.all()))

    # load the truth
    # NOTE(review): the truth is loaded from a hard-coded path, not from the `truths_csv_file`
    # parameter - confirm which is intended
    click.echo("- loading truth values")
    load_truth_data(project, Path('utils/dengue-truth-table-script/truths.csv'), is_convert_na_none=True)

    # load data
    click.echo("* Loading forecasts")
    forecast_model = project.models.first()
    forecasts = load_cdc_csv_forecasts_from_dir(forecast_model, data_dir, None)  # season_start_year
    click.echo("- Loading forecasts: loaded {} forecast(s)".format(len(forecasts)))

    # done
    click.echo(f"* Done. time: {timeit.default_timer() - start_time}")
Example #12
0
    def test_load_truth_data_other_files(self):
        """
        Tests loading truth files that used to be in yyyymmdd or yyyyww (EW) formats, verifying
        the oracle model's loaded (timezero_date, unit, target, value) rows for each.
        """

        def oracle_value_rows(project):
            # return (timezero_date, unit_name, target_name, value) tuples for `project`'s oracle data.
            # note: https://code.djangoproject.com/ticket/32483 sqlite3 json query bug -> we manually
            # access the 'data' field instead of using 'data__value'
            pred_data_qs = PredictionData.objects \
                .filter(pred_ele__forecast__forecast_model=oracle_model_for_project(project)) \
                .values_list('pred_ele__forecast__time_zero__timezero_date', 'pred_ele__unit__name',
                             'pred_ele__target__name', 'data')
            return [(tz_date, unit_name, target_name, data['value'])
                    for tz_date, unit_name, target_name, data in pred_data_qs]

        # truths-ok.csv (2017-01-17-truths.csv would basically test the same)
        load_truth_data(self.project,
                        Path('forecast_app/tests/truth_data/truths-ok.csv'),
                        is_convert_na_none=True)
        exp_rows = [
            (datetime.date(2017, 1, 1), 'US National', '1 wk ahead', 0.73102),
            (datetime.date(2017, 1, 1), 'US National', '2 wk ahead', 0.688338),
            (datetime.date(2017, 1, 1), 'US National', '3 wk ahead', 0.732049),
            (datetime.date(2017, 1, 1), 'US National', '4 wk ahead', 0.911641),
            (datetime.date(2017, 1, 1), 'US National', 'Season onset', '2017-11-20'),
        ]
        self.assertEqual(sorted(exp_rows), sorted(oracle_value_rows(self.project)))

        # truths-2016-2017-reichlab-small.csv, loaded into a fresh project with one TimeZero
        project2 = Project.objects.create()
        TimeZero.objects.create(project=project2,
                                timezero_date=datetime.date(2016, 10, 30))
        make_cdc_units_and_targets(project2)
        load_truth_data(
            project2,
            Path(
                'forecast_app/tests/truth_data/truths-2016-2017-reichlab-small.csv'
            ),
            is_convert_na_none=True)
        exp_rows = [
            (datetime.date(2016, 10, 30), 'US National', '1 wk ahead', 1.55838),
            (datetime.date(2016, 10, 30), 'US National', '2 wk ahead', 1.64639),
            (datetime.date(2016, 10, 30), 'US National', '3 wk ahead', 1.91196),
            (datetime.date(2016, 10, 30), 'US National', '4 wk ahead', 1.81129),
            (datetime.date(2016, 10, 30), 'US National', 'Season onset', '2016-12-11'),
            (datetime.date(2016, 10, 30), 'US National', 'Season peak percentage', 5.06094),
            (datetime.date(2016, 10, 30), 'US National', 'Season peak week', '2017-02-05'),
        ]
        self.assertEqual(sorted(exp_rows), sorted(oracle_value_rows(project2)))