def test_init_and_end_datetime(self, are_dateparser_options_specified):
        # Select rows using both init and end time.
        param_config = {
            "selection_parameters": {
                "init_datetime": "2000-01-02",
                "end_datetime": "2000-01-04"
            },
        }

        if are_dateparser_options_specified:
            param_config["input_parameters"] = {}
            param_config["input_parameters"]["dateparser_options"] = {
                "date_formats": ["%Y-%m-%dT%H:%M:%S"]
            }

        df = get_fake_df(length=5)
        selected_df = select_timeseries_portion(df, param_config)

        assert selected_df.index.values[0] == Timestamp("2000-01-02")
        assert selected_df.index.values[1] == Timestamp("2000-01-03")
        assert selected_df.index.values[2] == Timestamp("2000-01-04")

        assert df.iloc[1]["value"] == selected_df.iloc[0]["value"]
        assert df.iloc[2]["value"] == selected_df.iloc[1]["value"]
        assert df.iloc[3]["value"] == selected_df.iloc[2]["value"]

        assert len(selected_df) == 3

    def test_selection_not_requested(self):
        # The df should be returned untouched if no selection is requested.
        param_config = {
            "input_parameters": {
                "source_data_url": os.path.join("test_datasets", "test_6.csv"),
            }
        }

        df = ingest_timeseries(param_config)
        selected_df = select_timeseries_portion(df, param_config)
        assert df.equals(selected_df)

    def test_end_datetime(self):
        # Select rows using end datetime.
        param_config = {
            "selection_parameters": {
                "init_datetime": "1999-01-02",
                "end_datetime": "2000-01-02"
            },
        }

        df = get_fake_df(length=3)
        selected_df = select_timeseries_portion(df, param_config)

        assert selected_df.index.values[0] == Timestamp("2000-01-01")
        assert selected_df.index.values[1] == Timestamp("2000-01-02")

        assert df.iloc[0]["value"] == selected_df.iloc[0]["value"]
        assert df.iloc[1]["value"] == selected_df.iloc[1]["value"]
        assert len(selected_df) == 2

    def test_select_on_values(self):
        # Select rows based on value.
        param_config = {
            "input_parameters": {
                "source_data_url": os.path.join("test_datasets", "test_1.csv"),
                "columns_to_load_from_url": "first_column,third_column",
                "datetime_column_name": "first_column",
                "index_column_name": "first_column",
                "frequency": "D"
            },
            "selection_parameters": {
                "column_name_selection": "third_column",
                "value_selection": 3,
            },
        }

        df = ingest_timeseries(param_config)
        df = select_timeseries_portion(df, param_config)

        assert df.index.values[0] == Timestamp("2020-02-25")

        assert df.iloc[0]["third_column"] == 3
        assert len(df) == 1
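
# get_fake_df is a helper defined elsewhere in the test module. Judging from the
# assertions above (a daily DatetimeIndex starting at 2000-01-01 and a single
# "value" column), a minimal sketch of such a helper could look like this; the
# actual implementation may differ.
import numpy
import pandas


def get_fake_df(length: int) -> pandas.DataFrame:
    # Daily index starting at 2000-01-01, as implied by the expected Timestamps.
    dates = pandas.date_range(start="2000-01-01", periods=length, freq="D")
    # Arbitrary values: the tests only compare rows of df against selected_df.
    return pandas.DataFrame({"value": numpy.arange(length, dtype=float)}, index=dates)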
Example #5
def compute():

    param_file_nameJSON = 'configurations/configuration_test_covid19italy.json'

    # Load parameters from the JSON configuration file.
    with open(param_file_nameJSON) as json_file:
        param_config = json.load(json_file)

    # Logging
    log_level = getattr(logging, param_config["verbose"], None)
    if not isinstance(log_level, int):
        log_level = 0
    # Add %(name)s to the format string to also log the module name.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        level=log_level,
                        stream=sys.stdout)

    # Data ingestion
    log.info("Started data ingestion.")
    ingested_data = timexseries.data_ingestion.ingest_timeseries(param_config)

    # Data selection
    log.info("Started data selection.")
    ingested_data = select_timeseries_portion(ingested_data, param_config)

    # Custom columns
    log.info(f"Adding custom columns.")
    ingested_data["New cases/tests ratio"] = [
        100 * (np / tamp) for np, tamp in zip(ingested_data['Daily cases'],
                                              ingested_data['Daily tests'])
    ]

    # Data prediction
    containers = create_timeseries_containers(ingested_data=ingested_data,
                                              param_config=param_config)

    ####################################################################################################################
    # Custom time-series #########
    # If you are studying TIMEX code: you can ignore this.
    log.info(f"Computing the custom time-series.")

    regions = read_csv(
        "https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv",
        header=0,
        index_col=0,
        usecols=['data', 'denominazione_regione', 'nuovi_positivi', 'tamponi'])
    regions.reset_index(inplace=True)
    regions['data'] = regions['data'].apply(lambda x: dateparser.parse(x))
    regions.set_index(['data', 'denominazione_regione'],
                      inplace=True,
                      drop=True)
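    # 'regions' is now indexed by a (data, denominazione_regione) MultiIndex.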

    regions = add_diff_columns(regions, ['tamponi'],
                               group_by='denominazione_regione')
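    # 'tamponi' (tests) is cumulative in the source data; add_diff_columns adds a
    # 'tamponi_diff' column with the day-over-day difference for each region.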

    regions.rename(columns={
        'nuovi_positivi': 'Daily cases',
        'tamponi': 'Tests',
        "tamponi_diff": "Daily tests"
    },
                   inplace=True)

    regions["New cases/tests ratio"] = [
        100 * (ndc / tamp) if tamp > ndc > 0 else "nan"
        for ndc, tamp in zip(regions['Daily cases'], regions['Daily tests'])
    ]

    # Prediction of "New daily cases" for every region
    # We also want to plot cross-correlation with other regions.
    # So, create a dataFrame with only daily cases and regions as columns.
    regions_names = regions.index.get_level_values(1).unique()
    regions_names = regions_names.sort_values()

    dates = regions.index.get_level_values(0).unique().to_list()
    dates = dates[1:]  # Abruzzo is missing the first day.

    cols = regions_names.to_list()
    cols = ['data'] + cols

    daily_cases_regions = DataFrame(columns=cols, dtype=numpy.float64)
    daily_cases_regions['data'] = dates

    daily_cases_regions.set_index(['data'], inplace=True, drop=True)

    # Pivot the MultiIndexed 'regions' frame into a wide date x region table of daily cases.
    for col in daily_cases_regions.columns:
        for i in daily_cases_regions.index:
            # Use a single .loc with (row, column) to avoid chained-assignment issues.
            daily_cases_regions.loc[i, col] = regions.loc[(i, col), 'Daily cases']

    daily_cases_regions = add_freq(daily_cases_regions, 'D')
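    # Give the DatetimeIndex an explicit daily ('D') frequency.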

    max_lags = param_config['xcorr_parameters']['xcorr_max_lags']
    modes = param_config['xcorr_parameters']["xcorr_mode"].split(",")
    try:
        max_threads = param_config['max_threads']
    except KeyError:
        try:
            # os.sched_getaffinity is not available on every platform.
            max_threads = len(os.sched_getaffinity(0))
        except AttributeError:
            max_threads = 1

    for region in daily_cases_regions.columns:
        timeseries_data = daily_cases_regions[[region]]

        model_results = {}

        xcorr = calc_xcorr(region, daily_cases_regions, max_lags, modes)
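        # xcorr holds the cross-correlation of this region's daily cases with all the other regions.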

        log.info(f"Computing univariate prediction for {region}...")
        predictor = FBProphetModel(param_config, transformation="none")
        prophet_result = predictor.launch_model(timeseries_data.copy(),
                                                max_threads=max_threads)
        model_results['fbprophet'] = prophet_result
        # Other predictors could be plugged in the same way, e.g.:
        # predictor = ARIMA(param_config)
        # arima_result = predictor.launch_model(timeseries_data.copy())
        # model_results['arima'] = arima_result

        s = TimeSeriesContainer(timeseries_data, model_results, xcorr)
        containers.append(s)

        # children_for_each_scenario.append({
        #     'name': region,
        #     'children': create_scenario_children(s, param_config)
        # })

    ####################################################################################################################

    # Save the computed data; these are the TimeSeriesContainer objects from which a nice Dash page can be built.
    # They can be loaded by "app_load_from_dump.py" to start the app
    # without re-computing all the data.
    with open(f"containers.pkl", 'wb') as input_file:
        pickle.dump(containers, input_file)
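

# The comment above mentions "app_load_from_dump.py"; a minimal sketch of how
# the dumped containers could be read back (the helper name and default path
# are assumptions, not taken from that script):
def load_containers(path: str = "containers.pkl"):
    # Return the list of TimeSeriesContainer objects saved by compute().
    with open(path, 'rb') as dump_file:
        return pickle.load(dump_file)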