Example #1
    def transform(self, X: dt.Frame):
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        try:
            X = dt.Frame(X)
            original_zip_column_name = X.names[0]
            X.names = ['zip_key']
            X = X[:, str('zip_key')]
            zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0]
            zip_features = [self.get_zipcode_features(x) for x in zip_list]
            X_g = dt.Frame({"zip_key": zip_list})
            X_g.cbind(dt.Frame(zip_features))
            X_g.key = 'zip_key'
            X_result = X[:, :, dt.join(X_g)]
            self._output_feature_names = [
                "{}.{}".format(original_zip_column_name, f)
                for f in list(X_result[:, 1:].names)
            ]
            self._feature_desc = [
                "Property '{}' of US zipcode found in '{}'".format(
                    f, original_zip_column_name)
                for f in list(X_result[:, 1:].names)
            ]
            return X_result[:, 1:]
        except Exception as ex:
            loggerwarning(
                logger, "USZipcodeDatabaseTransformer got exception {}".format(
                    type(ex).__name__))
            return np.zeros(X.shape[0])
Example #2
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        forecast_len = 7

        # get COVID19 new cases data from Our World in Data github
        X = dt.fread(
            "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
        )

        # remove country aggregates like 'World' and 'International'
        X = X[~(dt.f.iso_code == '') & ~(dt.f.continent == ''), :]

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(X[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        train = X[dt.f[date_col] <= split_date, :]
        test = X[dt.f[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {
            f"covid19_daily_{split_date}_by_countries_train": train,
            f"covid19_daily_{test_date}_by_countries_test": test
        }
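The date-threshold split used here can be exercised on a toy frame: sort the unique dates, take the value (forecast_len + 1) positions from the end as the last training day, and everything later becomes the test window. A minimal sketch with invented dates; the comparison goes through pandas as in the NYTimes recipes further down, since the toy date column is a plain string column:

import datatable as dt

forecast_len = 2
X = dt.Frame(date=["2020-05-01", "2020-05-02", "2020-05-02",
                   "2020-05-03", "2020-05-04", "2020-05-05"])

dates = dt.unique(X[:, "date"])
# the (forecast_len + 1)-th date from the end is the last training day
split_date = dates[-(forecast_len + 1):, :, dt.sort("date")][0, 0]  # '2020-05-03'
test_date = dates[-1, :, dt.sort("date")][0, 0]                     # '2020-05-05'

df = X[:, "date"].to_pandas()
train = X[df["date"] <= split_date, :]  # rows up to and including the split date
test = X[df["date"] > split_date, :]    # the last forecast_len days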
Example #3
def test_10k_diabetes_xlsx():
    filename = find_file("h2o-3", "fread", "10k_diabetes.xlsx")
    DT = dt.fread(filename)
    assert DT.shape == (10000, 51)
    assert DT.names[:4] == ("race", "gender", "age", "weight")
    assert DT["readmitted"].stype == dt.bool8
    assert DT[:, "num_lab_procedures":"number_inpatient"].stype == dt.int32
    assert dt.unique(DT["gender"]).nrows == 2
Example #4
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # define date column and forecast horizon
        date_col = 'date'
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_total = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

        # produce lag of 1 unit and add as new feature for each column in the list
        series_cols = ["cases", "deaths"]
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_total[:, update(**aggs), sort(date_col)]

        # update NA lags to 0
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_total[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        # compute daily values by differencing
        aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
        us_total[:, update(**aggs), sort(date_col)]

        # delete columns with yesterday (shift) values
        series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
        del us_total[:, series_cols_to_delete]

        # set negative daily values to 0
        us_total[f.cases_daily < 0, [f.cases_daily]] = 0
        us_total[f.deaths_daily < 0, [f.deaths_daily]] = 0

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_total[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_total[date_col].to_pandas()
        train = us_total[df[date_col] <= split_date, :]
        test = us_total[df[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {f"covid19_daily_{split_date}_us_train": train,
                f"covid19_daily_{test_date}_us_test": test}
Example #5
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_states = dt.fread(
            "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
        )

        # get states population
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
        )
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop.key = "state"

        # augment data with state population figures and create adjusted case and death counts
        us_states[:,
                  dt.update(pop=dt.g.pop,
                            pop100k=dt.g.pop / 100000,
                            cases100k=dt.f.cases / (dt.g.pop / 100000),
                            deaths100k=dt.f.deaths / (dt.g.pop / 100000)),
                  dt.join(us_states_pop)]

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-forecast_len:, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] < split_date, :]
        test = us_states[df[date_col] >= split_date, :]

        # return [train, test] and rename dataset names as needed
        return {
            "covid19_daily_by_states_train": train,
            "covid10_daily_by_states_test": test
        }
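This recipe augments the case counts through a keyed join: us_states_pop is keyed by state, and inside dt.update the expressions read the left frame through dt.f and the joined lookup through dt.g. A compact sketch with invented states and populations:

import datatable as dt

pop = dt.Frame(state=["A", "B"], pop=[1000000, 250000])
pop.key = "state"          # keyed frames can be used as the right side of dt.join

cases = dt.Frame(state=["A", "A", "B"], cases=[120, 340, 55])
cases[:, dt.update(pop=dt.g.pop,
                   cases100k=dt.f.cases / (dt.g.pop / 100000)),
      dt.join(pop)]
# cases now has pop and cases-per-100k columns pulled through the join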
Example #6
 def transform(self, X: dt.Frame):
     try:
         X = dt.Frame(X)
         X.names = ['zip_key']
         X = X[:, str('zip_key')]
         zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0]
         zip_features = [
             self.get_zipcode_property(self.parse_zipcode(x))
             for x in zip_list
         ]
         X_g = dt.Frame({
             "zip_key": zip_list,
             self.get_property_name(): zip_features
         })
         X_g.key = 'zip_key'
         X_result = X[:, :, dt.join(X_g)]
         return X_result[:, 1:]
     except Exception:
         return np.zeros(X.shape[0])
Example #7
    def transform(self, X: dt.Frame):
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        X = dt.Frame(X)
        original_zip_column_name = X.names[0]
        X = X[:, dt.str64(dt.f[0])]
        X.names = ['zip_key']
        try:
            zip_list = dt.unique(X[~dt.isna(dt.f.zip_key),
                                   0]).to_list()[0] + ['79936']
            zip_features = [self.get_zipcode_features(x) for x in zip_list]
            X_g = dt.Frame({"zip_key": zip_list})
            X_g.cbind(dt.Frame(zip_features))
            X_g.key = 'zip_key'
            X_result = X[:, :, dt.join(X_g)]
            self._output_feature_names = [
                "{}:{}.{}".format(self.transformer_name,
                                  original_zip_column_name,
                                  self.replaceBannedCharacters(f))
                for f in list(X_result[:, 1:].names)
            ]
            self._feature_desc = [
                "Property '{}' of zipcode column ['{}'] from US zipcode database (recipe '{}')"
                .format(f, original_zip_column_name, self.transformer_name)
                for f in list(X_result[:, 1:].names)
            ]
            return X_result[:, 1:]
        except ValueError as ve:
            loggerinfo(
                logger, "Column '{}' is not a zipcode: {}".format(
                    original_zip_column_name, str(ve)))
            return self.get_zipcode_null_result(X, original_zip_column_name)
        except TypeError as te:
            loggerwarning(
                logger, "Column '{}' triggered TypeError: {}".format(
                    original_zip_column_name, str(te)))
            raise te
Example #8
 def transform(self, X: dt.Frame):
     X = dt.Frame(X)
     original_zip_column_name = X.names[0]
     X.names = ['zip_key']
     X = X[:, str('zip_key')]
     zip_list = dt.unique(X[~dt.isna(dt.f.zip_key),
                            0]).to_list()[0] + ['79936']
     zip_features = [self.get_zipcode_features(x) for x in zip_list]
     X_g = dt.Frame({"zip_key": zip_list})
     X_g.cbind(dt.Frame(zip_features))
     X_g.key = 'zip_key'
     X_result = X[:, :, dt.join(X_g)]
     self._output_feature_names = [
         "{}.{}".format(original_zip_column_name, f)
         for f in list(X_result[:, 1:].names)
     ]
     self._feature_desc = [
         "Property '{}' of US zipcode found in '{}'".format(
             f, original_zip_column_name)
         for f in list(X_result[:, 1:].names)
     ]
     return X_result[:, 1:]
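The transformers above all follow the same shape: collect the distinct non-missing zip codes, build one feature row per code, key that lookup frame, and join it back so every original row picks up its features (rows with missing or unknown zips come back as NA). A self-contained sketch with a fabricated feature column standing in for get_zipcode_features():

import datatable as dt

X = dt.Frame(zip_key=["02139", "10001", "02139", None])
zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0]

# one row per distinct zip; median_income is an invented stand-in feature
X_g = dt.Frame(zip_key=zip_list)
X_g.cbind(dt.Frame(median_income=[55000, 61000]))
X_g.key = "zip_key"

X_result = X[:, :, dt.join(X_g)]   # left join: the None row gets an NA feature
features_only = X_result[:, 1:]    # drop the key column, keep the feature columns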
Example #9
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # define date column and forecast horizon
        date_col = 'date'
        group_by_cols = ["state"]
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_states = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

        # get states population
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop.key = "state"

        # augment data with state population figures and create adjusted case and death counts
        series_cols = ["cases", "deaths"]
        aggs = {f"{col}100k": dt.f[col] / (dt.g.pop / 100000) for col in series_cols}
        us_states[:, dt.update(pop=g.pop, pop100k=g.pop / 100000, **aggs), join(us_states_pop)]

        # remove rows without a state defined (these are unmatched rows from the left outer join)
        del us_states[isna(f.pop), :]

        # produce lag of 1 unit and add as new feature for each column in the list
        series_cols.extend([col + "100k" for col in series_cols])
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # update NA lags to 0
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        # compute daily values by differencing
        aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # delete columns with yesterday (shift) values
        series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
        del us_states[:, series_cols_to_delete]

        # set negative daily values to 0
        us_states[f.cases_daily < 0, [f.cases_daily, f.cases100k_daily]] = 0
        us_states[f.deaths_daily < 0, [f.deaths_daily, f.deaths100k_daily]] = 0

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] <= split_date, :]
        test = us_states[df[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {f"covid19_daily_{split_date}_by_states_train": train,
                f"covid19_daily_{test_date}_by_states_test": test}
Example #10
# Split dataset by partition id (column): results in as many partitions (datasets)
# as there are values in the partition column
import datatable as dt

# maximum number of partitions allowed for the split
MAX_PARTITIONS = 10
# partition column name
partition_col_name = 'quality'

values = dt.unique(X[partition_col_name]).to_list()[0]
if len(values) > MAX_PARTITIONS:
    raise ValueError("Too many partitions to split")

result = {}
for val in values:
    partition = X[dt.f[partition_col_name] == val, :]
    result.update({"mydata_" + str(val): partition})

return result
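As written, the fragment above expects X to exist already and ends in a bare return, so it cannot run on its own; a self-contained sketch of the same split-by-value idea, wrapped in a function and applied to a toy frame (column name and values invented):

import datatable as dt

def split_by_column(X: dt.Frame, partition_col_name: str, max_partitions: int = 10) -> dict:
    values = dt.unique(X[:, partition_col_name]).to_list()[0]
    if len(values) > max_partitions:
        raise ValueError("Too many partitions to split")
    return {"mydata_" + str(val): X[dt.f[partition_col_name] == val, :] for val in values}

toy = dt.Frame(quality=[5, 6, 5, 7], alcohol=[9.4, 10.1, 9.8, 11.0])
parts = split_by_column(toy, "quality")   # keys: mydata_5, mydata_6, mydata_7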
Example #11
def analyze(fullTable, args):
    #fullTable = fullTable[dt.f.DatenstandTag > 382 - 20,:]
    print("Analyzing")
    pmu.printMemoryUsage("begin analyze")
    print("Keys:")
    print(fullTable.keys())
    firstDumpDay = cint(fullTable[:, "DatenstandTag"].min())
    lastDumpDay = cint(fullTable[:, "DatenstandTag"].max())
    print("firstDumpDay", firstDumpDay)
    print("lastDumpDay", lastDumpDay)

    #fromDay = lastDumpDay-27
    fromDay = firstDumpDay
    toDay = lastDumpDay + 1

    fullTable = fullTable[:, dt.f[:].extend(
        {"MeldeDelay": dt.f.DatenstandTag - dt.f.MeldeTag - 1})]
    fullTable = fullTable[:, dt.f[:].extend(
        {"RefDelay": dt.f.DatenstandTag - dt.f.RefTag - 1})]
    fullTable.materialize()

    Altersgruppen = []
    if args.agegroups:
        Altersgruppen = dt.unique(fullTable[:, "Altersgruppe"]).to_list()[0]

    print("Altersgruppen", Altersgruppen)

    Geschlechter = dt.unique(fullTable[:, "Geschlecht"]).to_list()[0]
    print("Geschlechter", Geschlechter)

    census = dt.fread("CensusByRKIAgeGroups.csv")
    censusDeutschland = census[dt.f.Name == "Deutschland", :]
    print(censusDeutschland)

    flaechen = loadFlaechen()
    #for id in range(1,16):
    #    censusBL = census[dt.f.Code == id, :]
    #    print(censusBL)

    print("Processing 'Deutschland'")
    pmu.printMemoryUsage("begin Deutschland")
    deutschland = analyzeDailyAltersgruppenGeschlechter(
        fullTable, filterByDay(fromDay, toDay), Altersgruppen, Geschlechter)
    deutschland = insertDates(deutschland)
    deutschland = insertRegionInfo(deutschland, 0, "Deutschland", "BR", 0,
                                   "Deutschland", flaechen[0])

    print(deutschland)
    pmu.printMemoryUsage("pre makeIncidenceColumns")

    deutschland = makeIncidenceColumns(deutschland, censusDeutschland,
                                       Altersgruppen, Geschlechter)
    print(deutschland)
    pmu.printMemoryUsage("pre save")
    pmu.saveCsvTable(deutschland, "series-{}-{}.csv".format(0, "Deutschland"),
                     args.outputDir)
    pmu.printMemoryUsage("post save")
    deutschland = None

    #exit(0)

    print("Processing Bundesländer")
    bundeslaender, bundeslaender_numbers = timeSeries(fullTable, fromDay,
                                                      toDay, dt.f.IdBundesland,
                                                      dt.f.Bundesland,
                                                      Altersgruppen,
                                                      Geschlechter)
    pmu.printMemoryUsage("post Bundesländer timeSeries")
    for i in range(bundeslaender.nrows):
        bl_name = bundeslaender[i, dt.f.Bundesland].to_list()[0][0]
        bl_id = bundeslaender[i, dt.f.IdBundesland].to_list()[0][0]

        if bl_id > 0:
            #bundeslaender_numbers[bl_id] = bundeslaender_numbers[bl_id][:, dt.f[:].extend(
            #    {"IdLandkreis": bl_id, "Landkreis": bl_name, "IdBundesland": bl_id, "Bundesland": bl_name, "Flaeche" : flaechen[bl_id]})]
            bundeslaender_numbers[bl_id] = insertDates(
                bundeslaender_numbers[bl_id])
            bundeslaender_numbers[bl_id] = insertRegionInfo(
                bundeslaender_numbers[bl_id], bl_id, bl_name, "BL", bl_id,
                bl_name, flaechen[0])
            censusBL = census[dt.f.IdLandkreis == bl_id, :]
            print(censusBL)
            bundeslaender_numbers[bl_id] = makeIncidenceColumns(
                bundeslaender_numbers[bl_id], censusBL, Altersgruppen,
                Geschlechter)
        pmu.printMemoryUsage("pre save {}".format(bl_name))

        pmu.saveCsvTable(bundeslaender_numbers[bl_id],
                         "series-{}-{}.csv".format(bl_id,
                                                   bl_name), args.outputDir)
    bundeslaender = None
    bundeslaender_numbers = None

    print("Processing Landkreise'")
    landKreise, landkreise_numbers = timeSeries(fullTable, fromDay, toDay,
                                                dt.f.IdLandkreis,
                                                dt.f.Landkreis, Altersgruppen,
                                                Geschlechter)
    pmu.printMemoryUsage("post Landkreise timeSeries")
    #print(landKreise)
    #print(landkreise_numbers)
    for i in range(landKreise.nrows):
        print(i)
        lk_name = landKreise[i, dt.f.Landkreis].to_list()[0][0]
        lk_id = landKreise[i, dt.f.IdLandkreis].to_list()[0][0]
        if lk_id > 0:
            censusLK = census[dt.f.IdLandkreis == lk_id, :]
            bl_name = censusLK[0, dt.f.Bundesland].to_list()[0][0]
            bl_id = censusLK[0, dt.f.IdBundesland].to_list()[0][0]
            lk_typ = landKreisTyp(lk_id, lk_name)

            landkreise_numbers[lk_id] = insertDates(landkreise_numbers[lk_id])
            landkreise_numbers[lk_id] = insertRegionInfo(
                landkreise_numbers[lk_id], lk_id, lk_name, lk_typ, bl_id,
                bl_name, flaechen[lk_id])
            #landkreise_numbers[lk_id] = landkreise_numbers[lk_id][:, dt.f[:].extend(
            #   {"IdLandkreis": lk_id, "Landkreis": lk_name, "IdBundesland": bl_id, "Bundesland": bl_name,
            #    "Flaeche": flaechen[lk_id]})]
            print(censusLK)
            landkreise_numbers[lk_id] = makeIncidenceColumns(
                landkreise_numbers[lk_id], censusLK, Altersgruppen,
                Geschlechter)
        pmu.printMemoryUsage("pre save {}".format(lk_name))
        pmu.saveCsvTable(landkreise_numbers[lk_id],
                         "series-{}-{}.csv".format(lk_id,
                                                   lk_name), args.outputDir)
    #print(landKreise)

    return fullTable
Example #12
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        group_by_cols = ["state"]
        forecast_len = 7

        # state codes lookup table
        us_state_codes = dt.Frame(
            code=[
                'AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
                'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY',
                'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE',
                'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK',
                'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT',
                'VI', 'VA', 'WA', 'WV', 'WI', 'WY'
            ],
            state=[
                'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
                'California', 'Colorado', 'Connecticut', 'Delaware',
                'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
                'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
                'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
                'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
                'New York', 'North Carolina', 'North Dakota',
                'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
                'Pennsylvania', 'Puerto Rico', 'Rhode Island',
                'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
                'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
                'West Virginia', 'Wisconsin', 'Wyoming'
            ])
        us_state_codes.key = "state"

        # get states population lookup table
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
        )
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop = us_states_pop[dt.f.STATE > 0, :]
        us_states_pop.key = "state"

        # join state codes and population into single lookup table
        us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
        us_states_pop.key = "code"

        # US Covid Tracking API: https://covidtracking.com/data/api
        us_states = dt.fread(
            "https://covidtracking.com/api/v1/states/daily.csv")
        # remove deprecated fields
        deprecated = [
            'checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
            'grade', 'hash', 'hospitalized', 'negativeIncrease',
            'negativeRegularScore', 'negativeScore', 'posNeg', 'positiveScore',
            'score', 'total'
        ]
        us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
        us_states.names = {'state': 'code'}

        series_cols = [
            "positive", "negative", "hospitalizedCumulative",
            "inIcuCumulative", "onVentilatorCumulative", "recovered", "death"
        ]
        aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
        us_states[:,
                  dt.update(
                      state=g.state, pop=g.pop, pop100k=g.pop / 100000, **aggs),
                  join(us_states_pop)]
        us_states = us_states[~dt.isna(dt.f.state), :]

        # produce lag of 1 unit and add as new feature for each shift column
        series_cols.extend([col + "100k" for col in series_cols])
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # update NA lags
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        aggs = {
            f"{col}_daily": f[col] - f[f"{col}_yesterday"]
            for col in series_cols
        }
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        for col in series_cols:
            del us_states[:, f[f"{col}_yesterday"]]

        # validate dataset
        if us_states[:, count(),
                     by(dt.f.state, f.date)][f.count > 1, :].shape[0] > 1:
            raise ValueError(
                "Found duplicate elements for the same date and state.")

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] <= split_date, :]
        test = us_states[df[date_col] > split_date, :]

        return {
            f"covidtracking_daily_{split_date}_by_us_states_train": train,
            f"covidtracking_daily_{test_date}_by_us_states_test": test
        }
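The validation step in this recipe relies on count() adding a column literally named count, which the filter then reads back through f.count. A minimal sketch that raises when any (state, date) pair occurs more than once (toy frame, hypothetical values):

import datatable as dt
from datatable import f, by, count

DT = dt.Frame(state=["A", "A", "B"], date=["d1", "d1", "d1"], positive=[1, 2, 3])
dup_groups = DT[:, count(), by(f.state, f.date)][f.count > 1, :]
if dup_groups.nrows > 0:
    raise ValueError("Found duplicate elements for the same date and state.")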
Example #13
def analyze(fullTable, args, oldTables):
    #fullTable = fullTable[dt.f.DatenstandTag <= 387,:]

    print("Analyzing")
    pmu.printMemoryUsage("begin analyze")
    print("Keys:")
    print(fullTable.keys())
    print(list(zip(fullTable.names, fullTable.stypes)))

    daysInfullTable = dt.unique(fullTable[:, "DatenstandTag"]).to_list()[0]
    firstDumpDay = min(daysInfullTable)
    lastDumpDay = max(daysInfullTable)
    maxMeldeDay = cint(fullTable[:,"MeldeTag"].max())
    if maxMeldeDay > lastDumpDay:
        print("Future Date in Meldetag ({}), clipping to yesterday, Datenstandtag-1 = {}".format(maxMeldeDay, lastDumpDay))
        fullTable["MeldeTag">=lastDumpDay,"MeldeTag"] = lastDumpDay -1

    print("firstDumpDay", firstDumpDay)
    print("lastDumpDay",lastDumpDay)
    print("maxMeldeDay",maxMeldeDay)

    fromDay = firstDumpDay
    toDay = lastDumpDay+1
    #fromDay = lastDumpDay-1
    if len(oldTables)>0:

        # calculate which rows are needed for the update
        daysInOldTables = dt.unique(oldTables[0][:, "DatenstandTag"]).to_list()[0]
        newDays = sorted(list(set(daysInfullTable).difference(set(daysInOldTables))))
        print("newDays",newDays)
        if len(newDays) == 0:
            print("Nothing to update")
            exit(9)
        minNewDay = min(newDays)
        maxNewDay = max(newDays)
        minNewDay7daysAgo = minNewDay - 7
        maxNewDay7daysAgo = maxNewDay - 7

        fullTable = fullTable[((dt.f.DatenstandTag >= minNewDay) & (dt.f.DatenstandTag <= maxNewDay)) |
                                ((dt.f.DatenstandTag >= minNewDay7daysAgo) & (dt.f.DatenstandTag <= maxNewDay7daysAgo)),:]
        #fullTable.materialize()
        daysInfullTable = dt.unique(fullTable[:, "DatenstandTag"]).to_list()[0]
        print("daysInfullTable",daysInfullTable)

    fullTable = fullTable[:, dt.f[:].extend({"MeldeDelay": dt.f.DatenstandTag-dt.f.MeldeTag-1})]
    fullTable = fullTable[:, dt.f[:].extend({"RefDelay": dt.f.DatenstandTag-dt.f.RefTag-1})]
    #fullTable.materialize()

    Altersgruppen = []
    if args.agegroups:
        Altersgruppen = dt.unique(fullTable[:,"Altersgruppe"]).to_list()[0]

    print("Altersgruppen", Altersgruppen)

    Geschlechter = []
    if args.gender:
        Geschlechter = dt.unique(fullTable[:,"Geschlecht"]).to_list()[0]
    print("Geschlechter", Geschlechter)

    census = dt.fread("CensusByRKIAgeGroups.csv")
    censusDeutschland = census[dt.f.Name == "Deutschland",:]
    print(censusDeutschland)

    flaechen = loadFlaechen()

    print("Processing 'Deutschland'")
    pmu.printMemoryUsage("begin Deutschland")

    deutschland = analyzeDailyAltersgruppenGeschlechter(fullTable, fromDay, toDay, True, True, Altersgruppen, Geschlechter)
    deutschland = insertDates(deutschland)
    deutschland = insertRegionInfo(deutschland, 0, "Deutschland", "BR", 0, "Deutschland", flaechen[0])
    deutschland = insertEinwohnerColumns(deutschland, censusDeutschland, Altersgruppen, Geschlechter, "Flaeche")

    print(deutschland)
    pmu.printMemoryUsage("pre makeIncidenceColumns")

    #deutschland = makeIncidenceColumns(deutschland, censusDeutschland, Altersgruppen, Geschlechter)
    #print(deutschland)
    if len(oldTables) > 0: deutschland = updateOldTable(oldTables[0], deutschland)
    pmu.printMemoryUsage("pre save")
    pmu.saveCsvTable(deutschland, "series-{}-{}.csv".format(0, "Deutschland"), args.outputDir)
    pmu.printMemoryUsage("post save")
    deutschland = None

    #exit(0)

    print("Processing Bundesländer")
    bundeslaender, bundeslaender_numbers = timeSeries(fullTable, fromDay, toDay, dt.f.IdBundesland, dt.f.Bundesland, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Bundesländer timeSeries")
    for i in range(bundeslaender.nrows):
        bl_name=bundeslaender[i,dt.f.Bundesland].to_list()[0][0]
        bl_id=bundeslaender[i,dt.f.IdBundesland].to_list()[0][0]

        if bl_id > 0:
            bundeslaender_numbers[bl_id] = insertDates(bundeslaender_numbers[bl_id])
            bundeslaender_numbers[bl_id] = insertRegionInfo(bundeslaender_numbers[bl_id], bl_id, bl_name, "BL", bl_id, bl_name, flaechen[0])
            censusBL = census[dt.f.IdLandkreis == bl_id, :]
            bundeslaender_numbers[bl_id] = insertEinwohnerColumns(bundeslaender_numbers[bl_id], censusBL, Altersgruppen, Geschlechter, "Flaeche")
            if len(oldTables) > 0:
                bundeslaender_numbers[bl_id] = updateOldTable(oldTables[bl_id], bundeslaender_numbers[bl_id])
            print(censusBL)

        pmu.printMemoryUsage("pre save {}".format(bl_name))
        pmu.saveCsvTable(bundeslaender_numbers[bl_id], "series-{}-{}.csv".format(bl_id, bl_name), args.outputDir)
    bundeslaender = None
    bundeslaender_numbers = None

    print("Processing Landkreise'")
    landKreise, landkreise_numbers = timeSeries(fullTable, fromDay, toDay, dt.f.IdLandkreis, dt.f.Landkreis, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Landkreise timeSeries")
    #print(landKreise)
    #print(landkreise_numbers)
    for i in range(landKreise.nrows):
        print(i)
        lk_name = landKreise[i, dt.f.Landkreis].to_list()[0][0]
        lk_id = landKreise[i, dt.f.IdLandkreis].to_list()[0][0]
        if lk_name == "LK Saarpfalz-Kreis":
            lk_name = "LK Saar-Pfalz-Kreis"

        if lk_id > 0:
            censusLK = census[dt.f.IdLandkreis == lk_id, :]
            bl_name = censusLK[0,dt.f.Bundesland].to_list()[0][0]
            bl_id = censusLK[0, dt.f.IdBundesland].to_list()[0][0]
            lk_typ = landKreisTyp(lk_id, lk_name)

            landkreise_numbers[lk_id] = insertDates(landkreise_numbers[lk_id])
            landkreise_numbers[lk_id] = insertRegionInfo(landkreise_numbers[lk_id], lk_id, lk_name, lk_typ, bl_id,
                                                             bl_name, flaechen[lk_id])
            #print(censusLK)
            landkreise_numbers[lk_id] = insertEinwohnerColumns(landkreise_numbers[lk_id], censusLK, Altersgruppen,
                                                                Geschlechter, "Flaeche")
            if len(oldTables) > 0:
                landkreise_numbers[lk_id] = updateOldTable(oldTables[lk_id], landkreise_numbers[lk_id])

        pmu.printMemoryUsage("pre save {}".format(lk_name))
        pmu.saveCsvTable(landkreise_numbers[lk_id], "series-{}-{}.csv".format(lk_id, lk_name), args.outputDir)
    #print(landKreise)

    return fullTable