Code example #1
File: predict.py Project: jataware/views2_ensemble
def predict_ensembles(ensembles: List[api.Ensemble], dataset, run_id,
                      n_cores: int) -> None:
    """ Predict for ensembles """

    # Get an empty df indexed by the predict times to hold predictions
    df_pred_ensembles = dataset.get_df_cols(cols=[]).loc[get_all_times_predict(
        get_period_pairs(run_id))]

    # Use only half the cores; using all of them causes memory errors on a Rackham node.
    with mp.Pool(processes=n_cores, maxtasksperchild=1) as pool:
        with tempfile.TemporaryDirectory() as tempdir:
            results = []
            for ensemble in ensembles:
                results.append(
                    pool.apply_async(
                        predict_ensemble,
                        (
                            ensemble,
                            dataset,
                            run_id,
                            tempdir,
                        ),
                    ))
            for result in results:
                path = result.get()
                df_pred_ensembles = assign_into_df(
                    df_to=df_pred_ensembles, df_from=io.parquet_to_df(path))
    # Join ensemble and constituent predictions and write them all to disk
    store_prediction_on_disk(
        df=df_pred_ensembles.join(
            get_predictions_from_disk(run_id=run_id, dataset=dataset)),
        run_id=run_id,
        dataset=dataset,
    )
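For orientation, a minimal sketch of how predict_ensembles might be called; the constructor arguments and run id below are hypothetical stand-ins, not names taken from the project.

# Hypothetical driver; the Ensemble/Dataset construction is assumed, not shown in this listing.
ensembles = [api.Ensemble(name="cm_ensemble")]      # hypothetical constructor call
dataset = data_api.Dataset(name="cm_global_imp_0")  # hypothetical constructor call
predict_ensembles(ensembles=ensembles, dataset=dataset, run_id="r_2020_04_01",
                  n_cores=4)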
Code example #2
File: api.py Project: jataware/views2_ensemble
    def get_df_cols(self, cols):
        """ Get a subset of cols from cached df """
        if os.path.isfile(self.path):
            log.debug(f"Found df on file for {self.name} at {self.path}")
            return io.parquet_to_df(path=self.path, cols=cols)
        else:
            raise RuntimeError(
                "get_df_cols() called but no cached df, run refresh() first.")
Code example #3
File: predict.py Project: jataware/views2_ensemble
def get_predictions_from_disk(
        run_id: str,
        dataset: data_api.Dataset,
        cols: Optional[List[str]] = None) -> pd.DataFrame:
    """ Read predictions from disk """
    log.info("Reading predictions from disk")
    return io.parquet_to_df(path=path_prediction(run_id=run_id,
                                                 dataset=dataset),
                            cols=cols)
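Since cols defaults to None, the whole prediction frame is read unless a subset is requested. A short sketch (run id and column names are illustrative):

# dataset: a data_api.Dataset, as elsewhere in this listing.
df_all = get_predictions_from_disk(run_id="r_2020_04_01", dataset=dataset)
df_sub = get_predictions_from_disk(run_id="r_2020_04_01", dataset=dataset,
                                   cols=["ensemble_a", "ensemble_b"])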
Code example #4
File: api.py Project: jataware/views2_ensemble
    def get_df_cols(self, cols):
        """ Get a dataframe with a subset of columns

        TODO: Optimise to only read subset of cols from cache
        """
        log.debug(f"Getting {cols} cols from {self.name}")
        if os.path.isfile(self.path):
            df = io.parquet_to_df(path=self.path, cols=cols)
        else:
            df = self.df[cols]
        return df
Code example #5
File: api.py Project: jataware/views2_ensemble
    def df(self):
        """ Get the datafraem from the dataset """
        if os.path.isfile(self.path):
            df = io.parquet_to_df(self.path)
        else:
            df = self.refresh()

        if self.cols:
            for col in self.cols:
                if col not in df:
                    log.warning(f"Col {col} missing. Not in the sources?")

        return df
Code example #6
File: predict.py Project: jataware/views2_ensemble
def predict_ensemble(
    ensemble: api.Ensemble,
    dataset: data_api.Dataset,
    run_id: str,
    tempdir: str,
) -> str:
    """ Predict for a single ensemble """
    cols_needed = ensemble.cols_needed
    cols_data = dataset.list_cols_cached()
    cols_pred = io.list_columns_in_parquet(
        path=path_prediction(run_id, dataset))

    # Check we have all we need
    cols_missing = [
        col for col in cols_needed if col not in cols_data + cols_pred
    ]
    if cols_missing:
        raise RuntimeError(
            f"Ensemble {ensemble.name} missing cols {cols_missing}")

    # Get constituent predictions and features (outcomes) needed for ensemble
    df_constituent = io.parquet_to_df(
        path=path_prediction(run_id, dataset),
        cols=[col for col in cols_needed if col in cols_pred],
    )
    df_data = dataset.get_df_cols(
        cols=[col for col in cols_needed if col in cols_data])
    df = df_constituent.join(df_data)

    period_pairs = get_period_pairs(run_id)

    # Empty df to hold predictions
    df_pred = df.loc[get_all_times_predict(period_pairs), []]

    for period_pair in period_pairs:
        df_pred = assign_into_df(
            df_to=df_pred,
            df_from=ensemble.predict(df=df,
                                     period_calib=period_pair[0],
                                     period_test=period_pair[1]),
        )

    log.debug(
        f"Done predicting for ensemble {ensemble.name}, writing results.")
    # Generate a random filename in the tempdir
    path = os.path.join(tempdir, f"{uuid.uuid4().hex}.parquet")
    io.df_to_parquet(df=df_pred, path=path)
    return path
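assign_into_df is called throughout this listing but never shown. A minimal sketch of the behaviour the call sites appear to rely on, assuming df_from covers a subset of df_to's index:

def assign_into_df(df_to: pd.DataFrame, df_from: pd.DataFrame) -> pd.DataFrame:
    """ Sketch only: write each column of df_from into df_to on matching index rows. """
    for col in df_from.columns:
        df_to.loc[df_from.index, col] = df_from[col]
    return df_to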
Code example #7
File: predict.py Project: jataware/views2_ensemble
def predict_models(
    models: List[api.Model],
    dataset: data_api.Dataset,
    run_id: str,
    n_cores: int,
) -> None:
    """ Predict for models """

    # Get our calib/test period pairs
    period_pairs = get_period_pairs(run_id)

    log.info(f"Predicting for {len(models)} models "
             f"for {len(period_pairs)} period pairs.")

    # Create predictions df with predict times and no cols
    df_pred = dataset.get_df_cols(
        cols=[]).loc[get_all_times_predict(period_pairs), []]

    # Predict the models in parallel
    with mp.get_context("spawn").Pool(processes=n_cores,
                                      maxtasksperchild=1) as pool:
        with tempfile.TemporaryDirectory() as tempdir:
            results = []
            for period_pair in period_pairs:
                for model in models:
                    # period_pair[0] is calib period and [1] the test period
                    results.append(
                        pool.apply_async(
                            predict_model,
                            (
                                model,
                                dataset,
                                period_pair[0],
                                period_pair[1],
                                tempdir,
                            ),
                        ))
            # Collect as results become ready
            for result in results:
                path = result.get()
                log.debug(f"Insert from {path}")
                df_pred = assign_into_df(df_to=df_pred,
                                         df_from=io.parquet_to_df(path))

    log.debug("Done collecting.")
    store_prediction_on_disk(df=df_pred, run_id=run_id, dataset=dataset)
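predict_model itself is not part of this listing; judging from the call sites, each worker writes its predictions to a parquet file in tempdir and returns that path, mirroring predict_ensemble above. A hypothetical sketch of that contract:

def predict_model(model, dataset, period_calib, period_test, tempdir) -> str:
    """ Hypothetical sketch of the worker contract implied by predict_models. """
    df = dataset.get_df_cols(cols=model.cols_needed)  # assumed attribute, as on Ensemble
    df_pred = model.predict(df=df, period_calib=period_calib, period_test=period_test)
    path = os.path.join(tempdir, f"{uuid.uuid4().hex}.parquet")  # as in predict_ensemble
    io.df_to_parquet(df=df_pred, path=path)
    return path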
Code example #8
import logging
import os

import pandas as pd

from views.utils import io  # assumed import path for the io helpers used below

log = logging.getLogger(__name__)
level = "cm"

model_path = "./models/{sub}"
out_paths = {
    "evaluation": model_path.format(sub="evaluation"),
    "features": model_path.format(sub="features")
}
for path_out in out_paths.values():
    if not os.path.isdir(path_out):
        os.makedirs(path_out)

path = "~/OpenViEWS2/storage/data/datasets/manual.parquet"  # change to your path
cm_global_imp_0 = io.parquet_to_df(path)
df = cm_global_imp_0

df_mdums = pd.get_dummies(df["month"], prefix="mdum")
df_ydums = pd.get_dummies(df["year"], prefix="ydum")

df = df.join(df_mdums)
df = df.join(df_ydums)

konstanz_df = pd.read_csv("~/OpenViEWS2/storage/data/konstanz/konstanz.csv",
                          low_memory=False)
# konstanz_df.head()
list(konstanz_df.columns)
# konstanz_df.index
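One way the generated dummy columns might be collected afterwards, e.g. for a model specification; a sketch, with the prefixes taken from the get_dummies calls above:

mdum_cols = [col for col in df.columns if col.startswith("mdum")]
ydum_cols = [col for col in df.columns if col.startswith("ydum")]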
Code example #9
File: api.py Project: jataware/views2_ensemble
    def df(self) -> pd.DataFrame:
        """ Get the dataframe """
        if not os.path.isfile(self.path):
            self.refresh()
        return io.parquet_to_df(self.path)