Ejemplo n.º 1
0
    def predict(self, new_df=None, sample_size=1000):
        """Sample the GP conditional at the test locations and score the result.

        Args:
            new_df (pandas.DataFrame, optional): new locations to predict at.
                It must contain ``LATITUDE``, ``LONGITUDE`` and the column
                named by ``self.response_var``; the observed values are
                required because the loss summary is always computed. When
                omitted, the test data already stored on the instance
                (``self.X_test``, ``self.y_test``, ``self.test_loc_cache``)
                is used.
            sample_size (int): number of posterior-predictive draws.

        Returns:
            The exponentiated per-location median of the draws. The same
            value is stored in ``self.predictions``.

        Raises:
            ValueError: if ``new_df`` cannot be converted or is missing one of
                the required columns.

        Side effects:
            Sets ``self.simulated_values``, ``self.predictions`` and
            ``self.summary`` (keys ``'l1_loss'`` and ``'l2_loss'``). When
            ``new_df`` is given, also overwrites ``self.X_test``,
            ``self.y_test`` and ``self.test_loc_cache``.
        """
        # `if new_df:` would raise on a DataFrame (ambiguous truth value),
        # so compare against None explicitly.
        if new_df is not None:
            try:
                self.X_test = coordinates_converter(new_df)
                self.y_test = new_df[self.response_var]
                self.test_loc_cache = new_df[['LATITUDE', 'LONGITUDE']]
            except Exception as err:
                raise ValueError(
                    'The new dataframe should contain LATITUDE, LONGITUDE and the variable column, e.g., PRCP'
                ) from err

        with self.model:
            # NOTE(review): this registers a variable named "y_pred" on the
            # model at every call; a second call on the same model may fail
            # with a duplicate-name error -- confirm against the PyMC3 version
            # in use.
            y_pred = self.gp.conditional("y_pred", self.X_test)
            self.simulated_values = pm.sample_ppc(self.trace,
                                                  vars=[y_pred],
                                                  samples=sample_size)
            # Median over the draw axis, then exp.
            # NOTE(review): presumably the model was fitted on the
            # log-transformed response -- confirm.
            self.predictions = np.exp(
                np.median(self.simulated_values['y_pred'], axis=0))

        # Mean absolute and mean squared error against the observed values.
        l1_loss = np.mean(np.abs(self.predictions - self.y_test))
        l2_loss = np.mean(np.square(self.predictions - self.y_test))
        self.summary = {'l1_loss': l1_loss, 'l2_loss': l2_loss}

        return self.predictions
    def predict(self, new_df=None, sample_size=500):
        """Sample the model's ``y`` at the test locations and score the result.

        Args:
            new_df (pandas.DataFrame, optional): new locations to predict at.
                It must contain ``LATITUDE``, ``LONGITUDE`` and the column
                named by ``self.response_var``; the observed values are
                required because the loss summary is always computed. When
                omitted, the test data already stored on the instance
                (``self.X_test``, ``self.y_test``) is used.
            sample_size (int): number of posterior-predictive draws.

        Returns:
            The exponentiated per-location median of the draws. The same
            value is stored in ``self.predictions``.

        Raises:
            ValueError: if ``new_df`` cannot be converted or is missing one of
                the required columns.

        Side effects:
            Sets ``self.simulated_values``, ``self.predictions`` and
            ``self.summary`` (keys ``'l1_loss'`` and ``'l2_loss'``), and
            leaves ``self.X_train`` holding the test inputs. When ``new_df``
            is given, also overwrites ``self.X_test``, ``self.y_test`` and
            ``self.test_loc_cache``.
        """
        # `if new_df:` would raise on a DataFrame (ambiguous truth value),
        # so compare against None explicitly.
        if new_df is not None:
            try:
                self.X_test = coordinates_converter(new_df)
                self.y_test = new_df[self.response_var]
                self.test_loc_cache = new_df[['LATITUDE', 'LONGITUDE']]
            except Exception as err:
                raise ValueError(
                    'The new dataframe should contain LATITUDE, LONGITUDE and the variable column, e.g., PRCP'
                ) from err

        with self.model:
            # NOTE(review): X_train is presumably a shared variable (it has
            # set_value); swapping in the test inputs makes the sampled 'y'
            # correspond to the test locations. It is not restored afterwards
            # -- confirm that callers do not rely on the training inputs.
            self.X_train.set_value(self.X_test)
            self.simulated_values = pm.sample_ppc(self.trace,
                                                  samples=sample_size)
            # Median over the draw axis, then exp.
            # NOTE(review): presumably the model was fitted on the
            # log-transformed response -- confirm.
            self.predictions = np.exp(
                np.median(self.simulated_values['y'], axis=0))

        # Mean absolute and mean squared error against the observed values.
        l1_loss = np.mean(np.abs(self.predictions - self.y_test))
        l2_loss = np.mean(np.square(self.predictions - self.y_test))

        self.summary = {'l1_loss': l1_loss, 'l2_loss': l2_loss}

        return self.predictions
Ejemplo n.º 3
0
    def __init__(self, df, response_var='PRCP', split_ratio=0.7):
        """Randomly split ``df`` into disjoint training and test sets.

        Args:
            df (pandas.DataFrame): input data. It must contain ``LATITUDE``,
                ``LONGITUDE`` and the ``response_var`` column; it is also
                passed to ``coordinates_converter`` to build the inputs.
            response_var (str): name of the response column.
            split_ratio (float): fraction of rows, between 0 and 1, assigned
                to the training set.

        Raises:
            ValueError: if ``split_ratio`` is outside ``[0, 1]``.
        """
        if not 0 <= split_ratio <= 1:
            raise ValueError('split_ratio must be between 0 and 1')

        X = coordinates_converter(df).values
        self.response_var = response_var
        y = df[self.response_var].values

        all_index = list(range(len(df)))
        train_size = int(round(len(df) * split_ratio, 0))

        # Sample WITHOUT replacement: the default (replace=True) yields
        # duplicate training rows and therefore fewer distinct training rows
        # than requested, with the remainder inflating the test set.
        train_index = np.random.choice(all_index, train_size, replace=False)
        # Set membership keeps the complement computation linear in len(df).
        train_lookup = set(train_index.tolist())
        test_index = [idx for idx in all_index if idx not in train_lookup]

        self.X_train = X[train_index]
        self.X_test = X[test_index]
        self.y_train = y[train_index]
        self.y_test = y[test_index]

        # The indices are positions (X and y are plain arrays), so select
        # rows with iloc; loc would treat them as index labels and pick the
        # wrong rows (or fail) when df does not have a default RangeIndex.
        locations = df[['LATITUDE', 'LONGITUDE']]
        self.train_loc_cache = locations.iloc[train_index]
        self.test_loc_cache = locations.iloc[test_index]