def predict(self, new_df=None, sample_size=1000):
    '''Predict the response at the test locations and score the predictions.

    Args:
        new_df (pandas dataframe): optional dataframe of new locations. It
            must contain LATITUDE, LONGITUDE and the ``self.response_var``
            column (the truth values the losses are computed against).
            When omitted, the test set already stored on the instance
            (``self.X_test`` / ``self.y_test``) is used.
        sample_size (int): number of posterior predictive samples to draw.

    Returns:
        The point predictions: ``np.exp`` of the per-location median of
        the posterior predictive samples. The same array is stored in
        ``self.predictions``; the L1/L2 losses are stored in
        ``self.summary``.

    Raises:
        ValueError: if ``new_df`` cannot be converted or lacks the
            required columns.
    '''
    # A DataFrame has no unambiguous truth value (``if new_df:`` raises),
    # so presence must be tested by identity with None.
    if new_df is not None:
        try:
            self.X_test = coordinates_converter(new_df)
            self.y_test = new_df[self.response_var]
            self.test_loc_cache = new_df[['LATITUDE', 'LONGITUDE']]
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not converted.
            raise ValueError(
                'The new dataframe should contain LATITUDE, LONGITUDE and the variable column, e.g., PRCP'
            )

    with self.model:
        # NOTE(review): this registers a variable named "y_pred" on the
        # model; a second predict() call on the same model may be rejected
        # as a duplicate name -- confirm against the PyMC3 version in use.
        y_pred = self.gp.conditional("y_pred", self.X_test)
        self.simulated_values = pm.sample_ppc(
            self.trace, vars=[y_pred], samples=sample_size)

    # Median over the sample axis, then exp -- presumably the model is fit
    # on the log of the response; verify against the model definition.
    self.predictions = np.exp(
        np.median(self.simulated_values['y_pred'], axis=0))

    errors = self.predictions - self.y_test
    l1_loss = np.mean(np.abs(errors))
    l2_loss = np.mean(np.square(errors))
    self.summary = {'l1_loss': l1_loss, 'l2_loss': l2_loss}
    return self.predictions
def predict(self, new_df=None, sample_size=500):
    '''Predict the response at the test locations and score the predictions.

    The test inputs are written into ``self.X_train`` via ``set_value``
    before posterior predictive sampling, so after this call
    ``self.X_train`` holds the test inputs.

    Args:
        new_df (pandas dataframe): optional dataframe of new locations. It
            must contain LATITUDE, LONGITUDE and the ``self.response_var``
            column (the truth values the losses are computed against).
            When omitted, the test set already stored on the instance
            (``self.X_test`` / ``self.y_test``) is used.
        sample_size (int): number of posterior predictive samples to draw.

    Returns:
        The point predictions: ``np.exp`` of the per-location median of
        the posterior predictive samples of ``'y'``. The same array is
        stored in ``self.predictions``; the L1/L2 losses are stored in
        ``self.summary``.

    Raises:
        ValueError: if ``new_df`` cannot be converted or lacks the
            required columns.
    '''
    # A DataFrame has no unambiguous truth value (``if new_df:`` raises),
    # so presence must be tested by identity with None.
    if new_df is not None:
        try:
            self.X_test = coordinates_converter(new_df)
            self.y_test = new_df[self.response_var]
            self.test_loc_cache = new_df[['LATITUDE', 'LONGITUDE']]
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not converted.
            raise ValueError(
                'The new dataframe should contain LATITUDE, LONGITUDE and the variable column, e.g., PRCP'
            )

    with self.model:
        # Swap the model's input data for the test inputs so the posterior
        # predictive draws are made at the test locations.
        self.X_train.set_value(self.X_test)
        self.simulated_values = pm.sample_ppc(
            self.trace, samples=sample_size)

    # Median over the sample axis, then exp -- presumably the model is fit
    # on the log of the response; verify against the model definition.
    self.predictions = np.exp(
        np.median(self.simulated_values['y'], axis=0))

    errors = self.predictions - self.y_test
    l1_loss = np.mean(np.abs(errors))
    l2_loss = np.mean(np.square(errors))
    self.summary = {'l1_loss': l1_loss, 'l2_loss': l2_loss}
    return self.predictions
def __init__(self, df, response_var='PRCP', split_ratio=0.7):
    '''Randomly split ``df`` into disjoint training and test sets.

    Args:
        df (pandas dataframe): source data. It must contain the
            ``response_var`` column and the LATITUDE and LONGITUDE
            columns, and be accepted by ``coordinates_converter``.
        response_var (str): name of the column used as the response.
        split_ratio (float): fraction of the rows (between 0 and 1)
            assigned to the training set; the rest form the test set.

    Raises:
        ValueError: if ``split_ratio`` is outside [0, 1].
    '''
    if not 0 <= split_ratio <= 1:
        raise ValueError('split_ratio must be between 0 and 1')

    X = coordinates_converter(df).values
    self.response_var = response_var
    y = df[self.response_var].values

    n_rows = len(df)
    train_size = int(round(n_rows * split_ratio))

    # replace=False: sampling with replacement (the default) would put
    # duplicate rows in the training set and leave the test set larger
    # than the requested split.
    train_index = np.random.choice(n_rows, train_size, replace=False)
    # Set lookup keeps the complement computation linear in n_rows.
    chosen = set(train_index.tolist())
    test_index = [idx for idx in range(n_rows) if idx not in chosen]

    self.X_train = X[train_index]
    self.X_test = X[test_index]
    self.y_train = y[train_index]
    self.y_test = y[test_index]

    # The indices are positions into X / y, so select rows positionally;
    # label-based ``loc`` only agrees when df has a default RangeIndex.
    locations = df[['LATITUDE', 'LONGITUDE']]
    self.train_loc_cache = locations.iloc[train_index]
    self.test_loc_cache = locations.iloc[test_index]