def _preprocessor(self, X_raw):
    """Prepare raw features for training, evaluation and prediction.

    A new ``nn.Preprocessor`` is built from ``X_raw`` and immediately
    applied to that same array.

    Parameters
    ----------
    X_raw : numpy.ndarray
        The raw feature data as downloaded.

    Returns
    -------
    numpy.ndarray
        Whatever ``Preprocessor.apply`` yields for ``X_raw``; per this
        method's contract, the clean data used for training and prediction.
    """
    # The preprocessor is constructed from the very array it transforms,
    # so every call derives its parameters from the current input.
    return nn.Preprocessor(X_raw).apply(X_raw)
def load_model():
    """Rebuild a ``PricingModel`` around the saved pricing network.

    Loads the Keras model stored in 'part3_pricing_model.h5', re-reads
    'part3_data.csv' to refit the imputer, label encoder and
    ``nn_lib.Preprocessor``, restores the summary statistics kept on the
    model (``y_mean``, ``median_vh_value``) and hands the network to the
    model through ``pm.warp``.

    Returns
    -------
    PricingModel
        The reconstructed model with the loaded network passed to ``warp``.
    """
    model = tf.keras.models.load_model('part3_pricing_model.h5')
    df = pd.read_csv('part3_data.csv').sample(frac=1)

    # Second-to-last column, reshaped to a column vector of shape (n, 1).
    claim_train = np.expand_dims(df[df.columns[-2]].to_numpy(), axis=1)

    numerical_features_names_sel = [
        'pol_bonus', 'pol_duration', 'pol_sit_duration', 'drv_age1',
        'drv_age2', 'vh_age', 'vh_cyl', 'vh_value', 'town_mean_altitude',
        'population', 'vh_speed', 'vh_weight',
    ]
    categorical_feature_names_sel = [
        'pol_coverage', 'pol_usage', 'drv_drv2', 'vh_make', 'vh_type',
        'vh_fuel',
    ]

    # NOTE(review): ``apply`` calls ``le.fit_transform`` once per column, so
    # ``le`` is refitted each time and ends up holding only the classes of
    # the last categorical column -- confirm that is what PricingModel expects.
    le = LabelEncoder()
    df2 = df[categorical_feature_names_sel].apply(le.fit_transform)

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(df[numerical_features_names_sel])

    # Imputed numerical columns first, then the integer-encoded categoricals.
    num_features = np.concatenate(
        (
            imp.transform(
                df[numerical_features_names_sel].to_numpy(dtype=np.float64)
            ),
            df2.to_numpy(dtype=np.float64),
        ),
        axis=1,
    )
    preprocessor = nn_lib.Preprocessor(num_features)

    pm = PricingModel(
        preprocessor=preprocessor,
        imputer=imp,
        encoder=le,
        categorical=categorical_feature_names_sel,
        numerical=numerical_features_names_sel,
    )

    # Mean over the rows whose claim value is non-zero.
    nnz = np.where(claim_train != 0)[0]
    pm.y_mean = np.mean(claim_train[nnz])
    pm.median_vh_value = df['vh_value'].median()

    # Presumably attaches the loaded network to the model -- confirm
    # against PricingModel.warp.
    pm.warp(model)
    return pm
def load_model():
    """Rebuild a ``ClaimClassifier`` around the saved part-2 network.

    Loads the Keras model stored in 'part2_model.h5', rebuilds the
    ``nn_lib.Preprocessor`` from the leading 80% of 'part2_data.csv' and
    hands the network to the classifier through ``m.warp``.

    Returns
    -------
    ClaimClassifier
        The classifier with the loaded network passed to ``warp``.
    """
    model = tf.keras.models.load_model('part2_model.h5')
    data = np.genfromtxt('part2_data.csv', delimiter=',')
    split_index = int(0.8 * data.shape[0])

    # Row 0 is skipped (presumably the CSV header, which genfromtxt would
    # parse as NaN -- confirm); the first nine columns are the features.
    x_train = data[1:split_index, :9]

    preprocessor = nn_lib.Preprocessor(x_train)
    m = ClaimClassifier(preprocessor)
    m.warp(model)
    return m
def train_model():
    """Fit a ``PricingModel`` on the shuffled 'part3_data.csv' and save it.

    The last column is passed to ``fit`` as the target, the second-to-last
    as the claim values and everything before those as the raw features.
    The imputer, label encoder and ``nn_lib.Preprocessor`` handed to the
    model are fitted on the whole file.
    """
    numeric_cols = [
        'pol_bonus', 'pol_duration', 'pol_sit_duration', 'drv_age1',
        'drv_age2', 'vh_age', 'vh_cyl', 'vh_value', 'town_mean_altitude',
        'population', 'vh_speed', 'vh_weight',
    ]
    categorical_cols = [
        'pol_coverage', 'pol_usage', 'drv_drv2', 'vh_make', 'vh_type',
        'vh_fuel',
    ]

    frame = pd.read_csv('part3_data.csv').sample(frac=1)
    columns = frame.columns
    features = frame[columns[:-2]].to_numpy()
    targets = frame[columns[-1]].to_numpy()
    claims = frame[columns[-2]].to_numpy()

    # One encoder instance is reused (and refitted) for every categorical
    # column by ``apply``.
    encoder = LabelEncoder()
    encoded_categoricals = frame[categorical_cols].apply(encoder.fit_transform)

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit(frame[numeric_cols])
    imputed_numericals = imputer.transform(
        frame[numeric_cols].to_numpy(dtype=np.float64)
    )

    # Imputed numerical columns first, then the encoded categoricals.
    design_matrix = np.concatenate(
        (imputed_numericals, encoded_categoricals.to_numpy(dtype=np.float64)),
        axis=1,
    )
    scaler = nn_lib.Preprocessor(design_matrix)

    pricing_model = PricingModel(
        preprocessor=scaler,
        imputer=imputer,
        encoder=encoder,
        categorical=categorical_cols,
        numerical=numeric_cols,
    )
    pricing_model.fit(features, targets, claims)
    pricing_model.save_model()
def evaluate_model():
    """Train a ``PricingModel`` on 80% of the data and report on the rest.

    'part3_data.csv' is shuffled and split 80/20 by row position. The model
    is fitted on the first part; for the held-out part the predicted claim
    probabilities, the score returned by ``evaluate_architecture`` and the
    predicted premiums are printed.
    """
    df = pd.read_csv('part3_data.csv').sample(frac=1)
    split_index = int(0.8 * df.shape[0])
    df_train = df.iloc[:split_index, :]
    df_test = df.iloc[split_index:, :]

    # Last column: target; second-to-last: claim values; the rest: features.
    x_train = df_train[df_train.columns[:-2]].to_numpy()
    y_train = df_train[df_train.columns[-1]].to_numpy()
    claim_train = df_train[df_train.columns[-2]].to_numpy()

    x_test = df_test[df_test.columns[:-2]].to_numpy()
    y_test = df_test[df_test.columns[-1]].to_numpy()

    numerical_features_names_sel = [
        'pol_bonus', 'pol_duration', 'pol_sit_duration', 'drv_age1',
        'drv_age2', 'vh_age', 'vh_cyl', 'vh_value', 'town_mean_altitude',
        'population', 'vh_speed', 'vh_weight',
    ]
    categorical_feature_names_sel = [
        'pol_coverage', 'pol_usage', 'drv_drv2', 'vh_make', 'vh_type',
        'vh_fuel',
    ]

    # NOTE(review): the encoder, imputer and preprocessor below are fitted
    # on the full frame ``df``, i.e. including the held-out rows -- confirm
    # this is intended for an evaluation run.
    # NOTE(review): ``apply`` refits ``le`` for every column, so afterwards
    # it holds only the classes of the last categorical column.
    le = LabelEncoder()
    df2 = df[categorical_feature_names_sel].apply(le.fit_transform)

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(df[numerical_features_names_sel])

    # Imputed numerical columns first, then the integer-encoded categoricals.
    num_features = np.concatenate(
        (
            imp.transform(
                df[numerical_features_names_sel].to_numpy(dtype=np.float64)
            ),
            df2.to_numpy(dtype=np.float64),
        ),
        axis=1,
    )
    preprocessor = nn_lib.Preprocessor(num_features)

    pm = PricingModel(
        preprocessor=preprocessor,
        imputer=imp,
        encoder=le,
        categorical=categorical_feature_names_sel,
        numerical=numerical_features_names_sel,
    )
    pm.fit(x_train, y_train, claim_train)

    predicted_y = pm.predict_claim_probability(x_test)
    print(predicted_y)
    print("AUC score is %.3f" % pm.evaluate_architecture(x_test, y_test, predicted_y))
    # Label text is kept exactly as emitted by the original code.
    print('Premioum', pm.predict_premium(x_test))