def find_best_model(x_clean, y_raw, variables=9, pricing=False):
    """Random-search a few ClaimClassifier networks and return the best one.

    The data is split 80/20 into training and validation parts.  The rows
    labelled 1 in the training part are re-sampled with replacement until
    there are as many of them as rows labelled 0.  Ten networks are then
    trained with plain SGD on a binary cross-entropy loss, each with a
    randomly drawn learning rate and epoch count, and scored by ROC AUC on
    the validation part.

    Parameters
    ----------
    x_clean : ndarray
        Pre-processed feature matrix, one row per sample.
    y_raw : array-like
        Binary labels (0/1), either one-dimensional or a single column.
    variables : int, optional
        Unused; the network width is taken from the data.  Kept so existing
        callers keep working.
    pricing : bool, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    ClaimClassifier or None
        The trained network with the highest validation ROC AUC.  ``None``
        only if no candidate produced a comparable score.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        x_clean, y_raw, test_size=0.2)

    # Labels may arrive 1-D or as an (n, 1) column; force a column so they
    # can be stacked next to the features with axis=1.
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y).reshape(len(train_x), -1)

    # Up-sample the positive class to the size of the negative class.
    labelled = pd.DataFrame(np.append(train_x, train_y, axis=1))
    negatives = labelled[labelled.iloc[:, -1] == 0]
    positives = labelled[labelled.iloc[:, -1] == 1]
    resampled = positives.sample(len(negatives), replace=True)
    balanced = np.array(pd.concat([negatives, resampled], axis=0))
    new_train_x = balanced[:, :-1]
    new_train_y = balanced[:, -1:]

    # The training tensors and the loss object never change between
    # candidates or epochs, so build them once.
    X = torch.Tensor(new_train_x)
    Y = torch.Tensor(new_train_y)
    loss = nn.BCELoss()

    best_net = None
    max_metric = -np.inf
    searches = 10
    for i in range(searches):
        new_net = ClaimClassifier(variables=train_x.shape[1], linear=True)
        lrn_rate = np.random.uniform(0.0001, 1)
        epochs = round(np.random.uniform(50, 150))

        new_net.train()
        optimizer = optim.SGD(new_net.parameters(), lr=lrn_rate)
        for _ in range(epochs):
            # Gradients are cleared on the network itself (not the optimizer).
            new_net.zero_grad()
            output = new_net(X)
            loss_obj = loss(output, Y)
            loss_obj.backward()
            optimizer.step()

        new_net.eval()
        print("Model (" + str(i + 1) + ") out of " + str(searches))
        pred, probabilities = new_net.predict_probabilities(valid_x,
                                                            pricing=True)
        metric = roc_auc_score(valid_y, probabilities)
        print("Roc Score:" + str(metric))

        if metric > max_metric:
            max_metric = metric
            best_net = new_net

    return best_net
def __init__(self,
             epoch=100,
             batchsize=64,
             learnrate=0.0001,
             neurons=9,
             num_features=13,
             calibrate_probabilities=False):
    """Set up an unfitted pricing model around a ClaimClassifier.

    ``epoch``, ``batchsize``, ``learnrate``, ``neurons`` and
    ``num_features`` are forwarded positionally, in that order, to the
    ClaimClassifier constructor.  ``calibrate_probabilities`` is stored on
    the instance as ``calibrate``.
    """
    classifier_args = (epoch, batchsize, learnrate, neurons, num_features)

    # Nothing has been fitted yet: no severity estimate, no encoders.
    self.y_median = None
    self.calibrate = calibrate_probabilities
    self.trained = False
    self.label_binarizer = dict()
    self.base_classifier = ClaimClassifier(*classifier_args)
def __init__(self, calibrate_probabilities=False):
    """Create an unfitted pricing model with a default ClaimClassifier.

    Parameters
    ----------
    calibrate_probabilities : bool, optional
        Stored as ``self.calibrate``.
    """
    # Calibration contract for the base estimator (only relevant when
    # calibrating): it must expose
    #   1. a .fit method taking two arguments, X and y, and
    #   2. either .predict_proba or a decision function returning
    #      classification scores.
    # The part 2 classifier needs a predict_proba implementation before it
    # can be calibrated.
    self.base_classifier = ClaimClassifier()
    self.calibrate = calibrate_probabilities
    self.y_mean = None
class PricingModel():
    """Pricing model built on a neural-network claim classifier.

    A premium is the predicted claim probability multiplied by the mean of
    the non-zero training claims and by a fixed pricing factor.
    """

    # Numeric columns, standardised with the training mean / std.
    _NUMERIC_HEADERS = [
        'drv_age1', 'vh_age', 'vh_cyl', 'vh_din', 'pol_bonus',
        'vh_sale_begin', 'vh_sale_end', 'vh_value', 'vh_speed',
        'drv_age_lic1', 'pol_duration', 'pol_sit_duration', 'drv_age2'
    ]
    # Categorical columns, one-hot encoded with one LabelBinarizer each.
    _CATEGORICAL_HEADERS = [
        'drv_sex1', 'vh_type', 'pol_coverage', 'pol_usage', 'pol_payd'
    ]

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an unfitted model.

        Parameters
        ----------
        calibrate_probabilities : bool, optional
            Stored as ``self.calibrate``; when true, ``fit`` hands the base
            classifier to ``fit_and_calibrate_classifier``.
        """
        self.y_mean = None
        self.calibrate = calibrate_probabilities

        # When calibrating, the base estimator must have a .fit(X, y) method
        # and either .predict_proba or a decision function returning
        # classification scores.
        self.base_classifier = ClaimClassifier()

    def _scale_numeric(self, X_raw, training=False):
        """Standardise the numeric columns with the training statistics."""
        numeric = np.array(X_raw[self._NUMERIC_HEADERS])
        if training:
            self.means = np.mean(numeric, axis=0)
            self.std_dev = np.std(numeric, axis=0)

        # A constant column has zero spread; dividing by 1 instead keeps it
        # at 0 rather than turning it into NaN/inf.
        safe_std = np.where(self.std_dev == 0, 1.0, self.std_dev)
        return (numeric - self.means) / safe_std

    def _encode_categorical(self, X_raw, training=False):
        """One-hot encode every categorical column, dropping one level."""
        encoders = [] if training else self.saved_binarizers
        encoded_columns = []
        for position, header in enumerate(self._CATEGORICAL_HEADERS):
            data = X_raw[header]
            if training:
                # A fresh encoder per column: re-fitting one shared object
                # would leave every saved encoder fitted on the last column.
                encoder = LabelBinarizer()
                encoded = encoder.fit_transform(data)
                encoders.append(encoder)
            else:
                encoded = encoders[position].transform(data)

            encoded = np.asarray(encoded)
            # With several indicator columns one is redundant; drop the last.
            if encoded.shape[1] > 1:
                encoded = encoded[:, :-1]
            encoded_columns.append(encoded)

        if training:
            self.saved_binarizers = encoders
        return encoded_columns

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw, training=False):
        """Data preprocessing function.

        Standardises the numeric columns and appends the one-hot encoding
        of every categorical column.  Earlier revisions kept only the last
        categorical column, so models pickled before this change have to be
        refit.

        Parameters
        ----------
        X_raw : DataFrame
            The raw data as downloaded; indexed by column name.
        training : bool, optional
            When true, the scaling statistics and the encoders are fitted
            on ``X_raw`` and stored on the instance; otherwise the stored
            ones are applied.

        Returns
        -------
        X: ndarray
            A clean data set that is used for training and prediction.
        """
        x_normed = self._scale_numeric(X_raw, training)
        encoded_columns = self._encode_categorical(X_raw, training)
        return np.concatenate([x_normed] + encoded_columns, axis=1)

    @staticmethod
    def _oversample_positive(features, labels):
        """Re-sample label-1 rows (with replacement) up to the label-0 count.

        ``labels`` may be one-dimensional or a single column.  Returns the
        balanced features and the labels as an (n, 1) column.
        """
        features = np.asarray(features)
        labels = np.asarray(labels).reshape(len(features), -1)
        stacked = pd.DataFrame(np.append(features, labels, axis=1))
        negatives = stacked[stacked.iloc[:, -1] == 0]
        positives = stacked[stacked.iloc[:, -1] == 1]
        resampled = positives.sample(len(negatives), replace=True)
        balanced = np.array(pd.concat([negatives, resampled], axis=0))
        return balanced[:, :-1], balanced[:, -1:]

    def fit(self, X_raw, y_raw, claims_raw):
        """Classifier training function.

        Stores the mean non-zero claim, pre-processes the features, holds
        out 20% for validation, over-samples the positive class of the rest
        and runs the part 2 hyper-parameter search on it.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded
        y_raw : array-like
            The binary target variable, one-dimensional or a single column
        claims_raw : array-like
            Records the severity of claims

        Returns
        -------
        The fitted base classifier.
        """
        # Boolean masking on a plain array works the same for arrays and
        # for pandas objects, whatever their index.
        claims = np.asarray(claims_raw)
        self.y_mean = np.mean(claims[claims != 0])

        X_clean = self._preprocessor(X_raw, training=True)

        # Split into training / validation
        training_x, validation_x, training_y, validation_y = train_test_split(
            X_clean, y_raw, test_size=0.2)

        # Upsample data
        new_train_x, new_train_y = self._oversample_positive(
            training_x, training_y)
        n_inputs = new_train_x.shape[1]

        validation_x = np.array(validation_x)
        validation_y = np.array(validation_y)

        # Find best parameters / best classifier
        best_lr, best_epochs, multiplier, best_net = \
            part2.ClaimClassifierHyperParameterSearch(new_train_x,
                                                      new_train_y,
                                                      validation_x,
                                                      validation_y,
                                                      n_inputs,
                                                      pricing=True)

        print("Best lr = " + str(best_lr))
        print("Best epochs = " + str(best_epochs))
        print("Multiplier = " + str(multiplier))

        # THE FOLLOWING GETS CALLED IF YOU WISH TO CALIBRATE YOUR PROBABILITES
        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
        else:
            # Set classifier to model found
            self.base_classifier = best_net

        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        ndarray
            The network output for every input row, i.e. the predicted
            probability of belonging to the POSITIVE class.  The array has
            whatever shape the network emits; ``predict_premium`` flattens
            it.
        """
        X_clean = self._preprocessor(X_raw)
        self.base_classifier.eval()
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            output = self.base_classifier(torch.Tensor(X_clean))
        return output.detach().numpy()

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            A one dimensional array with one premium per input row:
            claim probability * mean claim severity * pricing factor.
        """
        # Pricing strategy: scale all prices down by a constant factor.
        premium_factor = 0.20
        premiums = self.predict_claim_probability(
            X_raw) * self.y_mean * premium_factor
        return np.array(premiums).flatten()

    def save_model(self):
        """Saves the class instance as a pickle file."""
        with open('part3_pricing_model.pickle', 'wb') as target:
            pickle.dump(self, target)
import pandas as pd
import numpy as np
from part2_claim_classifier import ClaimClassifier
from sklearn.metrics import accuracy_score

# Train the part 2 classifier on the full data set: the first nine columns
# are the features, the last column is the label.
dataset = pd.read_csv("part2_data.csv").values
X = dataset[:, :9]
Y = dataset[:, -1]

nn = ClaimClassifier()
nn.fit(X, Y)
nn.evaluate_architecture(X, Y)

# Re-score on the rows labelled 1 only.
data_test = dataset[np.flatnonzero(dataset[:, -1] == 1)]
X = data_test[:, :9]
Y = data_test[:, -1]

y_pred = nn.predict(X)
print(y_pred)
print(accuracy_score(Y, y_pred))
class PricingModelLinear():
    """Pricing model wrapping ``ClaimClassifier(Insurance_NN_4())``.

    A premium is the predicted claim probability multiplied by the mean of
    the non-zero training claims and by a fixed pricing factor.
    """

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an unfitted model.

        Parameters
        ----------
        calibrate_probabilities : bool, optional
            Stored as ``self.calibrate``; when true, ``fit`` hands the base
            classifier to ``fit_and_calibrate_classifier``.
        """
        self.y_mean = None
        self.calibrate = calibrate_probabilities

        # When calibrating, the base estimator must have a .fit(X, y) method
        # and either .predict_proba or a decision function returning
        # classification scores.
        self.base_classifier = ClaimClassifier(Insurance_NN_4())

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw):
        """Data preprocessing function.

        Selects nine columns, drops every row that has a missing value in
        any of them, integer-encodes the string columns and min-max scales
        the result.

        Parameters
        ----------
        X_raw : DataFrame
            The raw data as downloaded; indexed by column name.

        Returns
        -------
        X: ndarray
            float32 array used for training and prediction.  It has fewer
            rows than ``X_raw`` whenever rows were dropped.
        """
        # Column selection already yields a new frame, so the caller's data
        # is never modified.
        selected = X_raw[[
            'pol_coverage', 'vh_age', 'vh_din', 'vh_fuel', 'vh_sale_begin',
            'vh_sale_end', 'vh_speed', 'vh_value', 'vh_weight'
        ]].dropna(how="any")

        encoded = self.integer_encode(selected)
        if not isinstance(encoded, np.ndarray):
            # np.float was removed from NumPy; the builtin is equivalent.
            encoded = encoded.to_numpy(dtype=float)

        # NOTE(review): the scaler (and the label encoders) are re-fitted on
        # every call, so prediction data is scaled by its own range rather
        # than the training range - confirm this is intended.
        min_max_scaler = preprocessing.MinMaxScaler()
        scaled = min_max_scaler.fit_transform(encoded)
        return scaled.astype(np.float32)

    def fit(self, X_raw, y_raw, claims_raw, prepro=True):
        """Classifier training function.

        Parameters
        ----------
        X_raw : DataFrame or ndarray
            The raw data as downloaded, or already clean data when
            ``prepro`` is false
        y_raw : array-like
            The binary target variable
        claims_raw : array-like
            Records the severity of claims
        prepro : bool, optional
            Run ``_preprocessor`` on ``X_raw`` first (default).

        Returns
        -------
        The fitted base classifier.  The model is also pickled to disk.
        """
        # Boolean masking on a plain array works the same for arrays and
        # for pandas objects, whatever their index.
        claims = np.asarray(claims_raw)
        self.y_mean = np.mean(claims[claims != 0])

        # NOTE(review): _preprocessor drops rows with missing values but
        # y_raw is passed on unchanged - verify the lengths still agree.
        X_clean = self._preprocessor(X_raw) if prepro else X_raw

        # THE FOLLOWING GETS CALLED IF YOU WISH TO CALIBRATE YOUR PROBABILITES
        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
        else:
            fitted = self.base_classifier.fit(X_clean, y_raw)
            # A fit() that trains in place and returns nothing must not
            # wipe out the classifier.
            if fitted is not None:
                self.base_classifier = fitted

        self.save_model()
        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        ndarray
            Column 1 of the classifier's ``predict_proba`` output, i.e. the
            probability of belonging to the POSITIVE class, one value per
            row kept by ``_preprocessor``.
        """
        X_clean = self._preprocessor(X_raw)
        # return probabilities for the positive class (label 1)
        return self.base_classifier.predict_proba(X_clean)[:, 1]

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            One premium per predicted row: claim probability * mean claim
            severity * 0.2725.
        """
        # Pricing strategy: scale all prices down by a constant factor.
        return self.predict_claim_probability(X_raw) * self.y_mean * 0.2725

    def save_model(self):
        """Saves the class instance as a pickle file."""
        with open('part3_pricing_model_linear.pickle', 'wb') as target:
            pickle.dump(self, target)

    def load_data(self, filename):
        """Load the training data from a CSV file.

        Parameters
        ----------
        filename : str
            Path of the CSV file; it must contain the columns
            ``claim_amount`` and ``made_claim``.

        Returns
        -------
        tuple
            ``(x, y, nonzero_claims, claims)``: the frame without the two
            target columns, the ``made_claim`` column, the non-zero claim
            amounts as an ndarray, and the full ``claim_amount`` column.
        """
        dat = pd.read_csv(filename)
        x = dat.drop(columns=["claim_amount", "made_claim"])
        y = dat["made_claim"]
        y1 = dat["claim_amount"]
        y2 = y1[y1 != 0]
        return x, y, y2.to_numpy(), y1

    def separate_pos_neg(self, x, y):
        """Split samples into those labelled 1 and all the others.

        Parameters
        ----------
        x : ndarray
            Two-dimensional sample matrix, one row per sample.
        y : array-like
            One label per row of ``x``.

        Returns
        -------
        tuple
            ``((neg_x, neg_y), (pos_x, pos_y))``; the label arrays are
            float32.
        """
        rows = np.asarray(x)
        labels = np.asarray(y)
        flags = labels if labels.ndim == 1 else labels[:, 0]
        is_pos = flags == 1

        def _select_rows(mask):
            # Stacking onto an empty float32 block reproduces the dtype the
            # row-by-row version produced.
            base = np.empty((0, rows.shape[1]), np.float32)
            return np.vstack((base, rows[mask])) if mask.any() else base

        neg_train_x = _select_rows(~is_pos)
        pos_train_x = _select_rows(is_pos)
        neg_train_y = labels[~is_pos].astype(np.float32)
        pos_train_y = labels[is_pos].astype(np.float32)
        return (neg_train_x, neg_train_y), (pos_train_x, pos_train_y)

    def integer_encode(self, x):
        """Encode every string column with one integer per category.

        A column counts as a string column when its first value cannot be
        parsed as a float.

        Parameters
        ----------
        x : DataFrame
            Data to encode.

        Returns
        -------
        ndarray
            Float array of the same shape as ``x``.
        """
        x = x.to_numpy(dtype=str)
        for att_i in range(x.shape[1]):
            try:
                float(x[0, att_i])
            except ValueError:
                # integer encode; the codes are written back as strings and
                # become numbers in the final astype.
                label_encoder = LabelEncoder()
                x[:, att_i] = label_encoder.fit_transform(x[:, att_i])
        return x.astype(float)
class PricingModel():
    """Pricing model built on a ClaimClassifier trained on balanced data.

    A premium is the classifier's prediction multiplied by the median of
    the non-zero training claims and by a fixed factor.
    """

    def __init__(self, epoch=100, batchsize=64, learnrate=0.0001, neurons=9,
                 num_features=13, calibrate_probabilities=False):
        """Create an unfitted model.

        ``epoch``, ``batchsize``, ``learnrate``, ``neurons`` and
        ``num_features`` are forwarded positionally to the ClaimClassifier
        constructor; ``calibrate_probabilities`` is stored as
        ``self.calibrate``.
        """
        self.y_median = None
        self.calibrate = calibrate_probabilities
        self.trained = False
        # One fitted LabelBinarizer per categorical column name.
        self.label_binarizer = {}

        # When calibrating, the base estimator must have a .fit(X, y) method
        # and either .predict_proba or a decision function returning
        # classification scores.
        self.base_classifier = ClaimClassifier(epoch, batchsize, learnrate,
                                               neurons, num_features)

    def _balance_dataset(self, X_y_raw):
        """Balance a data set by down-sampling the larger class.

        Parameters
        ----------
        X_y_raw : ndarray
            Two-dimensional array whose last column is the class (0 or 1).

        Returns
        -------
        X_y_balanced: ndarray
            A shuffled array holding equally many Class 0 and Class 1 rows.
        """
        # Separate dataset into Class 0 and Class 1 events
        class_0 = X_y_raw[X_y_raw[:, -1] == 0]
        class_1 = X_y_raw[X_y_raw[:, -1] == 1]
        keep = min(class_0.shape[0], class_1.shape[0])

        # Shuffle before truncating so the retained subset is random.
        np.random.shuffle(class_0)
        if class_1.shape[0] > keep:
            # Class 1 is the majority: it is the one to down-sample.
            np.random.shuffle(class_1)

        X_y_balanced = np.vstack((class_0[:keep], class_1[:keep]))

        # Shuffle combined balanced dataset before returning
        np.random.shuffle(X_y_balanced)
        return X_y_balanced

    def _preprocessor(self, X_raw):
        """Data preprocessing function.

        Keeps eight columns.  Columns that are neither float64 nor int64
        are one-hot encoded (missing values become their own category);
        numeric columns get missing values replaced by the column mean of
        the data passed in.

        Parameters
        ----------
        X_raw : DataFrame
            The raw data as downloaded; it is not modified.

        Returns
        -------
        X: DataFrame
            A clean data set that is used for training and prediction.
        """
        features_to_keep = ['pol_coverage', 'vh_age', 'vh_din', 'vh_fuel',
                            'vh_sale_begin', 'vh_sale_end', 'vh_speed',
                            'vh_weight']
        # Work on a copy so the caller's frame is left untouched.
        X_pre = X_raw[features_to_keep].copy()

        for col in features_to_keep:
            if X_pre.dtypes[col] != 'float64' and X_pre.dtypes[col] != 'int64':
                # fillna returns a new Series, so the result must be
                # assigned.  The token is per-column so that two columns
                # with gaps do not yield clashing indicator names in join.
                X_pre[col] = X_pre[col].fillna(col + "_empty")

                if col not in self.label_binarizer:
                    self.label_binarizer[col] = LabelBinarizer()
                binarizer = self.label_binarizer[col]

                if not self.trained:
                    encoded = binarizer.fit_transform(X_pre[col])
                else:
                    encoded = binarizer.transform(X_pre[col])
                encoded = np.asarray(encoded)

                # A two-class binarizer emits a single column (for the
                # second class), so name only the columns actually present.
                names = list(binarizer.classes_)[-encoded.shape[1]:]
                X_pre = X_pre.join(
                    pd.DataFrame(encoded, columns=names, index=X_pre.index))
                X_pre = X_pre.drop(columns=col)
            else:
                mean = np.nanmean(X_pre[col].values)
                X_pre[col] = X_pre[col].fillna(mean)

        return X_pre

    def fit(self, X_raw, y_raw, claims_raw):
        """Classifier training function.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded
        y_raw : Series or DataFrame
            The binary target variable, indexed like ``X_raw``
        claims_raw : array-like
            Records the severity of claims

        Returns
        -------
        The fitted base classifier.
        """
        # Boolean masking on a plain array works the same for arrays and
        # for pandas objects, whatever their index.
        claims = np.asarray(claims_raw)
        self.y_median = np.median(claims[claims != 0])

        X_clean = self._preprocessor(X_raw)

        # Put the label in the last column and balance the classes.
        X_Y_pandas = pd.concat([X_clean, y_raw],
                               axis=1).reindex(X_clean.index)
        X_Y_clean_balanced = self._balance_dataset(X_Y_pandas.to_numpy())
        X_clean = pd.DataFrame(X_Y_clean_balanced[:, :-1])
        y_raw = pd.DataFrame(X_Y_clean_balanced[:, -1:])

        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
        else:
            fitted = self.base_classifier.fit(X_clean, y_raw)
            # A fit() that trains in place and returns nothing must not
            # wipe out the classifier.
            if fitted is not None:
                self.base_classifier = fitted

        # From now on _preprocessor reuses the fitted encoders.
        self.trained = True
        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        The result of the base classifier's ``predict`` on the clean data.
        """
        # NOTE(review): this relies on ClaimClassifier.predict returning
        # probabilities rather than hard labels - confirm.
        X_clean = self._preprocessor(X_raw)
        return self.base_classifier.predict(X_clean)

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        One premium per input row: prediction * median claim * factor.
        """
        factor = 0.8  # 0.8 has taken account of both the inflation and investment returns expected
        return self.predict_claim_probability(X_raw) * self.y_median * factor

    def save_model(self):
        """Saves the class instance as a pickle file."""
        with open('part3_pricing_model.pickle', 'wb') as target:
            pickle.dump(self, target)

    def evaluate_architecture(self, X_test, Y_test):
        """Pre-process ``X_test`` and delegate to the base classifier's
        ``evaluate_architecture``."""
        X = self._preprocessor(X_test)
        return self.base_classifier.evaluate_architecture(X, Y_test)
class PricingModelLinear():
    """Pricing model whose classifier is chosen by ``find_best_model``.

    A premium is the predicted claim probability multiplied by the mean of
    the non-zero training claims and by a fixed pricing factor.
    """

    # Numeric columns, standardised with the training mean / std.
    _NUMERIC_HEADERS = [
        "drv_age1", 'vh_age', 'vh_cyl', 'vh_din', 'pol_bonus',
        'vh_sale_begin', 'vh_sale_end', 'vh_value', 'vh_speed',
        'drv_age_lic1', 'pol_duration', 'pol_sit_duration', 'drv_age2'
    ]
    # Categorical columns, one-hot encoded with one LabelBinarizer each.
    _CATEGORICAL_HEADERS = ['drv_sex1', 'vh_type', 'pol_coverage', 'pol_usage']

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an unfitted model.

        Parameters
        ----------
        calibrate_probabilities : bool, optional
            Stored as ``self.calibrate``; when true, ``fit`` hands the base
            classifier to ``fit_and_calibrate_classifier``.
        """
        self.y_mean = None
        self.calibrate = calibrate_probabilities

        # When calibrating, the base estimator must have a .fit(X, y) method
        # and either .predict_proba or a decision function returning
        # classification scores.
        self.base_classifier = ClaimClassifier()

    def _scale_numeric(self, X_raw, training=False):
        """Standardise the numeric columns with the training statistics."""
        numeric = np.array(X_raw[self._NUMERIC_HEADERS])
        if training:
            self.means = np.mean(numeric, axis=0)
            self.std_dev = np.std(numeric, axis=0)

        # A constant column has zero spread; dividing by 1 instead keeps it
        # at 0 rather than turning it into NaN/inf.
        safe_std = np.where(self.std_dev == 0, 1.0, self.std_dev)
        return (numeric - self.means) / safe_std

    def _encode_categorical(self, X_raw, training=False):
        """One-hot encode every categorical column, dropping one level."""
        encoders = [] if training else self.saved_binarizers
        encoded_columns = []
        for position, header in enumerate(self._CATEGORICAL_HEADERS):
            data = X_raw[header]
            if training:
                # A fresh encoder per column: re-fitting one shared object
                # would leave every saved encoder fitted on the last column.
                encoder = LabelBinarizer()
                encoded = encoder.fit_transform(data)
                encoders.append(encoder)
            else:
                encoded = encoders[position].transform(data)

            encoded = np.asarray(encoded)
            # With several indicator columns one is redundant; drop the last.
            if encoded.shape[1] > 1:
                encoded = encoded[:, :-1]
            encoded_columns.append(encoded)

        if training:
            self.saved_binarizers = encoders
        return encoded_columns

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw, training=False):
        """Data preprocessing function.

        Standardises the numeric columns and appends the one-hot encoding
        of every categorical column.  Earlier revisions kept only the last
        categorical column, so models pickled before this change have to be
        refit.

        Parameters
        ----------
        X_raw : DataFrame
            The raw data as downloaded; indexed by column name.
        training : bool, optional
            When true, the scaling statistics and the encoders are fitted
            on ``X_raw`` and stored on the instance; otherwise the stored
            ones are applied.

        Returns
        -------
        X: ndarray
            A clean data set that is used for training and prediction.
        """
        x_normed = self._scale_numeric(X_raw, training)
        encoded_columns = self._encode_categorical(X_raw, training)
        return np.concatenate([x_normed] + encoded_columns, axis=1)

    def fit(self, X_raw, y_raw, claims_raw):
        """Classifier training function.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded
        y_raw : array-like
            The binary target variable
        claims_raw : array-like
            Records the severity of claims

        Returns
        -------
        The fitted base classifier.
        """
        # Boolean masking on a plain array works the same for arrays and
        # for pandas objects, whatever their index.
        claims = np.asarray(claims_raw)
        self.y_mean = np.mean(claims[claims != 0])

        X_clean = self._preprocessor(X_raw, training=True)

        # THE FOLLOWING GETS CALLED IF YOU WISH TO CALIBRATE YOUR PROBABILITES
        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
        else:
            self.base_classifier = find_best_model(X_clean, y_raw)

        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        The probabilities returned as the second element of the base
        classifier's ``predict_probabilities(X_clean, pricing=True)``.
        """
        X_clean = self._preprocessor(X_raw)
        pred, prob_y = self.base_classifier.predict_probabilities(
            X_clean, pricing=True)
        return prob_y

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Parameters
        ----------
        X_raw : DataFrame
            This is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            A one dimensional array with one premium per input row:
            claim probability * mean claim severity * pricing factor.
        """
        # Pricing strategy: scale all prices down by a constant factor.
        premium_factor = 0.27
        premiums = self.predict_claim_probability(
            X_raw) * self.y_mean * premium_factor
        return np.array(premiums).flatten()

    def save_model(self):
        """Saves the class instance as a pickle file."""
        with open('part3_pricing_model_linear.pickle', 'wb') as target:
            pickle.dump(self, target)
class PricingModel():
    """Insurance pricing model: claim probability times mean claim severity.

    A base classifier estimates the probability that a policy makes a claim.
    The premium is that probability multiplied by the mean non-zero claim
    amount seen during ``fit`` and by ``PREMIUM_FACTOR``.

    The categorical encoders and the min-max scaler fitted on the training
    data are stored on the instance and reused at prediction time, so that
    training and prediction data are mapped to the same feature space.
    """

    # Raw columns fed to the classifier, in model-input order.
    FEATURE_COLUMNS = (
        'pol_coverage', 'vh_age', 'vh_din', 'vh_fuel', 'vh_sale_begin',
        'vh_sale_end', 'vh_speed', 'vh_value', 'vh_weight',
    )
    # Multiplier applied to the expected claim cost in ``predict_premium``.
    PREMIUM_FACTOR = 0.2775
    # File the fitted instance is pickled to by ``save_model``.
    MODEL_PATH = 'part3_pricing_model.pickle'
    # CSV read by ``load_data`` when no filename is supplied.
    TRAINING_DATA_PATH = 'part3_training_data.csv'

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an unfitted pricing model.

        Parameters
        ----------
        calibrate_probabilities : bool
            When True, ``fit`` goes through ``fit_and_calibrate_classifier``
            instead of calling the base classifier's ``fit`` directly.
        """
        self.y_mean = None
        self.y_std = None
        self.calibrate = calibrate_probabilities
        # Preprocessing state learned in ``fit``; None until the model is
        # fitted (and on instances unpickled from older versions).
        self._label_encoders = None
        self._scaler = None
        # =============================================================
        # NOTE: when calibrating, the base estimator must have a
        # .fit(X, y) method and either .predict_proba or a decision
        # function returning classification scores.
        # =============================================================
        self.base_classifier = ClaimClassifier(
            Insurance_NN_3())  # ADD YOUR BASE CLASSIFIER HERE

    def _complete_rows(self, X_raw):
        """Return a boolean array marking rows with no missing feature.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            Raw data containing at least ``FEATURE_COLUMNS``.

        Returns
        -------
        ndarray of bool, one entry per row of ``X_raw``.
        """
        features = X_raw[list(self.FEATURE_COLUMNS)]
        return features.notna().all(axis=1).to_numpy()

    @staticmethod
    def _take_rows(values, keep):
        """Select the rows of ``values`` flagged in the boolean mask ``keep``.

        pandas objects are filtered positionally and keep their type; anything
        else is converted to an ndarray first.
        """
        if hasattr(values, 'iloc'):
            return values.iloc[keep]
        return np.asarray(values)[keep]

    def _encode(self, frame, encoders=None):
        """Convert a DataFrame to floats, integer-encoding string columns.

        Parameters
        ----------
        frame : pandas.DataFrame
            Columns to convert.
        encoders : dict or None
            Mapping ``column position -> fitted LabelEncoder``. When None, a
            column is treated as categorical if it cannot be parsed as float,
            and a new encoder is fitted for it. When given, exactly the
            columns present in the mapping are encoded with the stored
            encoders.

        Returns
        -------
        (ndarray of float, dict)
            The converted data and the encoders that were used.

        Raises
        ------
        ValueError
            From ``LabelEncoder.transform`` when stored encoders meet a
            category they were not fitted on.
        """
        text = frame.to_numpy(dtype=str)
        # Write into a float array: storing codes back into the string array
        # could truncate them to the array's fixed string width.
        encoded = np.empty(text.shape, dtype=float)
        fitting = encoders is None
        if fitting:
            encoders = {}
        for col in range(text.shape[1]):
            values = text[:, col]
            numeric = None
            if fitting:
                try:
                    numeric = values.astype(float)
                except ValueError:
                    encoders[col] = LabelEncoder().fit(values)
            if col in encoders:
                encoded[:, col] = encoders[col].transform(values)
            elif numeric is not None:
                encoded[:, col] = numeric
            else:
                encoded[:, col] = values.astype(float)
        return encoded, encoders

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw, training=False):
        """Data preprocessing function.

        Selects ``FEATURE_COLUMNS``, drops rows with any missing feature,
        integer-encodes the string columns and min-max scales the result.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded.
        training : bool
            When True, the encoders and scaler are fitted on ``X_raw`` and
            stored on the instance. When False, the stored ones are reused;
            if none are stored (model not fitted yet, or unpickled from an
            older version) they are fitted on ``X_raw`` without being stored,
            which is the previous behaviour.

        Returns
        -------
        X : ndarray of float32
            One row per complete row of ``X_raw`` (see ``_complete_rows``).
        """
        features = X_raw.loc[self._complete_rows(X_raw),
                             list(self.FEATURE_COLUMNS)]
        encoders = getattr(self, '_label_encoders', None)
        scaler = getattr(self, '_scaler', None)

        if training or encoders is None or scaler is None:
            encoded, encoders = self._encode(features)
            scaler = preprocessing.MinMaxScaler()
            scaled = scaler.fit_transform(encoded)
            if training:
                self._label_encoders = encoders
                self._scaler = scaler
        else:
            encoded, _ = self._encode(features, encoders)
            scaled = scaler.transform(encoded)
        return scaled.astype(np.float32)

    def fit(self, X_raw, y_raw, claims_raw, prepro=True):
        """Classifier training function.

        Stores the mean and standard deviation of the non-zero claims, fits
        the base classifier (calibrated if requested) and pickles the
        instance via ``save_model``.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded. When ``prepro`` is False it is passed
            to the classifier untouched and must already be clean.
        y_raw : array-like
            One dimensional binary target, aligned row by row with ``X_raw``.
        claims_raw : array-like
            One dimensional array recording the severity of claims.
        prepro : bool
            Whether to run ``_preprocessor`` on ``X_raw``.

        Returns
        -------
        self : the fitted model
        """
        # Work on a plain array: positional indexing of a pandas Series with
        # a non-default index would otherwise be interpreted as label lookup.
        claims = np.asarray(claims_raw, dtype=float)
        severities = claims[claims != 0]
        self.y_mean = np.mean(severities)
        self.y_std = np.std(severities)
        print(self.y_mean, self.y_std)

        if prepro:
            keep = self._complete_rows(X_raw)
            X_clean = self._preprocessor(X_raw, training=True)
            if not keep.all():
                # The preprocessor dropped incomplete rows; drop the same
                # rows from the target so features and labels stay aligned.
                y_raw = self._take_rows(y_raw, keep)
        else:
            X_clean = X_raw

        # THE FOLLOWING GETS CALLED IF YOU WISH TO CALIBRATE YOUR PROBABILITES
        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
        else:
            self.base_classifier.fit(X_clean, y_raw)
        self.save_model()
        return self

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded.

        Returns
        -------
        ndarray
            A one dimensional array of the same length as the input with the
            probability of belonging to the POSITIVE class (that had
            accidents). Rows with a missing feature cannot be scored and get
            NaN.
        """
        keep = self._complete_rows(X_raw)
        if not keep.any():
            return np.full(keep.shape[0], np.nan)

        X_clean = self._preprocessor(X_raw)
        # Probabilities for the positive class (label 1).
        positive = self.base_classifier.predict_proba(X_clean)[:, 1]
        if keep.all():
            return positive

        # Re-expand to the input length so callers can match outputs to rows.
        probabilities = np.full(keep.shape[0], np.nan)
        probabilities[keep] = positive
        return probabilities

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        The premium is the claim probability times the mean non-zero claim
        stored by ``fit``, scaled by ``PREMIUM_FACTOR``.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded.

        Returns
        -------
        numpy.ndarray
            A one dimensional array of the same length as the input holding
            the premium for each row.
        """
        return (self.predict_claim_probability(X_raw) * self.y_mean
                * self.PREMIUM_FACTOR)

    def save_model(self):
        """Saves the class instance as a pickle file (``MODEL_PATH``)."""
        with open(self.MODEL_PATH, 'wb') as target:
            pickle.dump(self, target)

    # -------- NEW FUNCTIONS -----------

    def load_data(self, filename=None):
        """Load the training data from a CSV file.

        Parameters
        ----------
        filename : str, optional
            Path of the CSV file; defaults to ``TRAINING_DATA_PATH``. The
            file must contain ``claim_amount`` and ``made_claim`` columns.

        Returns
        -------
        (x, y, nonzero_claims, claims) : tuple
            x: DataFrame of all columns except the two targets.
            y: the ``made_claim`` column.
            nonzero_claims: ndarray of the ``claim_amount`` values != 0.
            claims: the full ``claim_amount`` column.
        """
        dat = pd.read_csv(filename or self.TRAINING_DATA_PATH)
        x = dat.drop(columns=["claim_amount", "made_claim"])
        y = dat["made_claim"]
        y1 = dat["claim_amount"]
        y2 = y1[y1 != 0]
        return x, y, y2.to_numpy(), y1

    def set_axis_style(self, ax, labels):
        """Put one labelled tick per entry of ``labels`` on the x axis of ``ax``."""
        ax.get_xaxis().set_tick_params(direction='out')
        ax.xaxis.set_ticks_position('bottom')
        ax.set_xticks(np.arange(1, len(labels) + 1))
        ax.set_xticklabels(labels)
        ax.set_xlim(0.25, len(labels) + 0.75)
        ax.set_xlabel('Sample name')

    def evaluate_input1(self, X_raw):
        """Save box and violin plots of every column of ``X_raw``.

        Writes ``box_3.pdf`` and ``violin_3.pdf``.

        Parameters
        ----------
        X_raw : ndarray
            Two dimensional array, one column per attribute.
        """
        attributes = [X_raw[:, i] for i in range(np.shape(X_raw)[1])]
        # One label per plotted attribute, numbered from 1.
        labels = list(range(1, len(attributes) + 1))

        fig, ax1 = plt.subplots(figsize=(18, 4))
        ax1.boxplot(attributes)
        self.set_axis_style(ax1, labels)
        fig.subplots_adjust(bottom=0.15, wspace=0.05)
        ax1.set_xlabel("Attribute Type")
        ax1.set_ylabel("Attribute Value")
        fig.savefig("box_3.pdf", bbox_inches='tight')

        # Reuse the same axes for the violin plot.
        ax1.cla()
        ax1.violinplot(attributes)
        self.set_axis_style(ax1, labels)
        fig.subplots_adjust(bottom=0.15, wspace=0.05)
        ax1.set_xlabel("Attribute Type")
        ax1.set_ylabel("Attribute Value")
        fig.savefig("violin_3.pdf", bbox_inches='tight')
        plt.close(fig)  # release the figure once it is on disk

    def evaluate_input2(self, x, y):
        """Save per-attribute box plots for the two classes.

        Writes ``compare_box_3.pdf`` with one subplot for the negative class
        and one for the positive class. Tick labels are read from the header
        row of ``part3_training_data.csv``.
        """
        (neg_x, neg_y), (pos_x, pos_y) = self.separate_pos_neg(x, y)
        n_attributes = np.shape(neg_x)[1]
        attributes1 = [neg_x[:, i] for i in range(n_attributes)]
        attributes2 = [pos_x[:, i] for i in range(n_attributes)]

        fig, axs = plt.subplots(2, figsize=(11, 11))
        axs[0].boxplot(attributes1)
        axs[1].boxplot(attributes2)
        labels = np.genfromtxt("part3_training_data.csv",
                               dtype=str,
                               delimiter=',',
                               max_rows=1)
        self.set_axis_style(axs[0], labels)
        self.set_axis_style(axs[1], labels)
        axs[0].set(xlabel="Attribute Type", ylabel="Attribute Value")
        axs[0].set_title("No Claim")
        axs[1].set(xlabel="Attribute Type", ylabel="Attribute Value")
        axs[1].set_title("Claim")
        fig.subplots_adjust(bottom=0.15, wspace=0.05)
        fig.savefig("compare_box_3.pdf", bbox_inches='tight')
        plt.close(fig)  # release the figure once it is on disk

    def evaluate_input3(self, x, y, split=0):
        """Print per-attribute statistics comparing the two classes.

        For every attribute, prints the class means, the percentage
        difference of the positive mean relative to the negative mean and the
        two-sample Kolmogorov-Smirnov result, then lists the attributes whose
        KS statistic is > 0.1 with a p-value < 0.001.

        Parameters
        ----------
        x, y : ndarray
            Features and binary labels; only used when ``split`` is 0.
        split : 0 or tuple
            Either 0 (separate ``x``/``y`` here) or an already separated
            ``((neg_x, neg_y), (pos_x, pos_y))`` tuple.
        """
        if split == 0:
            (neg_x, neg_y), (pos_x, pos_y) = self.separate_pos_neg(x, y)
        else:
            (neg_x, neg_y), (pos_x, pos_y) = split
            # Only a caller-supplied split has shapes worth reporting; the
            # default ``split=0`` is not indexable.
            print(neg_x.shape, pos_x.shape)

        attributes1 = []
        attributes2 = []
        difference = []
        difference2 = []
        for i in range(np.shape(neg_x)[1]):
            attributes1.append(np.mean(neg_x[:, i]))
            attributes2.append(np.mean(pos_x[:, i]))
            difference.append(
                ((attributes2[i] - attributes1[i]) * 100) / attributes1[i])
            difference2.append(stats.ks_2samp(neg_x[:, i], pos_x[:, i]))
            print(i)
        print(attributes1)
        print(attributes2)
        print(difference)
        print(difference2)
        for i, ks_result in enumerate(difference2):
            if ks_result[0] > 0.1 and ks_result[1] < 0.001:
                print(i, ks_result)

    def separate_pos_neg(self, x, y):
        """Split samples into negative and positive groups.

        Parameters
        ----------
        x : ndarray
            Two dimensional array, one row per sample.
        y : array-like
            Labels, one per row of ``x``; a sample is positive when its label
            equals 1 and negative otherwise.

        Returns
        -------
        ((neg_x, neg_y), (pos_x, pos_y)) : tuple
            Row order is preserved within each group. Labels are returned as
            float32 arrays.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        # A boolean mask selects each group in one pass instead of growing
        # the arrays row by row.
        positive = (y.reshape(y.shape[0], -1) == 1).all(axis=1)
        # Same dtype the row-by-row stacking onto a float32 array produced.
        out_dtype = np.result_type(np.float32, x.dtype)

        pos_train_x = x[positive].astype(out_dtype)
        neg_train_x = x[~positive].astype(out_dtype)
        pos_train_y = y[positive].astype(np.float32)
        neg_train_y = y[~positive].astype(np.float32)
        return (neg_train_x, neg_train_y), (pos_train_x, pos_train_y)

    def integer_encode(self, x):
        """Encode all string columns with a unique number per category.

        Encoders are fitted on ``x`` itself and are not stored on the
        instance.

        Parameters
        ----------
        x : pandas.DataFrame
            Data to convert.

        Returns
        -------
        ndarray of float, same shape as ``x``.
        """
        encoded, _ = self._encode(x)
        return encoded