Ejemplo n.º 1
0
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

from part2_claim_classifier import ClaimClassifier
# Load the whole data set as a plain array: columns 0-8 are used as the
# features and the last column as the label.
dataset = pd.read_csv("part2_data.csv").values
X, Y = dataset[:, :9], dataset[:, -1]

# Train on every row, then run the classifier's own architecture evaluation
# on the same rows.
nn = ClaimClassifier()
nn.fit(X, Y)
nn.evaluate_architecture(X, Y)

# Disabled steps, kept for reference:
# nn.save_model()
# print(nn.predict(X))
# model = nn.fit_skl(X, Y)

# Score the trained model again, this time only on the rows labelled 1.
data_test = dataset[dataset[:, -1] == 1]
X, Y = data_test[:, :9], data_test[:, -1]
y_pred = nn.predict(X)
print(y_pred)
print(accuracy_score(Y, y_pred))
class PricingModel:
    """Price insurance policies from a claim classifier and claim severity.

    The premium for a policy is the classifier's claim score multiplied by
    the median non-zero claim seen during training and by ``PREMIUM_FACTOR``.
    """

    # 0.8 has taken account of both the inflation and investment returns
    # expected.
    PREMIUM_FACTOR = 0.8

    def __init__(self, epoch=100, batchsize=64, learnrate=0.0001, neurons=9, num_features=13, calibrate_probabilities=False):
        """Create an untrained pricing model.

        Parameters
        ----------
        epoch, batchsize, learnrate, neurons, num_features
            Forwarded positionally, in this order, to ``ClaimClassifier``.
        calibrate_probabilities : bool
            When True, ``fit`` trains through
            ``fit_and_calibrate_classifier`` instead of calling the
            classifier's own ``fit``.
        """
        self.y_median = None
        self.calibrate = calibrate_probabilities
        self.trained = False
        # Per-column encoders / fill values learned by `_preprocessor`
        # while the model is being fitted.
        self.label_binarizer = {}
        self.numeric_fill = {}

        # NOTE (calibration): the base estimator must have a `.fit(X, y)`
        # method and either a `.predict_proba` method or a decision function
        # returning classification scores.
        self.base_classifier = ClaimClassifier(epoch, batchsize, learnrate, neurons, num_features)

    def _balance_dataset(self, X_y_raw):
        """Return a shuffled subset holding equally many class 0 and class 1 rows.

        The larger class is randomly down-sampled (via ``np.random``) to the
        size of the smaller one; rows whose label is neither 0 nor 1 are
        dropped. The input array is not modified.

        Parameters
        ----------
        X_y_raw : ndarray
            Two dimensional array whose last column is the class label.

        Returns
        -------
        X_y_balanced : ndarray
            The balanced rows in random order.
        """
        labels = X_y_raw[:, -1]
        # Boolean indexing copies, so the in-place shuffles below never
        # touch the caller's array.
        class_0 = X_y_raw[labels == 0]
        class_1 = X_y_raw[labels == 1]
        per_class = min(class_0.shape[0], class_1.shape[0])

        np.random.shuffle(class_0)
        if class_1.shape[0] > per_class:
            # Only needed when class 1 is the majority; skipping it otherwise
            # keeps the random sequence of the common case unchanged.
            np.random.shuffle(class_1)

        X_y_balanced = np.vstack((class_0[:per_class], class_1[:per_class]))

        # Shuffle the combined set so the classes are interleaved.
        np.random.shuffle(X_y_balanced)
        return X_y_balanced

    def _preprocessor(self, X_raw):
        """Select, impute and encode the model features.

        Numeric columns have missing values replaced by the column mean;
        other columns have missing values replaced by ``"empty"`` and are
        then one-hot encoded with a per-column ``LabelBinarizer``. While
        ``self.trained`` is False the means and encoders are learned from
        ``X_raw``; afterwards the stored ones are reused so that prediction
        data is transformed exactly like the training data.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            Raw data containing at least the columns listed in
            ``features_to_keep``. It is not modified.

        Returns
        -------
        X : pandas.DataFrame
            The numeric features followed by the one-hot columns (named
            ``<feature>_<class>``), indexed like ``X_raw``.
        """
        features_to_keep = ['pol_coverage', 'vh_age', 'vh_din', 'vh_fuel', 'vh_sale_begin', 'vh_sale_end', 'vh_speed', 'vh_weight']
        # Copy so the assignments below never write into the caller's frame.
        X_pre = X_raw[features_to_keep].copy()
        fitting = not self.trained

        # Instances created before `numeric_fill` existed (e.g. unpickled
        # ones) do not carry the attribute yet.
        numeric_fill = getattr(self, "numeric_fill", None)
        if numeric_fill is None:
            numeric_fill = self.numeric_fill = {}

        for col in features_to_keep:
            if pd.api.types.is_numeric_dtype(X_pre[col]):
                fill_value = None if fitting else numeric_fill.get(col)
                if fill_value is None:
                    # `mean` skips missing values.
                    fill_value = X_pre[col].mean()
                    if fitting:
                        numeric_fill[col] = fill_value
                # `fillna` returns a new Series; it must be assigned back.
                X_pre[col] = X_pre[col].fillna(fill_value)
                continue

            X_pre[col] = X_pre[col].astype(object).fillna("empty")

            if col not in self.label_binarizer:
                self.label_binarizer[col] = LabelBinarizer()
            encoder = self.label_binarizer[col]
            if fitting:
                encoded = encoder.fit_transform(X_pre[col])
            else:
                encoded = encoder.transform(X_pre[col])

            classes = list(encoder.classes_)
            if encoded.shape[1] != len(classes):
                # A two-class feature is encoded as a single column that
                # flags the second class.
                classes = classes[-1:]
            # Prefix with the feature name so equal labels in different
            # features (e.g. "empty") cannot collide in the join.
            one_hot = pd.DataFrame(encoded,
                                   columns=[f"{col}_{label}" for label in classes],
                                   index=X_pre.index)
            X_pre = X_pre.drop(columns=col).join(one_hot)

        return X_pre

    def fit(self, X_raw, y_raw, claims_raw):
        """Train the pricing model.

        Stores the median of the non-zero claims, learns the preprocessing
        from ``X_raw``, balances the classes and fits the base classifier.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded.
        y_raw : pandas.Series, pandas.DataFrame or array-like
            The binary target. Pandas input is aligned to ``X_raw`` by
            index; anything else is matched by position.
        claims_raw : array-like
            One dimensional record of the severity of each claim.

        Returns
        -------
        The fitted base classifier.
        """
        # Work positionally: indexing a pandas Series with the result of
        # `np.where` would be a label lookup.
        claims = np.asarray(claims_raw, dtype=float).ravel()
        self.y_median = np.median(claims[claims != 0])

        # Always learn the preprocessing afresh, even when re-fitting.
        self.trained = False
        self.label_binarizer = {}
        self.numeric_fill = {}
        X_clean = self._preprocessor(X_raw)

        if isinstance(y_raw, (pd.Series, pd.DataFrame)):
            y_values = y_raw.reindex(X_clean.index).to_numpy()
        else:
            y_values = np.asarray(y_raw)
        # `column_stack` turns a 1-D target into the last column.
        X_Y_clean = np.column_stack((X_clean.to_numpy(), y_values))

        X_Y_clean_balanced = self._balance_dataset(X_Y_clean)
        X_clean_balanced = pd.DataFrame(X_Y_clean_balanced[:, :-1])
        y_clean_balanced = pd.DataFrame(X_Y_clean_balanced[:, -1:])

        if self.calibrate:
            # NOTE(review): `fit_and_calibrate_classifier` is not defined in
            # the visible part of this file; it must be provided by the module.
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean_balanced, y_clean_balanced)
        else:
            fitted = self.base_classifier.fit(X_clean_balanced, y_clean_balanced)
            # Keep the current classifier when its `fit` returns nothing.
            if fitted is not None:
                self.base_classifier = fitted

        self.trained = True
        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Score each policy with the base classifier.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded.

        Returns
        -------
        Whatever ``self.base_classifier.predict`` returns for the
        preprocessed data.
        """
        X_clean = self._preprocessor(X_raw)
        # NOTE(review): this calls `predict`, not `predict_proba`; whether
        # the result is a probability or a hard label depends on the
        # classifier - confirm against ClaimClassifier.
        return self.base_classifier.predict(X_clean)

    def predict_premium(self, X_raw):
        """Predict premiums based on the pricing model.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded.

        Returns
        -------
        The claim score of each policy multiplied by the median non-zero
        training claim and by ``PREMIUM_FACTOR``.
        """
        return self.predict_claim_probability(X_raw) * self.y_median * self.PREMIUM_FACTOR

    def save_model(self):
        """Save the class instance as ``part3_pricing_model.pickle``."""
        with open('part3_pricing_model.pickle', 'wb') as target:
            pickle.dump(self, target)

    def evaluate_architecture(self, X_test, Y_test):
        """Preprocess ``X_test`` and delegate to the base classifier's
        ``evaluate_architecture``, returning its result."""
        X = self._preprocessor(X_test)
        return self.base_classifier.evaluate_architecture(X, Y_test)