Ejemplo n.º 1
0
def quadratic_discriminant_analysis(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
        Fit a Quadratic Discriminant Analysis (QDA) classifier and label x_test.

        The unfitted estimator is first handed to cross_val_analysis (with
        plotting disabled) on the training data, then fitted on the full
        training set.

        x_train, x_test: feature matrices passed straight to scikit-learn.
        y_train: training labels.
        y_test: accepted for signature symmetry; not read by this function.
        compute_threshold: when exactly True, the class-1 probability of each
            test row is compared against the threshold returned by
            get_best_thresh(y_train, <training probabilities>) and mapped to
            defs.posCode / defs.negCode; otherwise model.predict is used.

        Returns (test predictions, cross-validation metrics, fitted model).
    '''
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    qda_model = QuadraticDiscriminantAnalysis()
    cv_metrics = cross_val_analysis(classifier=qda_model, x=x_train, y=y_train, plot=False)
    qda_model.fit(x_train, y_train)

    # Plain predict() path: anything other than the literal True skips thresholding.
    if compute_threshold is not True:
        return qda_model.predict(x_test), cv_metrics, qda_model

    test_proba = qda_model.predict_proba(x_test)
    train_proba = qda_model.predict_proba(x_train)

    # The cut-off is tuned on training probabilities only.
    cutoff = get_best_thresh(y_train, train_proba)
    test_labels = np.where(test_proba[:, 1] >= cutoff, defs.posCode, defs.negCode)
    return test_labels, cv_metrics, qda_model
def QDA(trainX,trainY,validX,testX):
    """Fit QDA on the training split and score all three splits.

    Returns a 3-tuple with the second predict_proba column for the train,
    validation and test matrices, in that order.
    """
    # NOTE(review): the positional 0.1 binds to the estimator's first
    # constructor parameter; in scikit-learn's documented signature that is
    # `priors`, and newer releases make the parameters keyword-only --
    # confirm whether `reg_param=0.1` was intended.
    estimator = QuadraticDiscriminantAnalysis(0.1)
    estimator.fit(trainX, trainY)
    return tuple(
        estimator.predict_proba(features)[:, 1]
        for features in (trainX, validX, testX)
    )
Ejemplo n.º 3
0
class myQDABinary(myModel):
    """Thin wrapper around QuadraticDiscriminantAnalysis for binary targets.

    pandas DataFrames are cast to float32 before being handed to the
    estimator; any other input is passed through unchanged.
    """

    def make(self, make_params):
        """Create the underlying estimator from a dict of constructor kwargs."""
        self.model = QuadraticDiscriminantAnalysis(**make_params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the estimator.

        xtest / ytest are accepted but not used here. fit_params is an
        optional dict of extra keyword arguments forwarded to the
        estimator's fit(); None means no extras.
        """
        # None instead of a shared mutable {} default.
        extra = {} if fit_params is None else fit_params
        if isinstance(xtrain, pd.DataFrame):
            self.model.fit(xtrain.astype('float32'), ytrain.astype('float32'), **extra)
        else:
            self.model.fit(xtrain, ytrain, **extra)

    def predict(self, xs, threshold=0.5):
        """Return the estimator's class predictions.

        `threshold` is part of the signature but is not applied by this
        implementation.
        """
        if isinstance(xs, pd.DataFrame):
            return self.model.predict(xs.astype('float32'))
        return self.model.predict(xs)

    def predict_proba(self, xs):
        """Return predicted probabilities.

        DataFrame input yields only the second probability column; any other
        input yields the estimator's full predict_proba output, and a 1-D
        input is reshaped to a single row first.
        NOTE(review): the two branches return different shapes -- confirm
        which one callers rely on before unifying them.
        """
        if isinstance(xs, pd.DataFrame):
            return self.model.predict_proba(xs.astype('float32'))[:, 1]
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1, -1))
        return self.model.predict_proba(xs)
Ejemplo n.º 4
0
def BuildFinalModel():
    """Second-stage QDA with pseudo-labelled test rows; writes submission.csv.

    Reads the module-level `train`, `test` and `cols`. For each of the 512
    values of 'wheezy-copper-turtle-magic', test rows whose 'target' is
    <= 0.01 or >= 0.99 are hardened to 0/1 and appended to that subset's
    training rows. An 11-fold stratified CV then fills out-of-fold
    predictions for the real training rows and averages test predictions.
    Prints the CV AUC and writes the predictions into a copy of
    sample_submission.csv.
    """
    global train, test, cols
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    magic_col = 'wheezy-copper-turtle-magic'

    # One model family per value of the magic column.
    for k in range(512):
        train_k = train[train[magic_col] == k]
        train_rows = train_k.index
        test_k = test[test[magic_col] == k]
        has_test_rows = len(test_k) != 0
        if not has_test_rows:
            print("WARNING_PREDICTION : Zero length test data for "
                  " k: ", k, " Length(train2): ", len(train_k), " Length(test2): ", len(test_k))

        # Pseudo labels: keep only confident test rows and harden them to 0/1.
        confident = (test_k['target'] <= 0.01) | (test_k['target'] >= 0.99)
        pseudo = test_k[confident].copy()
        is_positive = pseudo['target'] >= 0.5
        pseudo.loc[is_positive, 'target'] = 1
        pseudo.loc[~is_positive, 'target'] = 0
        train_k_aug = pd.concat([train_k, pseudo], axis=0).reset_index(drop=True)

        # Feature selection fitted on the augmented training rows.
        selector = VarianceThreshold(threshold=1.5).fit(train_k_aug[cols])
        X_aug = selector.transform(train_k_aug[cols])
        X_real = selector.transform(train_k[cols])
        X_test = selector.transform(test_k[cols]) if has_test_rows else None

        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for fit_rows, held_out in skf.split(X_aug, train_k_aug['target']):
            # Real training rows come first in X_aug, so positions below
            # len(X_real) are the non-pseudo rows; only those go into oof.
            held_out_real = held_out[held_out < len(X_real)]

            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(X_aug[fit_rows, :], train_k_aug.loc[fit_rows, 'target'])
            oof[train_rows[held_out_real]] = clf.predict_proba(X_real[held_out_real, :])[:, 1]
            if has_test_rows:
                preds[test_k.index] += clf.predict_proba(X_test)[:, 1] / skf.n_splits

    # Cross-validated AUC over the real training rows.
    auc = roc_auc_score(train['target'], oof)
    print('Pseudo Labeled QDA scores CV =', round(auc, 5))

    # Write the averaged test predictions into the submission template.
    submission = pd.read_csv(os.path.join(dataDirPath, 'sample_submission.csv'))
    submission['target'] = preds
    submission.to_csv(os.path.join(scriptDirPath, 'submission.csv'), index=False)
def BuildFirstModel():
    """First-stage QDA: one model family per 'wheezy-copper-turtle-magic' value.

    Reads the module-level `train`, `test` and `cols`. For each of the 512
    magic values it selects features with a variance threshold, runs an
    11-fold stratified CV of QDA, fills out-of-fold predictions for the
    training rows and averages the per-fold test predictions. Prints the CV
    AUC and stores the test predictions in test['target'].
    """
    global train, test, cols
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    magic_col = 'wheezy-copper-turtle-magic'

    for i in range(512):
        train_i = train[train[magic_col] == i]
        assert (len(train_i) != 0)
        test_i = test[test[magic_col] == i]
        has_test_rows = len(test_i) != 0
        if not has_test_rows:
            print("WARNING : Zero length test data for "
                  " i: ", i, " Length(train2): ", len(train_i),
                  " Length(test2): ", len(test_i))

        # Remember the original row labels before renumbering the subset.
        train_rows = train_i.index
        test_rows = test_i.index
        train_i = train_i.reset_index(drop=True)

        # Feature selection fitted on this subset's training rows.
        selector = VarianceThreshold(threshold=1.5).fit(train_i[cols])
        X_train = selector.transform(train_i[cols])
        X_test = selector.transform(test_i[cols]) if has_test_rows else None

        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for fit_rows, held_out in skf.split(X_train, train_i['target']):
            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(X_train[fit_rows, :], train_i.loc[fit_rows, 'target'])
            held_out_proba = clf.predict_proba(X_train[held_out, :])
            oof[train_rows[held_out]] = held_out_proba[:, 1]
            if has_test_rows:
                # Each fold contributes 1/n_splits of the final test prediction.
                preds[test_rows] += clf.predict_proba(X_test)[:, 1] / skf.n_splits

    # Cross-validated AUC over all training rows.
    auc = roc_auc_score(train['target'], oof)
    print('QDA scores CV =', round(auc, 5))

    # Publish the test predictions on the shared test frame.
    test['target'] = preds
class FaceClassifier():
    """Selects one scikit-learn classifier by FaceClassifierModels member."""

    def __init__(self, classifier=FaceClassifierModels.DEFAULT):
        kinds = FaceClassifierModels
        # Ordered (member, factory) pairs; the first member equal to
        # `classifier` wins. No match leaves self._clf as None.
        factories = (
            (kinds.LINEAR_SVM,
             lambda: SVC(C=1.0, kernel="linear", probability=True)),
            (kinds.NAIVE_BAYES, lambda: GaussianNB()),
            (kinds.RBF_SVM,
             lambda: SVC(C=1, kernel='rbf', probability=True, gamma=2)),
            (kinds.NEAREST_NEIGHBORS, lambda: KNeighborsClassifier(1)),
            (kinds.DECISION_TREE,
             lambda: DecisionTreeClassifier(max_depth=5)),
            (kinds.RANDOM_FOREST,
             lambda: RandomForestClassifier(max_depth=5,
                                            n_estimators=10,
                                            max_features=1)),
            (kinds.NEURAL_NET, lambda: MLPClassifier(alpha=1)),
            (kinds.ADABOOST, lambda: AdaBoostClassifier()),
            (kinds.QDA, lambda: QuadraticDiscriminantAnalysis()),
        )

        self._clf = None
        for member, build in factories:
            if classifier == member:
                self._clf = build()
                break
        print("classifier={}".format(FaceClassifierModels(classifier)))

    def fit(self, embeddings, labels):
        """Train the selected classifier on embeddings and their labels."""
        self._clf.fit(embeddings, labels)

    def predict(self, vec):
        """Return class probabilities (predict_proba), not hard labels."""
        return self._clf.predict_proba(vec)
class QuadraticDiscriminantAnalysisImpl():
    """Wrapper that builds an SKLModel from stored hyperparameters at fit time."""

    def __init__(self,
                 priors=None,
                 reg_param=0.0,
                 store_covariance=False,
                 tol=0.0001,
                 store_covariances=None):
        # Kept as a dict so fit() can expand it into the SKLModel constructor.
        self._hyperparams = dict(
            priors=priors,
            reg_param=reg_param,
            store_covariance=store_covariance,
            tol=tol,
            store_covariances=store_covariances,
        )

    def fit(self, X, y=None):
        """Create a fresh SKLModel and fit it; y is forwarded only when given."""
        estimator = SKLModel(**self._hyperparams)
        self._sklearn_model = estimator
        fit_args = (X,) if y is None else (X, y)
        estimator.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's predict."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's predict_proba."""
        return self._sklearn_model.predict_proba(X)
Ejemplo n.º 8
0
def qda(np_train_x, np_train_y, np_test_x, np_test_y, verified_num,
        rejected_num, p):
    """Fit QDA once and print error rates for each probability threshold in p.

    For every threshold, a test row is predicted 1 when its class-1
    probability is strictly greater than the threshold, else 0. The printed
    rates are divided by the caller-supplied counts: the miss-classification
    total by (rejected_num + verified_num), wrong rejections (truth 1,
    predicted 0) by verified_num, and wrong verifications (truth 0,
    predicted 1) by rejected_num. Nothing is returned.
    """
    model_QDA = QuadraticDiscriminantAnalysis()
    model_QDA.fit(np_train_x, np_train_y)

    # The probabilities do not depend on the threshold, so compute them once
    # instead of re-running predict_proba for every entry of p.
    positive_proba = model_QDA.predict_proba(np_test_x)[:, 1]

    for prob in p:
        predicted_values_QDA = np.where(positive_proba > prob, 1, 0)

        total_miss_classified_QDA = 0
        reject_wrong_QDA = 0
        verify_wrong_QDA = 0
        for i, predicted in enumerate(predicted_values_QDA):
            actual = np_test_y[i]
            total_miss_classified_QDA += abs(actual - predicted)
            if actual == 1 and predicted == 0:
                reject_wrong_QDA += 1
            if actual == 0 and predicted == 1:
                verify_wrong_QDA += 1
        print("\n----------------------Quadratic Discriminant Analysis prob:",
              prob, "--------------------")
        print("miss-classification rate :",
              total_miss_classified_QDA / (rejected_num + verified_num),
              "\nFalse negative rate (type1 error) :",
              reject_wrong_QDA / verified_num,
              "\nFalse positive rate (type2 error) :",
              verify_wrong_QDA / rejected_num)
def test_qda():
    # QDA must implement fit/predict and reproduce the labels of the toy data.
    estimator = QuadraticDiscriminantAnalysis()
    fitted_labels = estimator.fit(X6, y6).predict(X6)
    assert_array_equal(fitted_labels, y6)

    # The X7 inputs must work as well (the original test calls this "1D data").
    labels_x7 = estimator.fit(X7, y6).predict(X7)
    assert_array_equal(labels_x7, y6)

    # Probability estimates: thresholding column 1 at 0.5 (shifted by +1)
    # must reproduce y6, and log-probabilities must agree with probabilities.
    proba_x7 = estimator.predict_proba(X7)
    assert_array_equal((proba_x7[:, 1] > 0.5) + 1, y6)
    log_proba_x7 = estimator.predict_log_proba(X7)
    assert_array_almost_equal(np.exp(log_proba_x7), proba_x7, 8)

    # With the y7 labelling QDA is not expected to separate the classes.
    labels_y7 = estimator.fit(X6, y7).predict(X6)
    assert np.any(labels_y7 != y7)

    # Every class needs at least 2 samples, so fitting y4 must fail.
    with pytest.raises(ValueError):
        estimator.fit(X6, y4)
Ejemplo n.º 10
0
def lda(df, headers, title):
    """Fit LDA and QDA on the first 80% of df and evaluate on the rest.

    df: DataFrame containing the feature columns in `headers` plus the
        'cho_b' and 'cho2_b' columns; NaNs are replaced by 0 in both splits.
    headers: list of feature column names.
    title: suffix used in the evaluation and plot titles.

    Both models are evaluated through utils.evaluate / utils.plot_eval. When
    exactly two features are given, the class-1 probability surface and the
    0.5 decision contour of each model are drawn in a two-row figure.

    Returns (fitted LDA model, fitted QDA model).
    """
    lda_clf = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    qda_clf = QuadraticDiscriminantAnalysis(store_covariance=True)

    # Chronological 80/20 split; the cut point is computed once.
    split = int(len(df) * 0.8)
    df_train = df[:split].reset_index(drop=True).fillna(0)
    df_test = df[split:].reset_index(drop=True).fillna(0)

    lda_clf.fit(df_train[headers], df_train['cho2_b'])
    qda_clf.fit(df_train[headers], df_train['cho2_b'])

    # NOTE(review): the models are trained on 'cho2_b' but scored against
    # 'cho_b', and the scatter plots below mix both columns -- confirm that
    # this is intentional.
    y_pred = lda_clf.predict(df_test[headers])
    y = df_test['cho_b']
    utils.evaluate(y, y_pred, 0, 'LDA ' + title)
    utils.plot_eval(df_test, y, y_pred, title='LDA ' + title)
    y_pred = qda_clf.predict(df_test[headers])
    utils.evaluate(y, y_pred, 0, 'QDA ' + title)
    utils.plot_eval(df_test, y, y_pred, title='QDA ' + title)

    # Decision surfaces can only be drawn for two features.
    if len(headers) == 2:
        cho_true = df_test[df_test['cho2_b'] == True]
        cho_false = df_test[df_test['cho_b'] == False]

        plt.figure(figsize=(12, 8))
        plt.subplot(2, 1, 1)
        # Per-axes title: suptitle is figure-wide, so the second call used to
        # overwrite 'LDA' with 'QDA' and neither subplot was labelled.
        plt.title('LDA')
        plt.scatter(cho_false[headers[0]], cho_false[headers[1]], label='CHO false', s=8, marker='o')
        plt.scatter(cho_true[headers[0]], cho_true[headers[1]], label='CHO true', s=15, marker='o')

        nx, ny = 200, 100
        x_min, x_max = plt.xlim()
        y_min, y_max = plt.ylim()
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                             np.linspace(y_min, y_max, ny))
        # The tiny offset on the y coordinates is kept from the original code.
        Z = lda_clf.predict_proba(np.c_[xx.ravel(), yy.ravel()+1/1000000000000])
        Z = Z[:, 1].reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap='RdBu',
                       norm=colors.Normalize(0., 1.), zorder=0)
        plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='white')
        plt.legend()

        plt.subplot(2, 1, 2)
        plt.title('QDA')
        plt.scatter(cho_false[headers[0]], cho_false[headers[1]], label='CHO false', s=3, marker='o')
        plt.scatter(cho_true[headers[0]], cho_true[headers[1]], label='CHO true', s=5, marker='x')
        nx, ny = 200, 100
        x_min, x_max = plt.xlim()
        y_min, y_max = plt.ylim()
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                             np.linspace(y_min, y_max, ny))
        Z = qda_clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
        Z = Z[:, 1].reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap='RdBu',
                       norm=colors.Normalize(0., 1.), zorder=0)
        plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='white')
        plt.legend()

    return lda_clf, qda_clf
Ejemplo n.º 11
0
class FaceClassifier():
    """Selects a scikit-learn classifier by the `.value` of a FaceClassifierModels member."""

    def __init__(self, classifier=FaceClassifierModels.DEFAULT):
        self._clf = None
        kinds = FaceClassifierModels
        if classifier.value == kinds.LINEAR_SVM.value:
            self._clf = SVC(C=1.0, kernel="linear", probability=True)
        elif classifier.value == kinds.NAIVE_BAYES.value:
            self._clf = GaussianNB()
        elif classifier.value == kinds.RBF_SVM.value:
            # Grid search over a scaled SVC: linear kernels vary C only,
            # RBF kernels vary both C and gamma.
            scaled_svc = make_pipeline(StandardScaler(),
                                       SVC(random_state=1, probability=True))
            grid_values = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
            linear_grid = {'svc__C': grid_values, 'svc__kernel': ['linear']}
            rbf_grid = {
                'svc__C': grid_values,
                'svc__gamma': grid_values,
                'svc__kernel': ['rbf'],
            }
            self._clf = GridSearchCV(estimator=scaled_svc,
                                     param_grid=[linear_grid, rbf_grid],
                                     scoring='accuracy',
                                     cv=5,
                                     n_jobs=-1)
        elif classifier.value == kinds.NEAREST_NEIGHBORS.value:
            self._clf = KNeighborsClassifier(1)
        elif classifier.value == kinds.DECISION_TREE.value:
            self._clf = DecisionTreeClassifier(max_depth=5)
        elif classifier.value == kinds.RANDOM_FOREST.value:
            self._clf = RandomForestClassifier(max_depth=5,
                                               n_estimators=10,
                                               max_features=1)
        elif classifier.value == kinds.NEURAL_NET.value:
            self._clf = MLPClassifier(solver='lbfgs',
                                      alpha=1e-2,
                                      hidden_layer_sizes=(512, 100),
                                      random_state=1)
        elif classifier.value == kinds.ADABOOST.value:
            self._clf = AdaBoostClassifier()
        elif classifier.value == kinds.QDA.value:
            self._clf = QuadraticDiscriminantAnalysis()
        # Reports the estimator itself; None when no member matched.
        print("classifier={}".format(self._clf))

    def fit(self, embeddings, labels):
        """Train the selected classifier on embeddings and their labels."""
        self._clf.fit(embeddings, labels)

    def predict(self, vec):
        """Return class probabilities (predict_proba), not hard labels."""
        return self._clf.predict_proba(vec)

    def score(self, X, y):
        """Delegate to the underlying estimator's score."""
        return self._clf.score(X, y)
Ejemplo n.º 12
0
class QDA(object):
    """Delegating wrapper around QuadraticDiscriminantAnalysis."""

    def __init__(self,
                 priors=None,
                 reg_param=0.,
                 store_covariance=False,
                 tol=1.0e-4):
        """
        :param priors: class priors, array, optional, shape=[n_classes]
        :param reg_param: float, optional; regularizes the covariance estimate
        :param store_covariance: boolean; if True, the covariance matrices are
            computed and stored in self.covariance_
        :param tol: threshold used for rank estimation
        """
        self.model = QuadraticDiscriminantAnalysis(
            priors=priors,
            reg_param=reg_param,
            store_covariance=store_covariance,
            tol=tol)

    def fit(self, x, y):
        """Fit the wrapped model; returns None."""
        self.model.fit(X=x, y=y)

    def get_params(self, deep=True):
        """Return the wrapped model's parameters."""
        return self.model.get_params(deep=deep)

    def predict(self, x):
        """Return predicted class labels for x."""
        return self.model.predict(X=x)

    def predict_log_dict(self, x):
        """Return log-probabilities (delegates to predict_log_proba)."""
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        """Return class probabilities for x."""
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight=None):
        """Return the wrapped model's score on (x, y)."""
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def set_params(self, **params):
        """Forward parameters to the wrapped model; returns None."""
        self.model.set_params(**params)

    def decision_function(self, x):
        """Apply the decision function to an array of samples."""
        return self.model.decision_function(X=x)

    def get_attribute(self):
        """Return (covariance, means, priors, rotations, scalings, classes).

        All values are read from the fitted model, so this raises
        AttributeError if an attribute is missing, e.g. before fit().
        """
        # Per-class covariance matrices, list of (n_features, n_features) arrays.
        covariance = self.model.covariance_
        # Class means, (n_classes, n_features). The fitted attribute carries a
        # trailing underscore; the previous `self.model.means` raised
        # AttributeError.
        means = self.model.means_
        # Class proportions, summing to 1, shape (n_classes,).
        priors = self.model.priors_
        # Rotation of the Gaussian distribution of each class;
        # n_k = min(n_features, number of elements in class k).
        rotations = self.model.rotations_
        # For each class k an array of shape [n_k] with the scaling of the
        # Gaussian distribution, i.e. the variance in the rotated coordinates.
        scalings = self.model.scalings_
        # Distinct class labels, shape (n_classes,).
        classes = self.model.classes_

        return covariance, means, priors, rotations, scalings, classes
Ejemplo n.º 13
0
class QDA(Model):
    """Model wrapper around QDA with table input and vector output."""

    # TODO investigate NaN in results
    def __init__(self):
        super().__init__(input_type=NumericalDataTypesEnum.table,
                         output_type=NumericalDataTypesEnum.vector)
        self.__model = QuadraticDiscriminantAnalysis()

    def predict(self, data: InputData):
        """Return the second predict_proba column for data.features."""
        return self.__model.predict_proba(data.features)[:, 1]

    def fit(self, data: InputData):
        """Fit on the training part returned by train_test_data_setup."""
        training_part, _unused = train_test_data_setup(data=data)
        self.__model.fit(training_part.features, training_part.target)

    def tune(self, data):
        """Placeholder: no tuning is performed; always returns 1."""
        return 1
class QuadraticDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Predictor backed by Quadratic Discriminant Analysis.
    '''

    def __init__(self):
        self.clf = QuadraticDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        '''Fit the underlying classifier.'''
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        '''Return class probabilities passed through bundle_predictions.'''
        return self.bundle_predictions(self.clf.predict_proba(X_test))

    def get_k_best_k(self):
        '''Return the fixed value 4.'''
        return 4
Ejemplo n.º 15
0
class QuadraticDiscriminantAnalysisImpl:
    """Wrapper that builds the wrapped Op model eagerly from keyword hyperparameters."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; y is forwarded only when it is not None."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's predict."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's predict_proba."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's decision_function."""
        return self._wrapped_model.decision_function(X)
class QuadraticDiscriminant(AbstractModel):
    """AbstractModel implementation backed by a default QDA estimator."""

    def __init__(self, optimised):
        self.create_model(optimised)

    def create_model(self, optimised):
        """Create the estimator; `optimised` is accepted but not used."""
        self.model = QuadraticDiscriminantAnalysis()

    def fit_model(self, x_train, y_train):
        """Fit the estimator on the training data."""
        self.model.fit(x_train, y_train)

    def predict(self, x_test):
        """Return predicted labels for x_test."""
        return self.model.predict(x_test)

    def get_model(self):
        """Return the underlying estimator."""
        return self.model

    def predict_proba(self, x_test):
        """Return predicted class probabilities for x_test."""
        return self.model.predict_proba(x_test)

    def print(self):
        """Intentionally a no-op."""
        pass
Ejemplo n.º 17
0
class QDAClf(ClassifierHolderBase):
    """Classifier holder that uses a non-retrainable QDA estimator."""

    def __init__(self, clf_params):
        super().__init__(clf_params)
        self.clf_obj = QuadraticDiscriminantAnalysis()
        self.clf_retrainable = False

    def normalize_data(self, vals_to_normalize):
        """Append op_subtr_input of every consecutive pair to normalized_data."""
        pair_count = len(vals_to_normalize) - 1
        for pos in range(pair_count):
            delta = self.op_subtr_input(vals_to_normalize[pos],
                                        vals_to_normalize[pos + 1])
            self.normalized_data.append(delta)

    def train_clf(self):
        """Fit on the accumulated training data and record the seen classes."""
        self.can_be_fit = False
        self.clf_obj.fit(self.train_X, self.train_y)
        self.train_classes = self.clf_obj.classes_

    def predict_feature_probability(self):
        """Return class probabilities for self.test_X."""
        return self.clf_obj.predict_proba(self.test_X)

    def enough_min_train_data(self):
        """True once accumulated_samples has reached num_train_samples."""
        return self.accumulated_samples >= self.num_train_samples

    def add_train_y_value(self, val_list):
        """Count one sample and append op_bin_input of each consecutive pair."""
        self.accumulated_samples += 1
        pair_count = len(val_list) - 1
        for pos in range(pair_count):
            label = self.op_bin_input(val_list[pos], val_list[pos + 1])
            self.train_y.append(label)

    def add_new_train_item(self, vals_to_normalize_x, vals_to_normalize_y):
        """Normalize the x values, then store the new x and y training entries."""
        self.normalize_data(vals_to_normalize_x)
        self.add_train_x_values()
        self.add_train_y_value(vals_to_normalize_y)

    def get_actual_next_value(self):
        """Return the most recently appended training label."""
        return self.train_y[-1]
Ejemplo n.º 18
0
def runModel(X_train, X_test, Y_train, Y_test):
    """Fit one QDA model per class slot and collect class-1 probabilities.

    X_train, X_test, Y_train are indexed by class slot (0..3); slot i is
    fitted on (X_train[i], Y_train[i]) and scored on X_test[i]. Y_test is
    used only for the number of test rows, taken from Y_test[0].shape[0].

    Returns (Y_predict, models): Y_predict has one column per slot holding
    that model's second predict_proba column; models is the list of fitted
    estimators in slot order.
    """

    numberClasses = 4

    # Builtin float instead of the np.float alias, which newer NumPy releases
    # no longer provide; the resulting dtype is the same.
    Y_predict = np.zeros((Y_test[0].shape[0], numberClasses), dtype=float)
    models = []

    for i in range(numberClasses):
        print('Model: ', i + 1)
        model = QuadraticDiscriminantAnalysis()
        model.fit(X_train[i], Y_train[i])
        Y_predict[:, i] = model.predict_proba(X_test[i])[:, 1]
        models.append(model)

    return Y_predict, models
def test_qda():
    # QDA must implement fit/predict and reproduce the labels of the toy data.
    estimator = QuadraticDiscriminantAnalysis()
    fitted_labels = estimator.fit(X6, y6).predict(X6)
    assert_array_equal(fitted_labels, y6)

    # The X7 inputs must work as well (the original test calls this "1D data").
    labels_x7 = estimator.fit(X7, y6).predict(X7)
    assert_array_equal(labels_x7, y6)

    # Probability estimates: thresholding column 1 at 0.5 (shifted by +1)
    # must reproduce y6, and log-probabilities must agree with probabilities.
    proba_x7 = estimator.predict_proba(X7)
    assert_array_equal((proba_x7[:, 1] > 0.5) + 1, y6)
    log_proba_x7 = estimator.predict_log_proba(X7)
    assert_array_almost_equal(np.exp(log_proba_x7), proba_x7, 8)

    # With the y7 labelling QDA is not expected to separate the classes.
    labels_y7 = estimator.fit(X6, y7).predict(X6)
    assert np.any(labels_y7 != y7)

    # Every class needs at least 2 samples, so fitting y4 must fail.
    assert_raises(ValueError, estimator.fit, X6, y4)
Ejemplo n.º 20
0
def main():
    """Two-stage QDA pipeline that writes 'submission.csv'.

    Step 1 trains a QDA model per 'wheezy-copper-turtle-magic' value with
    11-fold stratified CV and stores the averaged test predictions in
    test['target'].

    Step 2 adds confidently predicted test rows (target <= 0.01 or >= 0.99)
    as pseudo-labelled training data, derives three Gaussian-mixture cluster
    labels per target class, trains QDA on the resulting six cluster labels,
    and takes the summed probability of clusters 3-5 as the probability of
    target 1.

    Reads '../input/train.csv', '../input/test.csv' and
    '../input/sample_submission.csv'; prints the CV AUC of both steps.
    """
    # ============================================
    # === Loading data
    # ============================================
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')

    # Feature columns: everything except id, target and the magic column.
    cols = [c for c in train.columns if c not in ['id', 'target']]
    cols.remove('wheezy-copper-turtle-magic')

    # ============================================
    # === Step 1 - Build first QDA model and predict test
    # ============================================
    # out-of-fold predictions for train, averaged predictions for test
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))

    # build 512 separate models
    for i in range(512):
        # only train with data where wheezy equals i
        train2 = train[train['wheezy-copper-turtle-magic'] == i]
        test2 = test[test['wheezy-copper-turtle-magic'] == i]
        # keep the original row labels before renumbering the subset
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)

        # feature selection (use approx 40 of 255 features)
        sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])

        # stratified k-fold
        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for train_index, test_index in skf.split(train3, train2['target']):
            # model and predict with QDA
            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:,
                                                                             1]
            # each fold contributes 1/n_splits of the test prediction
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    # print cv auc
    auc = roc_auc_score(train['target'], oof)
    print('QDA scores CV =', round(auc, 5))

    # ============================================
    # === Step2 - Add pseudo label data and build second model
    # ============================================
    # number of initialisations used by every GaussianMixture below
    n_init = 10
    # step-1 predictions become the source of the pseudo labels
    test['target'] = preds

    # reset the prediction buffers for the second model
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))

    # build 512 separate models
    for k in range(512):
        # only train with data where wheezy equals k
        train2 = train[train['wheezy-copper-turtle-magic'] == k]
        train2p = train2.copy()
        idx1 = train2.index
        test2 = test[test['wheezy-copper-turtle-magic'] == k]

        # add pseudo labeled data: keep confident test rows, harden to 0/1
        test2p = test2[(test2['target'] <= 0.01) |
                       (test2['target'] >= 0.99)].copy()
        test2p.loc[test2p['target'] >= 0.5, 'target'] = 1
        test2p.loc[test2p['target'] < 0.5, 'target'] = 0
        train2p = pd.concat([train2p, test2p], axis=0)
        train2p.reset_index(drop=True, inplace=True)

        # feature selection (use approx 40 of 255 features)
        sel = VarianceThreshold(threshold=1.5).fit(train2p[cols])
        train3p = sel.transform(train2p[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])

        # get cluster labels: row positions of each target class
        target_0 = np.argwhere(train2p["target"].values == 0).reshape(-1)
        target_1 = np.argwhere(train2p["target"].values == 1).reshape(-1)
        n_cols = train3.shape[1]
        # initialise to -1 as a safeguard in case a label is never assigned
        cluster_labels = np.zeros_like(train2p["target"].values) - 1
        # two mixture-membership probabilities per selected feature
        proba_x_0 = np.zeros((len(target_0), n_cols * 2))
        proba_x_1 = np.zeros((len(target_1), n_cols * 2))

        # calculate a 2-component GMM per column, separately for each class
        for j in range(n_cols):
            # target = 0
            kms_0 = GaussianMixture(n_components=2,
                                    max_iter=10000,
                                    n_init=n_init,
                                    means_init=[[-1], [1]],
                                    init_params="kmeans")
            kms_0.fit(train3p[target_0, j:j + 1])
            pred_0 = kms_0.predict_proba(train3p[target_0, j:j + 1])
            proba_x_0[:, j * 2:(j + 1) * 2] = pred_0

            # target = 1
            kms_1 = GaussianMixture(n_components=2,
                                    max_iter=10000,
                                    n_init=n_init,
                                    means_init=[[-1], [1]],
                                    init_params="kmeans")
            kms_1.fit(train3p[target_1, j:j + 1])
            pred_1 = kms_1.predict_proba(train3p[target_1, j:j + 1])
            proba_x_1[:, j * 2:(j + 1) * 2] = pred_1

        # re-calculate GMM: 3 clusters per class on the membership features
        kms_0 = GaussianMixture(
            n_components=3,
            max_iter=10000,
            n_init=n_init,
            init_params="kmeans",
        )
        kms_0.fit(proba_x_0)

        kms_1 = GaussianMixture(
            n_components=3,
            max_iter=10000,
            n_init=n_init,
            init_params="kmeans",
        )
        kms_1.fit(proba_x_1)

        # predict cluster labels: 0-2 for target 0, 3-5 for target 1
        cluster_labels_0 = kms_0.predict(proba_x_0)
        cluster_labels_1 = kms_1.predict(proba_x_1) + 3
        cluster_labels[target_0] = cluster_labels_0
        cluster_labels[target_1] = cluster_labels_1

        # stratified k-fold on the cluster labels
        skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
        for train_index, test_index in skf.split(train3p, cluster_labels):
            test_index3 = test_index[test_index <
                                     len(train3)]  # ignore pseudo in oof

            # model and predict with QDA
            clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
            clf.fit(train3p[train_index, :], cluster_labels[train_index])

            # probability of target 1 = sum over probability columns 3, 4, 5
            # NOTE(review): indexing columns 3-5 presumably relies on all six
            # cluster labels being present in every training fold -- confirm
            val_prediction_6 = clf.predict_proba(train3p[test_index3, :])
            val_prediction = val_prediction_6[:,
                                              3] + val_prediction_6[:,
                                                                    4] + val_prediction_6[:,
                                                                                          5]
            oof[idx1[test_index3]] = val_prediction
            test_prediction_6 = clf.predict_proba(test3)
            test_prediction = test_prediction_6[:,
                                                3] + test_prediction_6[:,
                                                                       4] + test_prediction_6[:,
                                                                                              5]
            preds[test2.index] += test_prediction / skf.n_splits

    # print cv auc
    auc = roc_auc_score(train['target'], oof)
    print('Pseudo Labeled QDA scores CV =', round(auc, 5))

    # ============================================
    # === Make Submission
    # ============================================
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['target'] = preds
    sub.to_csv('submission.csv', index=False)
Ejemplo n.º 21
0
# Flatten the third target array and turn the third feature frame into a
# plain array, dropping its first two columns after reset_index().
Ty3 = np.ravel(Ty3)
TX3 = np.delete(TX3.reset_index().values, [0, 1], axis=1)

# Predicted class probabilities of the three LOG models
log1Prob, log2Prob, log3Prob = (
    log1.predict_proba(TX1),
    log2.predict_proba(TX2),
    log3.predict_proba(TX3),
)

# Predicted class probabilities of the three LDA models
lda1Prob, lda2Prob, lda3Prob = (
    lda1.predict_proba(TX1),
    lda2.predict_proba(TX2),
    lda3.predict_proba(TX3),
)

# Predicted class probabilities of the three QDA models
qda1Prob, qda2Prob, qda3Prob = (
    qda1.predict_proba(TX1),
    qda2.predict_proba(TX2),
    qda3.predict_proba(TX3),
)

#build a function getROCdata, that returns a dataframe with the 11 columns listed
#truthvals is a column vector containing the correct classification
#probs is a column vector of probability that the model believes the datapoint to be of class 1
#thresholds is a vector of probability thresholds to use when deciding to predict what class it is


def getROCdata(truthVals, probs, thresholds):
    row_array = np.zeros([len(thresholds), 11])

    for j in range(len(thresholds)):
        add_array = np.zeros([1, 11])
        Predict = np.zeros(len(probs))
What is the training misclassification rate?
"""

# Linear discriminant analysis fitted on the full data set; the model is then
# evaluated on the very rows it was trained on (training error).
lda1 = LDA(solver="svd", store_covariance=True)
lda1.fit(warX,warY)

my_lda_pred = pd.DataFrame()
my_lda_pred["pred"] = ["No" if x == 0 else "Yes" for x in lda1.predict(warX)]
my_lda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]]
# Confusion table: rows are predicted labels, columns are actual labels.
conf_lda = pd.crosstab(my_lda_pred["pred"], my_lda_pred["actual"])
conf_lda

# Training misclassification rate: off-diagonal counts over the number of rows.
(1/(war.shape[0])) * (conf_lda.iloc[1,0] + conf_lda.iloc[0,1])


"""
6.69%
"""

# Same procedure with quadratic discriminant analysis.  The constructor keyword
# is `store_covariance` (singular), matching the LDA call above; the plural
# spelling `store_covariances` is rejected by current scikit-learn releases.
qda1 = QDA(store_covariance=True)
qda1.fit(warX,warY)

# Per-class probabilities on the training rows (kept for inspection).
test = qda1.predict_proba(warX)

my_qda_pred = pd.DataFrame()
# Map predicted class labels exactly like the actual labels (0 -> "No").
my_qda_pred["pred"] = ["No" if x == 0 else "Yes" for x in qda1.predict(warX)]
my_qda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]]
conf_qda = pd.crosstab(my_qda_pred["pred"], my_qda_pred["actual"])
conf_qda

# Training misclassification rate for QDA.
(1/(war.shape[0])) * (conf_qda.iloc[1,0] + conf_qda.iloc[0,1])
Ejemplo n.º 23
0
    def run(self, verbose):
        """Fit six level-0 classifiers and stack them with a logistic regression.

        Every level-0 model is fitted on ``self.attrs.X_train`` /
        ``self.attrs.Y_train`` and its ``predict_proba`` output is collected
        for the test, train, test-submission and blind-submission feature
        sets.  The per-model probabilities are concatenated column-wise and
        used as the features of the level-1 logistic regression.

        Results are written to ``self.attrs``: ``final_model`` (the fitted
        level-1 model) and ``final_pred``, ``final_pred_testsub``,
        ``final_pred_blindsub`` (its class probabilities).

        verbose: progress messages are printed when this is greater than 0.
            The final AUC line is printed regardless of this setting.
        """
        cfg = self.attrs
        chatty = verbose > 0

        def note(text):
            # Progress output is only produced in verbose mode.
            if chatty:
                print(text)

        def fit_level0(estimator):
            # Fit on the training set, then predict class probabilities on
            # the test, train, test-submission and blind-submission sets.
            estimator.fit(cfg.X_train, cfg.Y_train)
            on_test = estimator.predict_proba(cfg.X_test)
            on_train = estimator.predict_proba(cfg.X_train)
            on_testsub = estimator.predict_proba(cfg.X_testsub)
            on_blindsub = estimator.predict_proba(cfg.X_blindsub)
            return on_train, on_test, on_testsub, on_blindsub

        # One (train, test, testsub, blindsub) tuple per level-0 model.
        level0 = []
        note("Running ensemble")

        # Model 1: random forest, optionally wrapped in recursive feature
        # elimination.
        note("Running Random Forest Classifier")
        forest = RandomForestClassifier(
            n_estimators=cfg.rf_n_estimators,
            max_depth=cfg.rf_max_depth,
            min_samples_split=cfg.rf_min_samples_split,
            min_samples_leaf=cfg.rf_min_samples_leaf,
            random_state=1,
            n_jobs=cfg.rf_n_jobs)
        if cfg.rf_use_rfe:
            note(" using RFE")
            forest = RFE(forest, n_features_to_select=150, step=20)
        level0.append(fit_level0(forest))

        # Model 2: support vector machine.
        note("Running SVM Classifier")
        level0.append(fit_level0(SVC(C=cfg.svc_C,
                                     gamma=cfg.svc_gamma,
                                     kernel=cfg.svc_kernel,
                                     probability=cfg.svc_probability,
                                     random_state=2)))

        # Model 3: quadratic discriminant analysis.
        note("Quadratic Discriminant Analysis Classifier")
        level0.append(fit_level0(QuadraticDiscriminantAnalysis()))

        # Model 4: Gaussian naive Bayes.
        note("GaussianNB Classifier")
        level0.append(fit_level0(GaussianNB()))

        # Model 5: k-nearest neighbours.
        note("KNeighbors Classifier")
        level0.append(fit_level0(
            KNeighborsClassifier(n_neighbors=cfg.kn_n_neighbors,
                                 weights=cfg.kn_weights)))

        # Model 6: logistic regression.
        note("Logistic Regression Classifier")
        level0.append(fit_level0(
            LogisticRegression(C=cfg.lr_C, random_state=6)))

        # Level 1: the stacked model is trained on the concatenated level-0
        # training-set probabilities (model 1 first, model 6 last).
        note("Stacked Classifier")
        train_parts, test_parts, testsub_parts, blindsub_parts = zip(*level0)

        stacker = LogisticRegression(random_state=49)
        stacker.fit(np.hstack(train_parts), cfg.Y_train)

        # Keep the final model in case we want to work with it later.
        cfg.final_model = stacker

        # Final predictions on the three evaluation sets.
        cfg.final_pred = stacker.predict_proba(np.hstack(test_parts))
        cfg.final_pred_testsub = stacker.predict_proba(
            np.hstack(testsub_parts))
        cfg.final_pred_blindsub = stacker.predict_proba(
            np.hstack(blindsub_parts))

        # AUC of the stacked model on the test set, using the probability
        # column at index 1.
        note("Calculating AUC")
        fpr, tpr, thresholds = roc_curve(cfg.Y_test, cfg.final_pred[:, 1])
        print("AUC with Stacking: ", auc(fpr, tpr))
Ejemplo n.º 24
0
    plt.title('LightGBM Feature Importance Based on Split')
    plt.tight_layout()
    plt.savefig('../output/lgbm_importances_split.jpg')
    plt.show()

    #
    X_train_df, X_valid_df, X_test_df = normalize_data(X_train_df, X_valid_df,
                                                       X_test_df, features)

    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    qda_model = qda.fit(X_train_df[features], y_train_df.values)
    qda_valid_predict = qda.predict(X_valid_df[features])
    qda_train_predict = qda.predict(X_train_df[features])
    qda_test_predict = qda.predict(X_test_df[features])

    qda_train_prob = qda.predict_proba(X_train_df[features])
    qda_valid_prob = qda.predict_proba(X_valid_df[features])
    qda_test_prob = qda.predict_proba(X_test_df[features])

    qda_train_acc = accuracy_score(y_train_df.values, qda_train_predict)
    qda_valid_acc = accuracy_score(y_valid_df.values, qda_valid_predict)
    qda_test_acc = accuracy_score(y_test_df.values, qda_test_predict)

    print('Train Accuracy is {}'.format(qda_train_acc))
    print('Valid Accuracy is {}'.format(qda_valid_acc))
    print('Test Accuracy is {}'.format(qda_test_acc))

    qda_profit, qda_cum_profit = utils.backtest(qda_test_predict,
                                                qda_test_prob[:, 1], 'qda')

    lda = LinearDiscriminantAnalysis(store_covariance=True)
Ejemplo n.º 25
0
def _probaToRGBA(yPred, cClasses, dimVal):
    # Convert class probabilities (one row per mesh point) to RGBA colours:
    # winner takes all, colour scaled by the winning probability, alpha set to
    # the winning probability relative to the largest probability on the mesh.
    rowMax = yPred.max(axis=1)
    cWinner = (yPred == rowMax[:, np.newaxis]).astype('float')
    rgba = np.dot(cWinner*yPred, cClasses)
    rgba[:, 3] = (rowMax/yPred.max())*dimVal
    return rgba


def _plotProbaPanel(position, rgba, meshShape, Xrr, cValGood, nClasses, title, removeTickLabels):
    # Draw one subplot: the probability surface as an image with the data
    # points scattered on top of it.
    plt.subplot(position)
    Zplot = rgba.reshape(meshShape[0], meshShape[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        # Only one discriminant function: spread the points randomly along y.
        plt.scatter(Xrr, (np.random.rand(Xrr.size)-0.5)*12.0, c=cValGood, s=40, zorder=1)
    plt.title(title)
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))

    if removeTickLabels:
        ax = plt.gca()
        ax.set_xticklabels(['']*len(ax.get_xticklabels()))
        ax.set_yticklabels(['']*len(ax.get_yticklabels()))


def discriminatePlot(X, y, cVal, titleStr='', figdir='.', Xcolname = None, plotFig = False, removeTickLabels = False, testInd = None):
    """Frederic's robust wrapper for discriminant analysis.

    Performs LDA, QDA and random forest classification after error checking,
    optionally generates plots, and returns cross-validated performance.

    X: np array, n rows x p parameters.
    y: group labels, n rows.
    cVal: rgb color code for each data point - should be the same for each
        data point belonging to the same group.
    titleStr: title for plots (also used as the figure file name).
    figdir: directory name (folder name) for figures.
    Xcolname: np.array or list of strings with column names for printout display.
    plotFig: when True, plot the LDA/QDA/RF probability surfaces and save the
        figure as '<figdir>/<titleStr>.png'.
    removeTickLabels: when True, blank the tick labels of every panel.
    testInd: optional boolean np.array over the rows of X selecting a fixed
        test set; when None, stratified cross-validation is used instead.

    Returns (ldaYes, qdaYes, rfYes, cvCount, ldaP, qdaP, rfP, nClasses, weights):
        ldaYes, qdaYes, rfYes: number of correctly classified held-out samples.
        cvCount: number of held-out samples that were scored.
        ldaP, qdaP, rfP: binomial probability of obtaining at least that many
            correct classifications at chance level 1/nClasses.
        nClasses: number of classes with at least MINCOUNT samples.
        weights: LDA scalings projected through the PCA components, one row
            per discriminant function.
    When fewer than two classes have enough data, nine -1 values are returned.
    """

    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    if testInd is not None:
        # Transform testInd into an index inside XGood and yGood
        testIndx = testInd.nonzero()[0]
        goodIndx = goodInd.nonzero()[0]
        testInd = np.hstack([ np.where(goodIndx == testval)[0] for testval in testIndx])
        # Every remaining position inside the good data is used for training.
        trainInd = np.setdiff1d(np.arange(len(goodIndx)), testInd)

    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]

    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size         # Number of classes or groups

    # Do we have enough data?
    if (nClasses < 2):
        print ('Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT))
        return -1, -1, -1, -1 , -1, -1, -1, -1, -1

    if testInd is None:
        cvFolds = min(min(classesCount), CVFOLDS)
        if (cvFolds < CVFOLDS):
            print ('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS))
    else:
        cvFolds = 1

    # Data size and color values
    nD = XGood.shape[1]                 # number of features in X
    nX = XGood.shape[0]                 # number of data points in X
    cClasses = []   # Color code for each class (rgb of its first member + alpha 1.0)
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)

    # Use a uniform prior
    myPrior = np.ones(nClasses)*(1.0/nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    nDmax = int(np.fix(np.sqrt(nX//5)))
    if nDmax < nD:
        print ('Warning: Insufficient data for', nD, 'parameters. PCA projection to', nDmax, 'dimensions.' )
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print ('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()   # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaYes = 0
    qdaYes = 0
    rfYes = 0
    cvCount = 0

    if testInd is None:
        # NOTE(review): this is the legacy scikit-learn cross_validation
        # interface (labels, n_folds), iterated directly as (train, test) index
        # pairs - confirm it matches the installed scikit-learn version.
        skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    else:
        skf = [(trainInd,testInd)]

    for train, test in skf:

        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specify the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue

        # Only test samples whose class was seen during training are scored.
        goodIndTest = np.array([b in trainClasses for b in yGood[test]])
        nGoodTest = np.count_nonzero(goodIndTest)
        if nGoodTest == 0:
            continue
        XrTest = Xr[test[goodIndTest]]
        yTest = yGood[test[goodIndTest]]

        # Fit the data
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        # score() is the fraction correct among the nGoodTest scored samples,
        # so it is converted back to a count with that same number.
        ldaYes += np.around((ldaMod.score(XrTest, yTest))*nGoodTest)
        qdaYes += np.around((qdaMod.score(XrTest, yTest))*nGoodTest)
        rfYes += np.around((rfMod.score(XrTest, yTest))*nGoodTest)
        cvCount += nGoodTest

    # Refit with all the data for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)
    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print ('Error in ldaPlot: labels do not match')

    # Print the five largest coefficients of first 3 DFA
    MAXCOMP = 3        # Maximum number of DFA components
    MAXWEIGHT = 5      # Maximum number of weights printed for each component

    ncomp = min(MAXCOMP, nClasses-1)
    nweight = min(MAXWEIGHT, nD)

    # The scalings_ has the eigenvectors of the LDA in columns and the pca.components_ has the eigenvectors of PCA in columns
    weights = np.dot(ldaMod.scalings_[:,0:ncomp].T, pca.components_)

    print('LDA Weights:')
    for ic in range(ncomp):
        idmax = np.argsort(np.abs(weights[ic,:]))[::-1]
        print('DFA %d: '%ic, end = '')
        for iw in range(nweight):
            if Xcolname is None:
                colstr = 'C%d' % idmax[iw]
            else:
                colstr = Xcolname[idmax[iw]]
            print('%s %.3f; ' % (colstr, float(weights[ic, idmax[iw]]) ), end='')
        print()

    if plotFig:
        dimVal = 0.8    # Overall dimming of background so that points can be seen
        # Obtain fits in this rotated space for display purposes
        ldaMod.fit(Xrr, yGood)
        qdaMod.fit(Xrr, yGood)
        rfMod.fit(Xrr, yGood)

        XrrMean = Xrr.mean(0)

        # Make a mesh for plotting: the first two discriminant functions span
        # the mesh, any further ones are held at their mean value.
        x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
        xm1 = np.reshape(x1, -1)
        xm2 = np.reshape(x2, -1)
        nxm = np.size(xm1)
        Xm = np.zeros((nxm, Xrr.shape[1]))
        Xm[:,0] = xm1
        if Xrr.shape[1] > 1 :
            Xm[:,1] = xm2
        Xm[:,2:] = XrrMean[2:]

        # Predict values on mesh for plotting based on the first two DFs
        yPredLDA = ldaMod.predict_proba(Xm)
        yPredQDA = qdaMod.predict_proba(Xm)
        yPredRF = rfMod.predict_proba(Xm)

        # Plot the three surfaces of probability side by side
        plt.figure(facecolor='white', figsize=(10,4))
        meshShape = np.shape(x1)
        _plotProbaPanel(131, _probaToRGBA(yPredLDA, cClasses, dimVal), meshShape, Xrr, cValGood, nClasses,
                        '%s: LDA %d/%d' % (titleStr, ldaYes, cvCount), removeTickLabels)
        _plotProbaPanel(132, _probaToRGBA(yPredQDA, cClasses, dimVal), meshShape, Xrr, cValGood, nClasses,
                        '%s: QDA %d/%d' % (titleStr, qdaYes, cvCount), removeTickLabels)
        _plotProbaPanel(133, _probaToRGBA(yPredRF, cClasses, dimVal), meshShape, Xrr, cValGood, nClasses,
                        '%s: RF %d/%d' % (titleStr, rfYes, cvCount), removeTickLabels)

        # Save before showing: once the window opened by show() is closed the
        # figure may be gone and savefig would write an empty image.
        plt.savefig('%s/%s.png' % (figdir,titleStr), format='png', dpi=1000)
        plt.show()

    # Results
    ldaYes = int(ldaYes)
    qdaYes = int(qdaYes)
    rfYes = int(rfYes)

    # Probability of at least nYes successes out of cvCount at chance level.
    p = 1.0/nClasses
    ldaP = sum(binom.pmf(k, cvCount, p) for k in range(ldaYes, cvCount+1))
    qdaP = sum(binom.pmf(k, cvCount, p) for k in range(qdaYes, cvCount+1))
    rfP = sum(binom.pmf(k, cvCount, p) for k in range(rfYes, cvCount+1))

    # Every fold may have been skipped; avoid dividing by zero in that case.
    if cvCount > 0:
        ldaPct = 100.0*ldaYes/cvCount
        qdaPct = 100.0*qdaYes/cvCount
        rfPct = 100.0*rfYes/cvCount
    else:
        ldaPct = qdaPct = rfPct = float('nan')

    print ("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0/nClasses))
    print ("%s LDA: %.2f %% (%d/%d p=%.4f)" % (titleStr, ldaPct, ldaYes, cvCount, ldaP))
    print ("%s QDA: %.2f %% (%d/%d p=%.4f)" % (titleStr, qdaPct, qdaYes, cvCount, qdaP))
    print ("%s RF: %.2f %% (%d/%d p=%.4f)" % (titleStr, rfPct, rfYes, cvCount, rfP))
    return ldaYes, qdaYes, rfYes, cvCount, ldaP, qdaP, rfP, nClasses, weights
Ejemplo n.º 26
0
# Fit a QDA model on the training split and print a short report.
short_rule = "=" * 10
long_rule = "=" * 25

QDAModel = QDA()
QDAModel.fit(X_train, y_train)

# ---- Scores on both splits ----
train_score = QDAModel.score(X_train, y_train)
print('QDAModel Train Score is : ', train_score)
test_score = QDAModel.score(X_test, y_test)
print('QDAModel Test Score is : ', test_score)
print(short_rule)

# ---- Fitted attributes ----
print('QDAModel means are : ', QDAModel.means_)
print('QDAModel classes are : ', QDAModel.classes_)
print(long_rule)

# ---- Predictions on the test split (first five rows are shown) ----
y_pred = QDAModel.predict(X_test)
y_pred_prob = QDAModel.predict_proba(X_test)
print('Prediction Probabilities Value for QDAModel is : \n', y_pred_prob[:5])
print('Pred Value for QDAModel is : ', y_pred[:5])
print('True Value for QDAModel is : ', y_test[:5])
print(long_rule)

# ---- Classification report ----
ClassificationReport = classification_report(y_test, y_pred)
print(ClassificationReport)
print(short_rule)

# ---- Confusion matrix ----
CM = confusion_matrix(y_test, y_pred)
print(CM)
print(short_rule)
# ---------------
# plt.figure()
# sns.heatmap(CM, center = True, annot=True, fmt="d")
Ejemplo n.º 27
0
def validation(cancer,
               local,
               signature_file,
               datafile='_data_mRNA.txt',
               trainfile='_train_idx.txt',
               testfile='_test_idx.txt',
               dtype='mRNA'):
    '''
    Score every signature with QDA and write a tidy per-subject table.

    For each row of `signature_file` the test indices of its fold are split
    into two halves.  A QDA classifier is fitted on the training indices
    (giving "test" probabilities for both halves) and then on training plus
    one half (giving "validation" probabilities for the other half).  One
    record per subject and group is collected and the table is written as
    CSV to `<local>/<signature file base name>.tidy`.

    cancer: name of the sub-directory of `local` and prefix of the data,
        train-index and test-index file names.
    local: base directory; also receives the output file.
    signature_file: CSV file with one signature per row.
    datafile, trainfile, testfile: suffixes appended to `cancer` to build the
        input file names.
    dtype: not used by this function.

    Returns None.
    '''

    def load_fold_indices(path):
        # Each line is "<key>\t<comma separated integers>"; the index lists
        # are grouped by key in the order the lines appear in the file.
        grouped = {}
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split('\t')
                key = int(fields[0])
                grouped.setdefault(key, []).append(
                    [int(token) for token in fields[1].split(",")])
        return grouped

    print('Validation step for cancer {}'.format(cancer), end='')

    # read signature database
    signatures = pd.read_csv(signature_file, sep=",")

    # read data; the labels are taken from the first level of the column header
    cancer_dir = os.path.join(local, cancer)
    data = pd.read_table(os.path.join(cancer_dir, cancer + datafile),
                         sep="\t",
                         index_col=0,
                         header=[0, 1])
    first_level = data.columns.get_level_values(0)
    lbl = np.asarray(first_level.astype(float).astype(int))

    train_idx = load_fold_indices(os.path.join(cancer_dir, cancer + trainfile))
    test_idx = load_fold_indices(os.path.join(cancer_dir, cancer + testfile))

    cls = QDA()
    records = []

    # loop over each selected signature
    for sig_id, signature in signatures.iterrows():
        nodes = np.asarray(signature.nodes.split(';'), dtype=int)
        signature.train = int(signature.train)
        outer_key = int(signature.fold)
        inner_pos = int(signature.train)
        idx_train = train_idx[outer_key][inner_pos]
        idx_test = test_idx[outer_key][inner_pos]

        # first half of the test indices and second half; each half serves
        # once as extra training data and once as validation set
        half = len(idx_test) // 2
        idx_half1 = idx_test[:half]
        idx_half2 = idx_test[half:]

        Train = data.iloc[nodes, idx_train].T
        Half1 = data.iloc[nodes, idx_half1].T
        Half2 = data.iloc[nodes, idx_half2].T

        lbl_train = lbl[idx_train]
        lbl_half1 = lbl[idx_half1]
        lbl_half2 = lbl[idx_half2]

        # train over training only and predict both halves ("test")
        cls.fit(Train, lbl_train)
        half1_as_test = cls.predict_proba(Half1)
        half2_as_test = cls.predict_proba(Half2)

        # train over training + first half, predict the second ("validation")
        cls.fit(pd.concat([Train, Half1], ignore_index=True),
                np.concatenate([lbl_train, lbl_half1]))
        half2_as_val = cls.predict_proba(Half2)

        # train over training + second half, predict the first ("validation")
        cls.fit(pd.concat([Train, Half2], ignore_index=True),
                np.concatenate([lbl_train, lbl_half2]))
        half1_as_val = cls.predict_proba(Half1)

        # (group name, probabilities, labels, subject indices) in output order
        groups = (
            ('validation', half1_as_val, lbl_half1, idx_half1),
            ('test', half2_as_test, lbl_half2, idx_half2),
            ('validation', half2_as_val, lbl_half2, idx_half2),
            ('test', half1_as_test, lbl_half1, idx_half1),
        )

        for group_name, scores, labels, idxs in groups:
            for subj_score, subj_label, subj_idx in zip(scores, labels, idxs):
                # two probability columns are expected (classes 0 and 1)
                prob_cl0, prob_cl1 = subj_score
                records.append({
                    'fold': signature.train,
                    'repetition': signature.fold,
                    'cancer': signature.cancer,
                    'dtype': signature['dtype'],
                    'signature_nodes': signature.nodes,
                    'signatureID': sig_id,
                    'betweenness_centrality': signature.bc,
                    'performance_chunck_couples': signature.perf_couple,
                    'training_accuracy_score_signature': signature.score,
                    'subjectID': subj_idx,
                    'subject_label': subj_label,
                    'subject_proba_cl0': prob_cl0,
                    'subject_proba_cl1': prob_cl1,
                    'subject_group': group_name,
                })

    # output name: signature file base name up to its first dot, plus ".tidy"
    out_name = signature_file.split(os.sep)[-1].split(".")[0] + ".tidy"
    pd.DataFrame(data=records).to_csv(os.path.join(local, out_name),
                                      sep=",",
                                      index=False)

    print('[done]')
    return
Ejemplo n.º 28
0
# Evaluate the predictions against the test labels and report both metrics.
completeness, contamination = completeness_contamination(predictions, y_test)

print("completeness", completeness)
print("contamination", contamination)

#------------------------------------------------------------
# Compute the decision boundary
# Uses the second entry of `classifiers` and a 71 x 81 grid over the plot limits.
clf = classifiers[1]
xlim = (0.7, 1.35)
ylim = (-0.15, 0.4)

xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71),
                     np.linspace(ylim[0], ylim[1], 81))

# The grid is passed with the y coordinate as first feature and x as second.
# NOTE(review): presumably this matches the feature order the classifier was
# trained with - confirm against the training code.
Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()])
# Keep the probability column at index 1, reshaped back onto the grid.
Z = Z[:, 1].reshape(xx.shape)

#----------------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(5, 2.5))
fig.subplots_adjust(bottom=0.15,
                    top=0.95,
                    hspace=0.0,
                    left=0.1,
                    right=0.95,
                    wspace=0.2)

# left plot: data and decision boundary
ax = fig.add_subplot(121)
im = ax.scatter(X[-N_plot:, 1],
Ejemplo n.º 29
0
# Class probabilities of the already fitted RF model on the test features.
predictions_RF = RF.predict_proba(Xtest)

# AdaBoost
# Default-parameter model, fitted on the training data and scored on the test features.
AD = AdaBoostClassifier()
AD.fit(Xtrain, Ytrain)
predictions_AD = AD.predict_proba(Xtest)

# Naive Bayes
NB = GaussianNB()
NB.fit(Xtrain, Ytrain)
predictions_NB = NB.predict_proba(Xtest)

# QDA
# NOTE: the name QDA is bound to the fitted estimator instance here, not to a class.
QDA = QuadraticDiscriminantAnalysis()
QDA.fit(Xtrain, Ytrain)
predictions_QDA = QDA.predict_proba(Xtest)


# Voting
# Soft-voting ensemble over the estimators listed below; the GP and NB entries
# are commented out and therefore not part of the ensemble.
from sklearn.ensemble import VotingClassifier
eclf1 = VotingClassifier(estimators=[('MLP', MLP),
                                     ('SGD', SGD),
                                     ('NN', NN),
#                                     ('GP', GP),
                                     ('DT', DT),
                                     ('RF', RF),
                                     ('AD', AD),
#                                     ('NB', NB),
                                     ('QDA', QDA)],
                         voting='soft',
                         n_jobs=-1)
Ejemplo n.º 30
0
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot(roc_auc_lda)

# In[27]:

# Last model: quadratic discriminant analysis.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, Y_train.ravel())

# In[28]:

# Probability of the class in column 1 on the test set; a sample is predicted
# as 1 when that probability is strictly above 0.5, otherwise as 0.
Y_prob_qda = qda.predict_proba(X_test)[:, 1]
Y_pred_qda = (Y_prob_qda > 0.5).astype(int)

# In[29]:

# Confusion matrix of the thresholded predictions against the test labels.
qda_confusion_matrix = confusion_matrix(Y_test, Y_pred_qda)
qda_confusion_matrix

# In[30]:

# ROC curve from the raw probabilities and the area under it.
(false_positive_rate_qda,
 true_positive_rate_qda,
 thresholds_qda) = roc_curve(Y_test, Y_prob_qda)
roc_auc_qda = auc(false_positive_rate_qda, true_positive_rate_qda)
roc_auc_qda

# In[31]:
Ejemplo n.º 31
0
def discriminatePlot(X, y, cVal, titleStr=''):
    """Cross-validate LDA, QDA and RF classifiers and plot their decision surfaces.

    Frederic's robust wrapper for discriminant analysis. Classes with fewer
    than MINCOUNT samples are dropped, the data are projected with a PCA so
    that the covariance matrices can be fitted, and the three classifiers are
    scored with stratified cross-validation. The classifiers are then refitted
    on all the data in the LDA-rotated space to draw three probability
    surfaces (LDA, QDA, RF) with the data points on top.

    Relies on module-level names: np, plt, PCA, LDA, QDA, RF and
    cross_validation.

    Parameters:
        X        np array, n rows x p parameters.
        y        group labels, n rows.
        cVal     rgb color code for each data point (n rows x 3) - should be
                 the same for all data belonging to the same group.
        titleStr title prefix for the plots and printed results.

    Returns:
        ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
        Scores are mean cross-validated accuracies in percent; the "SE" values
        are the standard deviation of the fold scores, in percent. Seven -1
        values are returned when fewer than two classes have enough data or
        when no cross-validation fold could be evaluated.
    """

    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]

    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size         # Number of classes or groups

    # Do we have enough data?
    if (nClasses < 2):
        print('Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT))
        return -1, -1, -1, -1 , -1, -1, -1
    cvFolds = min(min(classesCount), CVFOLDS)
    if (cvFolds < CVFOLDS):
        print('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS))

    # Data size and color values
    nD = XGood.shape[1]                 # number of features in X
    nX = XGood.shape[0]                 # number of data points in X
    cClasses = []   # Color code (RGBA) for each class
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]    # first data point of this class
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses)*(1.0/nClasses)

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # Explicit floor division keeps the result independent of the Python version.
    nDmax = int(np.fix(np.sqrt(nX // 5)))
    if nDmax < nD:
        print('Warning: Insufficient data for %s parameters. PCA projection to %s dimensions.' % (nD, nDmax))
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()   # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    iskf = 0

    for train, test in skf:

        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specify the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses = np.unique(yTrain)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        # Only test samples whose class was seen in training can be scored.
        # The mask always has one entry per test sample, so its size says
        # nothing about usability: check that at least one entry is True.
        goodIndTest = np.array([b in trainClasses for b in yGood[test]])
        if not goodIndTest.any():
            continue

        # Fit the data
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)

        XrTest = Xr[test[goodIndTest]]
        yTest = yGood[test[goodIndTest]]
        ldaScores[iskf] = ldaMod.score(XrTest, yTest)
        qdaScores[iskf] = qdaMod.score(XrTest, yTest)
        rfScores[iskf] = rfMod.score(XrTest, yTest)

        iskf += 1

    if (iskf !=  cvFolds):
        # Some folds were skipped: keep only the scores that were computed.
        # (ndarray.reshape cannot shrink an array and its result was discarded.)
        cvFolds = iskf
        ldaScores = ldaScores[:cvFolds]
        qdaScores = qdaScores[:cvFolds]
        rfScores = rfScores[:cvFolds]

    if cvFolds == 0:
        print('Error in ldaPlot: No cross-validation fold had sufficient data')
        return -1, -1, -1, -1 , -1, -1, -1

    # Refit with all the data  for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)
    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print('Error in ldaPlot: labels do not match')

    # Print the coefficients of first 3 DFA
    print('LDA Weights:')
    print('DFA1: %s' % (ldaMod.coef_[0,:],))
    if nClasses > 2:
        print('DFA2: %s' % (ldaMod.coef_[1,:],))
    if nClasses > 3:
        print('DFA3: %s' % (ldaMod.coef_[2,:],))

    # Obtain fits in this rotated space for display purposes
    ldaMod.fit(Xrr, yGood)
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)

    XrrMean = Xrr.mean(0)

    # Make a mesh for plotting: the first two discriminant axes span the
    # mesh, any further axes are held at their mean value.
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:,0] = xm1
    if Xrr.shape[1] > 1 :
        Xm[:,1] = xm2
    for ix in range(2,Xrr.shape[1]):
        Xm[:,ix] = XrrMean[ix]

    # Predict values on mesh for plotting based on the first two DFs
    yPredLDA = ldaMod.predict_proba(Xm)
    yPredQDA = qdaMod.predict_proba(Xm)
    yPredRF = rfMod.predict_proba(Xm)

    # Plot the three surfaces of probability side by side
    plt.figure(facecolor='white', figsize=(10,3))
    _plotProbSurface(131, _probaToRGBA(yPredLDA, cClasses), np.shape(x1), Xrr, cValGood,
                     '%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean()*100.0)))
    _plotProbSurface(132, _probaToRGBA(yPredQDA, cClasses), np.shape(x1), Xrr, cValGood,
                     '%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean()*100.0)))
    _plotProbSurface(133, _probaToRGBA(yPredRF, cClasses), np.shape(x1), Xrr, cValGood,
                     '%s: RF pC %.0f %%' % (titleStr, (rfScores.mean()*100.0)))
    plt.show()

    # Results
    ldaScore = ldaScores.mean()*100.0
    qdaScore = qdaScores.mean()*100.0
    rfScore = rfScores.mean()*100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0
    rfScoreSE = rfScores.std() * 100.0

    print("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0/nClasses))
    print("%s LDA: %.2f (+/- %0.2f) %%" % (titleStr, ldaScore, ldaScoreSE))
    print("%s QDA: %.2f (+/- %0.2f) %%" % (titleStr, qdaScore, qdaScoreSE))
    print("%s RF: %.2f (+/- %0.2f) %%" % (titleStr, rfScore, rfScoreSE))
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses


def _probaToRGBA(yPred, cClasses):
    # Transform class probabilities (one row per mesh point) into RGBA colors.
    # The color is the one of the winning class (winner takes all; tied
    # classes add up, as a per-point dot product would) and alpha is the
    # winning probability relative to the largest probability on the mesh.
    pWinner = yPred.max(axis=1)
    cWinner = (yPred == pWinner[:, np.newaxis]).astype('float')
    rgba = np.dot(cWinner, cClasses)
    rgba[:,3] = pWinner/yPred.max()
    return rgba


def _plotProbSurface(subplotPos, rgba, meshShape, Xrr, cValGood, titleText):
    # Draw one panel: the probability surface as background image and the
    # data points on top, in the space of the first two discriminant axes.
    plt.subplot(subplotPos)
    Zplot = rgba.reshape(meshShape[0], meshShape[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if Xrr.shape[1] > 1:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        # Only one discriminant axis: spread the points with random vertical jitter
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title(titleText)
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
Ejemplo n.º 32
0
#         check.append(label)
# Feature columns to use: every column of `tst` (defined earlier in the script).
check = tst.columns

# Handle each value 0..511 of 'wheezy-copper-turtle-magic' separately:
# select that subset of train/test, filter features, fit, predict.
for i in range(512):
    train2 = train[train['wheezy-copper-turtle-magic'] == i]
    test2 = test[test['wheezy-copper-turtle-magic'] == i]
    # Keep the original row labels before the training index is reset below;
    # index2 is what addresses this subset's slots in `pree`.
    index1 = train2.index
    index2 = test2.index
    print(index1)
    train2.reset_index(drop=True, inplace=True)
    # Feature filter fitted on this subset's training rows only, then applied
    # to both the training and the test rows of the subset.
    lowvardrop = VarianceThreshold(threshold=1.5).fit(train2[check])
    trainupdted = lowvardrop.transform(train2[check])
    testupdted = lowvardrop.transform(test2[check])
    # NOTE(review): the k-fold loop below is commented out, so only ONE model
    # is fitted per subset, yet its probability is still divided by splits=11.
    # `pree` therefore accumulates 1/11 of the predicted probability - confirm
    # this scaling is intended.
    splits = 11
    # 10% of the subset is held out; X_test / Y_test are not used in the code
    # visible here.
    X_train, X_test, Y_train, Y_test = train_test_split(trainupdted,
                                                        train2['target'],
                                                        test_size=0.1,
                                                        random_state=42)
    # `model` is created earlier in the script and refitted for every subset.
    model = model.fit(X_train, Y_train)
    # Probability of the second class (column 1) for this subset's test rows.
    pree[index2] += (model.predict_proba(testupdted)[:, 1]) / splits
#     folds = StratifiedKFold(n_splits=splits)
#     for traindex, testdex in folds.split(trainupdted, train2['target']):
#         print('hi')
#         print(len(traindex))
#         model.fit(trainupdted[traindex,:],train2.loc[traindex]['target'])
#         res[index1[testdex]] = model.predict_proba(trainupdted[testdex,:])[:,1]
#         pree[index2] += (model.predict_proba(testupdted)[:,1])/splits
# Write the accumulated predictions into the sample submission's 'target' column.
lmao = pd.read_csv('../input/sample_submission.csv')
lmao['target'] = pree.reshape((-1, 1))
lmao.to_csv('submission.csv', index=False)
# QDA model (the variable is named `lda`, but the estimator is quadratic),
# fitted and evaluated on the same points
lda = QuadraticDiscriminantAnalysis().fit(comps, labels)
y_pred = lda.predict(comps)
print(labels)
print(y_pred)
mcc = matthews_corrcoef(labels, y_pred)
print("MCC="+str(mcc))


# Plotting the QDA decision contour over the bounding box of the first two
# components: evaluate the second-class probability on a 200 x 100 grid and
# draw its 0.5 level line.
nx, ny = 200, 100
x_min, x_max = np.amin(comps[:, 0]), np.amax(comps[:, 0])
y_min, y_max = np.amin(comps[:, 1]), np.amax(comps[:, 1])
x_axis = np.linspace(x_min, x_max, nx)
y_axis = np.linspace(y_min, y_max, ny)
xx, yy = np.meshgrid(x_axis, y_axis)
Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
plt.contour(xx, yy, Z, [0.5], linewidths=5, colors = 'k', linestyles = 'dashed')

# Plotting the class means (disabled)
#plt.plot(lda.means_[0][0], lda.means_[0][1],'o', color='black', markersize=10)
#plt.plot(lda.means_[1][0], lda.means_[1][1],'o', color='black', markersize=10)
plt.title('PCA with QDA')

# Red and green data (their scatter plots are disabled)
# NOTE(review): row 26 belongs to neither slice - confirm this gap is intended
output_red = comps[0:26]
output_green = comps[27:52]
#plt.scatter(output_red[:, 0], output_red[:,1], color='r')
#plt.scatter(output_green[:, 0], output_green[:, 1],color='g')
plt.show()
Ejemplo n.º 34
0
# Labels: column 3 of every row after the first, parsed as float then cast to int32
labels_as_float = dataset[1:, 3].astype(np.float64)
ys = labels_as_float.astype(np.int32)

# train
classifier = QDA().fit(xs, ys)

# error rate on the training data itself
prediction = classifier.predict(xs)
n_wrong = np.sum(prediction != ys)
error_rate = n_wrong / ys.shape[0]
print("Error rate: %.2f" % error_rate)

# visualize
# partition: second-class probability on a 100 x 100 grid over the unit
# square, with the 0.5 level line drawn as the decision boundary
nx, ny = 100, 100
xx, yy = np.meshgrid(np.linspace(0, 1, nx), np.linspace(0, 1, ny))
grid_proba = classifier.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = grid_proba[:, 1].reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap='BuPu')
plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='k')

# plot data
positive_xs = xs[ys == 1]
negative_xs = xs[ys == 0]
plt.scatter(positive_xs[:, 0], positive_xs[:, 1],
            c='#00CED1', s=60, label='Great (positive)')
plt.scatter(negative_xs[:, 0],
            negative_xs[:, 1],
            c='#DC143C',
Ejemplo n.º 35
0
def main():
    """Run the homework experiments and produce their tables and figures.

    Problems 1-3: on the USPS digits (read from data/usps next to this file),
    project the data with PCA at several retained-variance levels and tabulate
    the test error of 3-NN next to LDA, then QDA, then Gaussian naive Bayes.
    Problem 4: on MNIST digit subsets, tabulate the test error of LDA and QDA
    after PCA. Problem 5: fit LDA and QDA on the two-gaussians data, print
    their test error and draw each decision boundary.

    Every table/figure is saved as a PDF under hw4/ next to this file and
    then shown.
    """
    # PROBLEMS 1, 2, 3 (USPS Data)
    ####################################################################
    # read in usps data
    usps_dir = os.path.join(os.path.dirname(__file__), 'data', 'usps')

    # training data (the last column of the parsed file is dropped)
    data = pd.read_csv(os.path.join(usps_dir, 'zip.train'),
                       header=None,
                       delimiter=' ').iloc[:, :-1]
    y_train = data.pop(0).values
    X_train = data.values

    # test data
    data = pd.read_csv(os.path.join(usps_dir, 'zip.test'),
                       header=None,
                       delimiter=' ')
    y_test = data.pop(0).values
    X_test = data.values

    # One table per run: (column label, classifier factory, output pdf, title)
    usps_runs = (
        ("LDA", LDA, 'hw4/usps_err_rates_lda.pdf', "USPS Error Rates, LDA"),
        ("QDA", QDA, 'hw4/usps_err_rates_qda.pdf', "USPS Error Rates, QDA"),
        ("GNB", GaussianNB, 'hw4/usps_err_rates_gnb.pdf',
         "USPS Error Rates, GNB"),
    )
    for model_label, model_factory, pdf_name, table_title in usps_runs:
        errs = None
        rows = np.array([])
        cols = np.array([])

        # apply pca
        for var in [.95, .96, .97, .98, .99, 1.]:
            # var == 1 keeps every component
            pca = PCA() if var == 1 else PCA(n_components=var)
            pca.fit(X_train)

            X_train_pca = pca.transform(X_train)
            X_test_pca = pca.transform(X_test)

            n_comps = pca.n_components_
            row_label = "{0:.0f} components ({1:.0f}% variance)".format(
                n_comps, var * 100)
            rows = np.append(rows, row_label)

            # built inside the loop so that the classifiers re-initialize
            # for each variance
            classifiers = {
                "$k$NN ($k$ = 3)": KNeighborsClassifier(n_neighbors=3),
                model_label: model_factory()
            }

            var_errs = np.array([])
            for key, cls in classifiers.items():
                # column labels only need to be collected once
                if len(cols) < len(classifiers):
                    cols = np.append(cols, key)

                fitted = cls.fit(X_train_pca, y_train)
                cls_err = 1 - fitted.score(X_test_pca, y_test)
                var_errs = np.append(var_errs, "{0:.4f}".format(cls_err))

            if errs is None:
                errs = np.array([var_errs])
            else:
                errs = np.vstack((errs, var_errs))

        # make a table of the results
        plt.figure(figsize=(12, 2))
        plt.table(cellText=errs, rowLabels=rows, colLabels=cols,
                  loc='upper left')
        plt.axis('off')
        save_path = os.path.join(os.path.dirname(__file__), pdf_name)
        plt.savefig(save_path, bbox_inches='tight')
        plt.title(table_title)
        plt.show()

    # PROBLEM 4 (MNIST Data)
    ####################################################################
    # read in mnist data
    mndata = MNIST('data/mnist')

    # training data
    images, labels = mndata.load_training()
    X_train = np.array(images)
    y_train = np.array(labels)

    # test data
    images, labels = mndata.load_testing()
    X_test = np.array(images)
    y_test = np.array(labels)

    errs = None
    rows = np.array([])
    cols = np.array([])

    classifiers = {"LDA": LDA(), "QDA": QDA()}
    variances = [.95, .96, .97, .98, .99, 1.]
    subsets = {
        "0, 1": [0, 1],
        "4, 9": [4, 9],
        "0, 1, 2": [0, 1, 2],
        "3, 5, 8": [3, 5, 8]
    }
    # one table row per (variance, classifier), one column per digit subset
    for components in variances:
        for cls_key in classifiers:
            var_errs = np.array([])
            for i, key in enumerate(subsets):
                pca = (PCA() if components == 1
                       else PCA(n_components=components))

                # restrict both splits to the digits of this subset
                train_mask = np.isin(y_train, subsets[key])
                test_mask = np.isin(y_test, subsets[key])
                X_train_sub = X_train[train_mask]
                y_train_sub = y_train[train_mask]
                X_test_sub = X_test[test_mask]
                y_test_sub = y_test[test_mask]

                pca.fit(X_train_sub)

                X_train_pca = pca.transform(X_train_sub)
                X_test_pca = pca.transform(X_test_sub)

                n_comps = pca.n_components_

                # the row label is written once per row, using the component
                # count of the first subset
                if i == 0:
                    rows = np.append(
                        rows, "{0:.0f} components ({1:.0f}% variance) "
                        "+ {2}".format(n_comps, components * 100, cls_key))

                if len(cols) < len(subsets):
                    cols = np.append(cols, key)

                cls = classifiers[cls_key]
                cls.fit(X_train_pca, y_train_sub)
                cls_err = 1 - cls.score(X_test_pca, y_test_sub)
                var_errs = np.append(var_errs, "{0:.4f}".format(cls_err))

            if errs is None:
                errs = np.array([var_errs])
            else:
                errs = np.vstack((errs, var_errs))

    # make a table of the results
    plt.figure(figsize=(12, 2))
    plt.table(cellText=errs, rowLabels=rows, colLabels=cols, loc='upper left')
    plt.axis('off')
    save_path = os.path.join(os.path.dirname(__file__),
                             'hw4/mnist_err_rates.pdf')
    plt.savefig(save_path, bbox_inches='tight')
    plt.title("MNIST Error Rates")
    plt.show()

    # PROBLEM 5
    ###################################################################
    gaussians = loadmat('data/twogaussians/twogaussians.mat')
    X_train = gaussians['Xtr']
    y_train = np.ravel(gaussians['ytr'])

    X_test = gaussians['Xtst']
    y_test = np.ravel(gaussians['ytst'])

    lda = LDA()
    lda.fit(X_train, y_train)
    lda_err_rate = 1 - lda.score(X_test, y_test)
    print('{0:.4f}'.format(lda_err_rate))

    qda = QDA()
    qda.fit(X_train, y_train)
    qda_err_rate = 1 - qda.score(X_test, y_test)
    print('{0:.4f}'.format(qda_err_rate))

    X1 = X_train[y_train == 1]
    X2 = X_train[y_train == 2]

    # One figure per model: training points of both classes plus the 0.5
    # level line of the second-class probability as decision boundary.
    boundary_plots = (
        (lda, lda_err_rate, 'hw4/lda_decision_boundary.pdf',
         "LDA Decision Boundary (Error Rate = {0:.4f})"),
        (qda, qda_err_rate, 'hw4/qda_decision_boundary.pdf',
         "QDA Decision Boundary (Error Rate = {0:.4f})"),
    )
    for model, err_rate, pdf_name, title_format in boundary_plots:
        plt.figure(figsize=(12, 12))
        plt.plot(X1[:, 0], X1[:, 1], 'o', color='red', label='1')
        plt.plot(X2[:, 0], X2[:, 1], 'o', color='blue', label='2')

        # evaluate the model on a grid spanning the current axes limits
        nx, ny = 200, 100
        x_min, x_max = plt.xlim()
        y_min, y_max = plt.ylim()
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                             np.linspace(y_min, y_max, ny))
        Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])
        Z = Z[:, 1].reshape(xx.shape)
        plt.contour(xx, yy, Z, [0.5], linewidths=2., colors='k')

        save_path = os.path.join(os.path.dirname(__file__), pdf_name)
        plt.title(title_format.format(err_rate))
        plt.savefig(save_path, bbox_inches='tight')
        plt.show()