Example #1
    def test_perceptron(self):
        # train the perceptron model
        perceptron = Perceptron(learning_rate=0.1, num_epochs=10)
        perceptron.fit(self.x, self.y)

        # plot learning
        curve = {
            'cost_length': len(perceptron.cost),
            'cost': perceptron.cost,
            'marker': 'o',
            'x_label': 'Epochs',
            'y_label': 'Number of updates',
            'title': 'Perceptron - Learning rate 0.1'
        }
        Plotter.plot_learning_curve(
            curve,
            FilesystemUtils.get_test_resources_plot_file_name(
                'perceptron/Perceptron-Learning-Curve.png'))

        # plot decision boundary
        diagram_options = {
            'x_label': 'sepal length [cm]',
            'y_label': 'petal length [cm]',
            'legend': 'upper left'
        }
        Plotter.plot_decision_boundary(
            self.x,
            self.y,
            classifier=perceptron,
            diagram_options=diagram_options,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'perceptron/Perceptron-Decision-Boundary.png'))
Example #2
    def test_scikit_learn_decision_tree(self):
        # Train the decision tree.
        # Most algorithms in scikit-learn already support multiclass classification via the One-versus-Rest (OvR) method
        tree = DecisionTreeClassifier(criterion='gini',
                                      max_depth=4,
                                      random_state=1)
        tree.fit(self.x_train, self.y_train)

        self.predict_and_evaluate(
            tree,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'decision_tree/DecisionTree-ScikitLearn-Decision-Boundary.png')
        )

        # save the tree as a diagram
        dot_data = export_graphviz(
            tree,
            filled=True,
            rounded=True,
            class_names=['Setosa', 'Versicolor', 'Virginica'],
            feature_names=['petal length', 'petal width'],
            out_file=None)
        graph = graph_from_dot_data(dot_data)
        graph.write_png(
            FilesystemUtils.get_test_resources_plot_file_name(
                'decision_tree/DecisionTree-ScikitLearn-Tree-Diagram.png'))
Example #3
    def test_logistic_regression(self):
        # train the logistic regression model
        logistic_regression = LogisticRegressionBGD(learning_rate=0.05,
                                                    num_epochs=1000)
        logistic_regression.fit(self.x, self.y)

        # plot learning curve
        curve = {
            'cost_length': len(logistic_regression.cost),
            'cost': np.log10(logistic_regression.cost),
            'marker': 'o',
            'x_label': 'Epochs',
            'y_label': 'log(Cost)',
            'title': 'Logistic regression - Learning rate 0.05'
        }
        Plotter.plot_learning_curve(
            curve,
            FilesystemUtils.get_test_resources_plot_file_name(
                'logistic_regression/LogisticRegressionBGD-Learning-Curve.png')
        )

        # plot decision boundary
        diagram_options = {
            'x_label': 'sepal length [cm]',
            'y_label': 'petal length [cm]',
            'legend': 'upper left'
        }
        Plotter.plot_decision_boundary(
            self.x,
            self.y,
            classifier=logistic_regression,
            diagram_options=diagram_options,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'logistic_regression/LogisticRegressionBGD-Decision-Boundary.png'
            ))
Example #4
    def test_adaline(self):
        # train the first adaline model with a larger learning rate
        adaline1 = AdalineBGD(learning_rate=0.01, num_epochs=30)
        adaline1.fit(self.x, self.y)

        # train the second adaline model with a smaller learning rate
        adaline2 = AdalineBGD(learning_rate=0.0001, num_epochs=30)
        adaline2.fit(self.x, self.y)

        # plot the learning curves of both trained adaline models
        curves = [{
            'cost_length': len(adaline1.cost),
            'cost': np.log10(adaline1.cost),
            'marker': 'o',
            'x_label': 'Epochs',
            'y_label': 'log(Sum-squared-error)',
            'title': 'Adaline - Learning rate 0.01'
        }, {
            'cost_length': len(adaline2.cost),
            'cost': np.log10(adaline2.cost),
            'marker': 'o',
            'x_label': 'Epochs',
            'y_label': 'log(Sum-squared-error)',
            'title': 'Adaline - Learning rate 0.0001'
        }]
        Plotter.plot_multiple_learning_curves(
            curves,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'adaline/AdalineBGD-Learning-Curves.png'))

        # plot decision boundary for divergent model (adaline 1)
        Plotter.plot_decision_boundary(
            self.x,
            self.y,
            classifier=adaline1,
            diagram_options={
                'x_label': 'sepal length [cm]',
                'y_label': 'petal length [cm]',
                'legend': 'upper left'
            },
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'adaline/AdalineBGD-Decision-Boundary-Divergent.png'))

        # plot decision boundary for convergent model (adaline 2)
        Plotter.plot_decision_boundary(
            self.x,
            self.y,
            classifier=adaline2,
            diagram_options={
                'x_label': 'sepal length [cm]',
                'y_label': 'petal length [cm]',
                'legend': 'upper left'
            },
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'adaline/AdalineBGD-Decision-Boundary-Convergent.png'))
Example #5
    def setUp(self):
        # load subset of Iris data
        iris_data_reader = IrisDataReader(
            FilesystemUtils.get_resources_data_file_name('iris/iris.data'))
        self.x, self.y = iris_data_reader.get_data()

        # plot the data and save it to a file
        Plotter.plot_iris_data_set(
            self.x,
            FilesystemUtils.get_test_resources_plot_file_name(
                'adaline/Adaline-Training-Set.png'))
Example #6
    def test_scikit_learn_svm_nonlinear_iris(self):
        svm = SVC(kernel='rbf', random_state=1, gamma=0.2, C=1.0)
        svm.fit(self.x_train, self.y_train)

        self.predict_and_evaluate(
            svm,
            FilesystemUtils.get_test_resources_plot_file_name(
                'svm/SVM-ScikitLearn-LowGamma-Decision-Boundary.png'))

        svm = SVC(kernel='rbf', random_state=1, gamma=100.0, C=1.0)
        svm.fit(self.x_train, self.y_train)
        self.predict_and_evaluate(
            svm,
            FilesystemUtils.get_test_resources_plot_file_name(
                'svm/SVM-ScikitLearn-HighGamma-Decision-Boundary.png'))
Example #7
    def test_draw_impurity_criteria_types(self):
        # For a visual comparison of the three different impurity criteria (Entropy, Gini, Misclassification Error),
        # let us plot the impurity indices for the probability range [0, 1] for class 1.
        # Note that we will also add a scaled version of the entropy (entropy / 2) to observe that the Gini impurity is
        # an intermediate measure between entropy and the classification error.

        x_range = np.arange(start=0.0, stop=1.0, step=0.01, dtype=float)

        # compute entropy and scaled entropy
        ent = [self.entropy(p) if p != 0 else None for p in x_range]
        sc_ent = [e * 0.5 if e else None for e in ent]

        # compute gini
        gn = [self.gini(p) for p in x_range]

        # compute misclassification error
        err = [self.misclassification_error(p) for p in x_range]

        Plotter.plot_impurity_criteria(
            x_range,
            entropy=ent,
            scaled_entropy=sc_ent,
            gini=gn,
            misclassification_error=err,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'decision_tree/DecisionTree-ScikitLearn-Impurity-Criteria.png')
        )
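The helpers self.entropy, self.gini and self.misclassification_error are defined elsewhere in the test class; a minimal standalone sketch of what they would compute, assuming the standard binary-class definitions, could look like this:

import numpy as np

def entropy(p):
    # binary entropy, defined for p in (0, 1): -p*log2(p) - (1-p)*log2(1-p)
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

def gini(p):
    # Gini impurity for two classes: p*(1-p) + (1-p)*p = 2*p*(1-p)
    return p * (1 - p) + (1 - p) * (1 - (1 - p))

def misclassification_error(p):
    # classification error: 1 - max(p, 1-p)
    return 1 - np.max([p, 1 - p])

print(entropy(0.5), gini(0.5), misclassification_error(0.5))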
Example #8
    def test_scikit_learn_validation_curves_on_wdbc_pipeline(self):
        # create a pipeline to test
        wdbc_pipeline = make_pipeline(
            StandardScaler(), PCA(n_components=2),
            LogisticRegression(solver='lbfgs', random_state=42))

        # prepare validation curves

        # Validation curves are a useful tool for improving the performance of a model by addressing issues such as
        # overfitting or underfitting.
        # Validation curves are related to learning curves, but instead of plotting the training and test accuracies as
        # functions of the sample size, we vary the values of the model parameters, for example, the
        # inverse regularization parameter C in logistic regression.

        num_folds = 10
        param_range = np.array([0.001, 0.01, 0.1, 1.0, 10.0, 100.0])

        train_scores, test_scores = validation_curve(
            estimator=wdbc_pipeline,
            X=self.x_train,
            y=self.y_train,
            param_name='logisticregression__C',
            param_range=param_range,
            cv=num_folds,
            n_jobs=-1)

        image_file_path = FilesystemUtils.get_test_resources_plot_file_name(
            'model_performance/ModelPerformance-ScikitLearn-ValidationCurves.png'
        )
        Plotter.plot_validation_curves(param_range,
                                       train_scores,
                                       test_scores,
                                       image_file_path=image_file_path)
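validation_curve returns one row of scores per value in param_range (one column per fold), so a natural follow-up, sketched below with the arrays from the test above, is to average over the folds and pick the C with the best mean cross-validation accuracy:

mean_test_scores = np.mean(test_scores, axis=1)   # one mean CV score per value of C
best_index = int(np.argmax(mean_test_scores))
print('Best C: {} (mean CV accuracy {:.3f})'.format(param_range[best_index],
                                                    mean_test_scores[best_index]))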
Example #9
    def test_scikit_learn_learning_curves_on_wdbc_pipeline(self):
        # create a pipeline to test
        wdbc_pipeline = make_pipeline(
            StandardScaler(), PCA(n_components=2),
            LogisticRegression(solver='lbfgs', random_state=42))

        # prepare learning curves

        # Via the train_sizes parameter in the learning_curve function, we can control the absolute or relative number
        # of training samples that are used to generate the learning curves.
        # Here, we set train_sizes=np.linspace(0.1, 1.0, 10) to use 10 evenly spaced, relative intervals for the
        # training set sizes. By default, the learning_curve function uses stratified k-fold cross-validation to
        # calculate the cross-validation accuracy of a classifier, and we set k=num_folds via the cv parameter for
        # k-fold stratified cross-validation.
        # Finally, we plot the diagram using a helper function.

        num_folds = 10

        train_sizes, train_scores, test_scores = learning_curve(
            estimator=wdbc_pipeline,
            X=self.x_train,
            y=self.y_train,
            train_sizes=np.linspace(0.1, 1.0, 10),
            cv=num_folds,
            n_jobs=-1)

        image_file_path = FilesystemUtils.get_test_resources_plot_file_name(
            'model_performance/ModelPerformance-ScikitLearn-LearningCurves.png'
        )
        Plotter.plot_performance_curves(train_sizes,
                                        train_scores,
                                        test_scores,
                                        image_file_path=image_file_path)
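learning_curve likewise returns one row of scores per training-set size; here is a small sketch, reusing the arrays from the test above, of how the per-fold scores could be summarized before plotting:

train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)
for size, tr_acc, cv_acc in zip(train_sizes, train_mean, test_mean):
    print('{:4d} training samples: train acc {:.3f}, CV acc {:.3f}'.format(int(size), tr_acc, cv_acc))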
Example #10
    def test_scikit_learn_confusion_matrix_by_svm(self):
        # create a pipeline to test
        wdbc_pipeline = make_pipeline(
            StandardScaler(),
            SVC(random_state=42)
        )

        wdbc_pipeline.fit(self.x_train, self.y_train)

        y_pred = wdbc_pipeline.predict(self.x_test)

        cm = confusion_matrix(y_true=self.y_test, y_pred=y_pred)

        # display confusion matrix
        print('Confusion matrix')
        print(cm)

        # display metrics
        print('Precision: {:.3f}'.format(precision_score(y_true=self.y_test, y_pred=y_pred)))
        print('Recall: {:.3f}'.format(recall_score(y_true=self.y_test, y_pred=y_pred)))
        print('F1: {:.3f}'.format(f1_score(y_true=self.y_test, y_pred=y_pred)))

        # The confusion matrix returned above tells us about the different types of errors
        # the classifier made on the test dataset.

        image_file_path = FilesystemUtils.get_test_resources_plot_file_name(
            'model_performance/ModelPerformance-ScikitLearn-ConfusionMatrix.png'
        )
        Plotter.plot_confusion_matrix(cm, image_file_path=image_file_path)
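For the binary WDBC problem the confusion matrix is 2x2, so the metrics printed above can also be recomputed by hand; a minimal sketch, reusing the cm computed in the test (and assuming label 1 is the positive class):

tn, fp, fn, tp = cm.ravel()   # scikit-learn orders the 2x2 matrix as [[tn, fp], [fn, tp]]
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print('Manual precision={:.3f}, recall={:.3f}, F1={:.3f}'.format(precision, recall, f1))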
Example #11
    def load_wine_data_set(self):
        # Load a non-linearly separable dataset.
        # There are 13 different features in the Wine dataset, describing the chemical properties of the 178 wine
        # samples, and each sample belongs to one of three different classes, 1, 2, and 3, which refer to the three
        # different types of grape grown in the same region in Italy but derived from different wine cultivars, as
        # described in the dataset summary (https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names).

        # load wine data set
        df = pd.read_csv(
            FilesystemUtils.get_resources_data_file_name('wine/wine.data'),
            header=None)

        # create headers
        df.columns = [
            'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
            'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
            'Proanthocyanins', 'Color intensity', 'Hue',
            'OD280/OD315 of diluted wines', 'Proline'
        ]

        # print sample info:
        # values of class labels
        print('Class labels', np.unique(df['Class label']))
        # 10% of the shuffled data set samples with their class labels (shuffling is necessary as rows are sorted
        # according to class label values in increasing order)
        print(df.sample(frac=0.1).to_string())

        # split training and test set
        # separate class labels from features
        self.x, self.y = df.iloc[:, 1:].values, df.iloc[:, 0].values

        # save column names for later usage
        self.df_columns = df.columns
Example #12
    def test_variable_L1_regularization_strength(self):
        # first, split the entire dataset into training and testing subsets to simulate a real
        # training scenario
        X_train, X_test, Y_train, Y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=0.3,
                                                            random_state=0,
                                                            stratify=self.y)

        # standardize the training set
        X_train_standardized = StandardScaler().fit_transform(X_train)

        weights, params = [], []

        for c in np.arange(-4., 6.):
            # the liblinear solver supports the L1 penalty
            lr = LogisticRegression(penalty='l1',
                                    solver='liblinear',
                                    C=10.**c,
                                    random_state=0)
            lr.fit(X_train_standardized, Y_train)
            weights.append(lr.coef_[1])
            params.append(10**c)

        weights = np.array(weights)

        Plotter.plot_variable_feature_weights(
            weights,
            params,
            self.df_columns,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'regularization/Logistic-Regression-Variable-L1-Regularized-Strength.png'
            ))
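Since L1 regularization drives weights to exactly zero, a complementary view of the same experiment is to count the non-zero coefficients for each C; a short sketch, reusing X_train_standardized and Y_train from the test above (the loop itself is illustrative):

for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l1', solver='liblinear', C=10.**c, random_state=0)
    lr.fit(X_train_standardized, Y_train)
    print('C=10^{:+.0f}: {} non-zero coefficients'.format(c, int(np.sum(lr.coef_ != 0))))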
Example #13
    def load_iris_dataset(self):
        # Loading the Iris dataset from scikit-learn.
        # The classes are already converted to integer labels where 0=Iris-Setosa, 1=Iris-Versicolor, 2=Iris-Virginica.
        iris = datasets.load_iris()
        x = iris.data[:, [2, 3]]
        y = iris.target
        print('Class labels:', np.unique(y))

        # plot the data and save it to a file
        Plotter.plot_iris_data_set(x, FilesystemUtils.get_test_resources_plot_file_name(
            'ScikitLearn-Iris-Training-Set.png'))

        # Splitting data into 70% training and 30% test data
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1, stratify=y)
        print('Labels counts in y:', np.bincount(y))
        print('Labels counts in y_train:', np.bincount(y_train))
        print('Labels counts in y_test:', np.bincount(y_test))

        # Standardize features: fit the scaler on the training set only and reuse it
        # to transform the test set with the same statistics
        sc = StandardScaler()
        sc.fit(x_train)
        x_train_std = sc.transform(x_train)
        x_test_std = sc.transform(x_test)

        self.x_train = x_train_std
        self.y_train = y_train

        self.x_test = x_test_std
        self.y_test = y_test
Example #14
    def test_feature_importance(self):
        # split training and testing dataset
        x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=0.25,
                                                            random_state=42)

        # standardize features
        stdsc = StandardScaler()
        x_train_std = stdsc.fit_transform(x_train)

        # use random forest
        forest = RandomForestClassifier(n_estimators=500, random_state=42)
        forest.fit(x_train_std, y_train)

        importance = forest.feature_importances_
        indices = np.argsort(importance)[::-1]

        feature_names = self.df_columns[1:]
        for f in range(x_train_std.shape[1]):
            print(
                "%2d) %-*s %f" %
                (f + 1, 30, feature_names[indices[f]], importance[indices[f]]))

        Plotter.plot_feature_importance(
            x_train_std.shape[1], importance[indices], feature_names[indices],
            FilesystemUtils.get_test_resources_plot_file_name(
                'feature_importance/FeatureImportance.png'))
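The importances computed above can also drive automatic feature selection; here is a minimal sketch using scikit-learn's SelectFromModel on the already fitted forest (the 0.1 threshold is an arbitrary choice for illustration):

from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
x_selected = sfm.transform(x_train_std)
print('Number of features whose importance meets the threshold:', x_selected.shape[1])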
Example #15
    def test_adaline_with_stochastic_update(self):
        # standardize features
        x_std: np.ndarray = np.copy(self.x)
        x_std[:, 0] = (self.x[:, 0] - self.x[:, 0].mean()) / self.x[:, 0].std()
        x_std[:, 1] = (self.x[:, 1] - self.x[:, 1].mean()) / self.x[:, 1].std()

        # plot the data and save it to a file
        Plotter.plot_iris_data_set(
            x_std,
            FilesystemUtils.get_test_resources_plot_file_name(
                'adaline/AdalineSGD-Standardized-Training-Set.png'))

        # train adaline on standardized features with a small number of epochs
        adaline = AdalineSGD(learning_rate=0.01, num_epochs=15)
        adaline.fit(x_std, self.y)

        # plot learning curve
        curve = {
            'cost_length': len(adaline.cost),
            'cost': adaline.cost,
            'marker': 'o',
            'x_label': 'Epochs',
            'y_label': 'Sum-squared-error',
            'title': 'Adaline - Learning rate 0.01'
        }
        Plotter.plot_learning_curve(
            curve,
            FilesystemUtils.get_test_resources_plot_file_name(
                'adaline/AdalineSGD-Learning-Curve-Standardized-Features.png'))

        # plot decision boundary
        Plotter.plot_decision_boundary(
            x_std,
            self.y,
            classifier=adaline,
            diagram_options={
                'x_label': 'sepal length [cm]',
                'y_label': 'petal length [cm]',
                'legend': 'upper left'
            },
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'adaline/AdalineSGD-Decision-Boundary-Standardized-Features.png'
            ))

        adaline.partial_fit(x_std[0, :], self.y[0])
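The final partial_fit call above updates the model with a single sample; purely as an illustration of online learning, the same idea can be applied in a loop over the standardized samples (y stands for self.y from the test, and this assumes partial_fit accepts one sample at a time, as in the call above):

for xi, target in zip(x_std, y):
    adaline.partial_fit(xi, target)   # one weight update per incoming sample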
Example #16
    def test_scikit_learn_perceptron(self):
        # Train the perceptron.
        # Most algorithms in scikit-learn already support multiclass classification via the One-versus-Rest (OvR) method
        perceptron = Perceptron(max_iter=40, eta0=0.1, random_state=1)
        perceptron.fit(self.x_train, self.y_train)

        self.predict_and_evaluate(perceptron,
                                  image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                                      'perceptron/Perceptron-ScikitLearn-Decision-Boundary.png'))
Example #17
    def test_scikit_learn_knn(self):
        # Train the k-nearest neighbors classifier.
        # KNN handles multiclass classification natively by majority vote among the nearest neighbors
        knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
        knn.fit(self.x_train, self.y_train)

        self.predict_and_evaluate(
            knn,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'knn/KNN-ScikitLearn-Decision-Boundary.png'))
Example #18
    def test_scikit_learn_svm(self):
        # Train the SVM.
        # Most algorithms in scikit-learn already support multiclass classification via the One-versus-Rest (OvR) method
        svm = SVC(kernel='linear', C=1.0, random_state=1)
        svm.fit(self.x_train, self.y_train)

        self.predict_and_evaluate(
            svm,
            FilesystemUtils.get_test_resources_plot_file_name(
                'svm/SVM-ScikitLearn-Decision-Boundary.png'))
Example #19
    def test_scikit_learn_random_forest(self):
        forest = RandomForestClassifier(criterion='gini',
                                        n_estimators=25,
                                        random_state=1,
                                        n_jobs=2)
        forest.fit(self.x_train, self.y_train)

        self.predict_and_evaluate(
            forest,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'decision_tree/RandomForest-ScikitLearn-Decision-Boundary.png')
        )
Example #20
    def test_scikit_learn_perceptron_by_SGDClassifier(self):
        # Train a perceptron through the SGDClassifier class.
        # Most algorithms in scikit-learn already support multiclass classification via the One-versus-Rest (OvR) method.
        # Sometimes our datasets are too large to fit into computer memory; thus, scikit-learn also offers an alternative
        # implementation via the SGDClassifier class, which also supports online learning via the partial_fit method.
        # The concept behind the SGDClassifier class is similar to the stochastic gradient descent algorithm.
        perceptron = SGDClassifier(loss='perceptron')
        perceptron.fit(self.x_train, self.y_train)

        self.predict_and_evaluate(perceptron,
                                  FilesystemUtils.get_test_resources_plot_file_name(
                                      'perceptron/Perceptron-ScikitLearn-Classifier-Decision-Boundary.png'))
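To make the out-of-core idea mentioned in the comment concrete, here is a minimal sketch of partial_fit with SGDClassifier on data that arrives in chunks; x_train and y_train stand for the same training arrays used above, and splitting into five chunks is just a stand-in for reading batches from disk:

import numpy as np
from sklearn.linear_model import SGDClassifier

online_perceptron = SGDClassifier(loss='perceptron')
classes = np.unique(y_train)   # partial_fit needs the full set of class labels up front
for x_chunk, y_chunk in zip(np.array_split(x_train, 5), np.array_split(y_train, 5)):
    online_perceptron.partial_fit(x_chunk, y_chunk, classes=classes)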
Example #21
    def load_svm_nonlinear_data_set(self):
        # Load a non-linearly separable dataset
        np.random.seed(1)
        x_xor = np.random.randn(200, 2)
        y_xor = np.logical_xor(x_xor[:, 0] > 0, x_xor[:, 1] > 0)
        y_xor = np.where(y_xor, 1, -1)

        # plotter data and save it to file
        Plotter.plot_svm_nonlinear_data_set(
            x_xor, y_xor,
            FilesystemUtils.get_test_resources_plot_file_name(
                'svm/SVM-ScikitLearn-NonLinear-Training-Set.png'))

        self.x_train = x_xor
        self.y_train = y_xor
Example #22
    def test_scikit_learn_svm_nonlinear(self):
        # The γ parameter, which we set to gamma=0.1, can be understood as a cut-off parameter for the Gaussian sphere.
        # If we increase the value of γ, we decrease the influence or reach of each training sample, which leads to a
        # tighter and bumpier decision boundary.
        svm = SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0)
        svm.fit(self.x_train, self.y_train)

        diagram_options = {
            'x_label': 'feature 1',
            'y_label': 'feature 2',
            'legend': 'best'
        }
        Plotter.plot_decision_boundary(
            self.x_train,
            self.y_train,
            svm,
            diagram_options,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'svm/SVM-ScikitLearn-NonLinear-Decision-Boundary.png'))
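The role of γ is easier to see from the kernel itself: the RBF kernel used by the SVC above is k(a, b) = exp(-gamma * ||a - b||^2), so a larger γ makes the similarity fall off faster with distance. A tiny numpy sketch:

import numpy as np

def rbf_kernel(a, b, gamma=0.10):
    # similarity between two samples under the Gaussian (RBF) kernel
    return np.exp(-gamma * np.sum((a - b) ** 2))

print(rbf_kernel(np.array([0.0, 0.0]), np.array([1.0, 1.0])))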
Example #23
    def test_elbow_method(self):
        # To quantify the quality of clustering, we need to use intrinsic metrics, such as the within-cluster
        # Sum of Squared Errors (SSE), which is sometimes also called cluster inertia or distortion, to compare the
        # performance of different k-means clusterings.
        # Conveniently, we don't need to compute the within-cluster SSE explicitly when we are using scikit-learn, as it
        # is already accessible via the inertia_ attribute after fitting a KMeans model:
        #     >>> print('Distortion: %.2f' % km.inertia_)
        #     Distortion: 72.48
        # Based on the within-cluster SSE, we can use a graphical tool, the so-called elbow method, to estimate the
        # optimal number of clusters k for a given task. Intuitively, as k increases, the distortion decreases,
        # because the samples end up closer to the centroids they are assigned to.
        # The idea behind the elbow method is to identify the value of k where the distortion stops decreasing as
        # rapidly, which shows up as an "elbow" in the plot.

        distortions = []
        k_range = range(1, 11)
        for i in k_range:
            km = KMeans(
                # number of clusters
                n_clusters=i,
                # initialization method
                init='k-means++',
                # number of runs with different centroid seeds
                n_init=10,
                # maximum number of iterations
                max_iter=300,
                # minimum within-cluster change; if the improvement falls below this tolerance, training stops
                tol=1e-04,
                # initialization seed
                random_state=42)

            km.fit(self.x)

            distortions.append(km.inertia_)

        data = np.column_stack((k_range, distortions))
        Plotter.plot_data(
            data,
            x_label='Number of clusters',
            y_label='Distortion',
            title='Elbow method for optimal number of clusters',
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'clustering/ElbowMethodForKMeansClustering.png'))
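Besides reading the elbow off the plot, the same distortions can be inspected numerically; here is a small sketch, reusing k_range and distortions from the test above, that prints the relative improvement contributed by each additional cluster:

for k, prev, curr in zip(list(k_range)[1:], distortions[:-1], distortions[1:]):
    print('k={}: distortion reduced by {:.1f}%'.format(k, 100.0 * (prev - curr) / prev))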
Example #24
    def test_scikit_learn_roc_auc_on_wdbc_pipeline(self):
        # create a pipeline to test
        wdbc_pipeline = make_pipeline(
            StandardScaler(),
            PCA(n_components=2),
            LogisticRegression(solver='lbfgs', penalty='l2', C=100.0, random_state=42)
        )

        # create a stratified n_splits-fold cross-validation
        # In this example, we actually use 3 folds on the training set
        cv = StratifiedKFold(n_splits=3, random_state=42).split(self.x_train, self.y_train)

        false_positive_rates = []
        true_positive_rates = []
        roc_auc_values = []

        # let's use just 2 of the features to have a more interesting diagram
        x_train_reduced = self.x_train[:, [4, 14]]

        # iterate over the folds to draw the related ROC curve
        for i, (train, test) in enumerate(cv):
            # compute the probabilities predicted by the classifier using the current fold
            probs = wdbc_pipeline.fit(x_train_reduced[train], self.y_train[train]).predict_proba(x_train_reduced[test])

            # compute ROC curve arrays
            fpr, tpr, thresholds = roc_curve(self.y_train[test], probs[:, 1], pos_label=1)

            # compute AUC (Area Under Curve)
            roc_auc = auc(fpr, tpr)

            false_positive_rates.append(fpr)
            true_positive_rates.append(tpr)
            roc_auc_values.append(roc_auc)

        # display results
        image_file_path = FilesystemUtils.get_test_resources_plot_file_name(
            'model_performance/ModelPerformance-ScikitLearn-ROC_AUC.png'
        )
        Plotter.plot_roc_auc(
            np.asarray(false_positive_rates), np.asarray(true_positive_rates), np.asarray(roc_auc_values),
            image_file_path=image_file_path
        )
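A common follow-up to per-fold ROC curves is a mean curve; here is a minimal sketch, reusing the per-fold arrays collected above, that interpolates each fold onto a common grid of false-positive rates before averaging (the 100-point grid is arbitrary):

mean_fpr = np.linspace(0.0, 1.0, 100)
interpolated_tprs = [np.interp(mean_fpr, fpr, tpr)
                     for fpr, tpr in zip(false_positive_rates, true_positive_rates)]
mean_tpr = np.mean(interpolated_tprs, axis=0)
print('Mean ROC AUC over the folds: {:.3f}'.format(auc(mean_fpr, mean_tpr)))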
Example #25
    def test_sequential_feature_selection(self):
        knn = KNeighborsClassifier(n_neighbors=5)

        # standardize features
        stdsc = StandardScaler()
        x_std = stdsc.fit_transform(self.x)

        # selecting features
        sbs = SequentialFeatureSelection(knn, selected_features_number=1)
        sbs.fit(x_std, self.y)

        Plotter.plot_accuracy_by_feature_number(
            sbs.subsets_, sbs.scores_,
            FilesystemUtils.get_test_resources_plot_file_name(
                'sequential_feature_selection/AccuracyByFeatureNumber.png'))

        # for each subset of features, print the column names, to see how the elimination process worked
        print('Progressive selection explained')
        for feature_subset in sbs.subsets_:
            indices = list(feature_subset)
            print(self.df_columns[1:][indices])
Example #26
    def plot_predictions(self, predictions: np.matrix, centroids: np.matrix,
                         title: str, diagram_file_name: str):
        data = [{
            'x': self.x[predictions == 0, :],
            'color': 'lightgreen',
            'marker': 's',
            'marker_size': 50,
            'edge_color': 'black',
            'label': 'cluster 1'
        }, {
            'x': self.x[predictions == 1, :],
            'color': 'orange',
            'marker': 's',
            'marker_size': 50,
            'edge_color': 'black',
            'label': 'cluster 2'
        }, {
            'x': self.x[predictions == 2, :],
            'color': 'lightblue',
            'marker': 's',
            'marker_size': 50,
            'edge_color': 'black',
            'label': 'cluster 3'
        }]

        centroids = {
            'x': centroids,
            'color': 'red',
            'marker': '*',
            'marker_size': 250,
            'edge_color': 'black',
            'label': 'centroids'
        }

        Plotter.plot_multiple_scattered_data(
            data,
            centroids,
            title,
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'clustering/' + diagram_file_name))
Example #27
    def setUp(self):
        # load subset of Iris data
        iris = datasets.load_iris()
        x_train = iris.data[:, [2, 3]]
        y_train = iris.target

        # consider only 0 and 1 labels
        x_train_01_subset = x_train[(y_train == 0) | (y_train == 1)]
        y_train_01_subset = y_train[(y_train == 0) | (y_train == 1)]

        # Standardize features
        sc = StandardScaler()
        sc.fit(x_train_01_subset)
        self.x = sc.transform(x_train_01_subset)

        self.y = y_train_01_subset
        print('Class labels:', np.unique(self.y))

        # plot the data and save it to a file
        Plotter.plot_iris_data_set(
            self.x,
            FilesystemUtils.get_test_resources_plot_file_name(
                'logistic_regression/LogisticRegressionBGD-Training-Set.png'))
Example #28
    def test_plot_sample_clustered_data(self):
        Plotter.plot_scattered_data(
            self.x,
            title='Sample clustered data',
            image_file_path=FilesystemUtils.get_test_resources_plot_file_name(
                'clustering/SampleClusteredData.png'))