Ejemplo n.º 1
0
class SklearnClassifier(BaseClassifier):
    def load_dataset(self, dataset_path=None):
        """Load the training matrices ``X.npz`` and ``Y.npy`` from a directory.

        Parameters
        ----------
        dataset_path : str, optional
            Directory that holds ``X.npz`` and ``Y.npy``. When omitted, the
            placeholder ``"invalid path!"`` is stored in ``self.dspath`` and
            used for the lookup.

        Notes
        -----
        Sets ``self.dspath``, ``self.X`` and ``self.Y``. An ``IOError`` raised
        while loading is reported with a printed message and not re-raised,
        so ``self.X`` and/or ``self.Y`` may be left unset afterwards.
        """
        self.dspath = dataset_path if dataset_path is not None else "invalid path!"
        try:
            # Join onto self.dspath rather than the raw argument: joining onto
            # None fails inside os.path.join with an error that is not an
            # IOError, so it would escape the handler below.
            # Presumably the placeholder path fails to open and ends up in the
            # IOError branch -- confirm against load_sparse_matrix.
            self.X = load_sparse_matrix(os.path.join(self.dspath, "X.npz"))
            self.Y = np.load(os.path.join(self.dspath, "Y.npy"))
        except IOError:
            print("Seems model files (X.npz, Y.npy) are not found...")

    def trainSGD(self):
        """Fit a one-vs-rest ``SGDClassifier`` and store it in ``self.glm``.

        Hyper-parameters are read from the instance: ``self.loss``,
        ``self.reg`` (penalty), ``self.alpha``, ``self.epochs`` (passed as
        ``n_iter``) and ``self.multicpu`` (passed as ``n_jobs``). Training
        data comes from ``self.X`` and ``self.Y``.

        When ``self.kernel_approx`` is exactly ``True`` the features are first
        mapped through an ``RBFSampler`` with 100 components.

        A completion line tagged with ``self.dspath`` is printed at the end.
        """
        sgd = SGDClassifier(
            loss=self.loss,
            penalty=self.reg,
            alpha=self.alpha,
            n_iter=self.epochs,
            shuffle=True,
            n_jobs=self.multicpu,
            class_weight="auto",
        )
        train_features = self.X
        if self.kernel_approx is True:
            # n_components is a count of sampled features, so it is passed as
            # an int (it was previously the float 100.0).
            rbf_feature = RBFSampler(gamma=1, n_components=100, random_state=1)
            # NOTE(review): the fitted sampler is local to this method and is
            # not kept on the instance, while predict_f / predict_f_prob hand
            # the loaded test matrix straight to self.glm. A model trained on
            # this branch therefore sees untransformed features at prediction
            # time -- confirm whether callers transform the data themselves.
            train_features = rbf_feature.fit_transform(self.X)
        self.glm = OneVsRestClassifier(sgd).fit(train_features, self.Y)
        print("Classifier (sklearn SGD): Done. \t(%s)" % self.dspath)

    def trainSVM(self):
        """Fit a one-vs-rest RBF-kernel ``SVC`` on ``self.X`` / ``self.Y``.

        The fitted ``OneVsRestClassifier`` is stored in ``self.glm``. A
        progress line tagged with ``self.dspath`` is printed before and after
        fitting.
        """
        # NOTE(review): the progress messages say "NuSVC" although the
        # estimator built here is SVC; the message text is kept unchanged.
        svc_options = dict(
            C=1.0,
            kernel="rbf",
            degree=3,
            gamma=0.0,
            coef0=0.0,
            shrinking=True,
            probability=True,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=True,
            max_iter=-1,
        )
        base_estimator = SVC(**svc_options)
        print("Classifier (sklearn NuSVC): training the model \t(%s)" % self.dspath)
        one_vs_rest = OneVsRestClassifier(base_estimator)
        self.glm = one_vs_rest.fit(self.X, self.Y)
        print("Classifier (sklearn NuSVC): Done. \t(%s)" % self.dspath)

    def predict_f(self, testset_path=None, X=None, Y=None):
        """Predict class labels with the fitted model ``self.glm``.

        Parameters
        ----------
        testset_path : str, optional
            Directory containing ``X.npz``. Takes precedence over ``X`` when
            both are supplied, matching the previous behaviour.
        X : optional
            Feature data passed to ``self.glm.predict`` when ``testset_path``
            is None.
        Y : optional
            Unused; kept so existing call sites keep working.

        Returns
        -------
        pred : whatever ``self.glm.predict`` returns for the test data.

        Raises
        ------
        ValueError
            If neither ``testset_path`` nor ``X`` is given.
        """
        if testset_path is not None:
            # Only the features are needed to predict; Y.npy is no longer
            # read, so a test set without labels can still be scored.
            Xtest = load_sparse_matrix(os.path.join(testset_path, "X.npz"))
        elif X is not None:
            Xtest = X
        else:
            raise ValueError("predict_f requires either testset_path or X")
        return self.glm.predict(Xtest)

    def predict_f_prob(self, testset_path=None, X=None, Y=None):
        """Return class-membership probabilities from the fitted ``self.glm``.

        Parameters
        ----------
        testset_path : str, optional
            Directory containing the test set file ``X.npz``. Takes precedence
            over ``X`` when both are supplied, matching the previous behaviour.

        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Data, used when ``testset_path`` is None.

        Y : numpy array of shape [n_samples]
            Multi-class targets. Unused; kept so existing call sites keep
            working.

        Returns
        -------
        pred_p : {array-like}, shape = [n_samples, n_classes]

        Raises
        ------
        NotImplementedError
            If ``self.glm`` does not expose ``predict_proba``.
        ValueError
            If neither ``testset_path`` nor ``X`` is given.
        """
        # The scikit-learn method is named predict_proba; the former check for
        # "predict_prob" never matched, so this method always raised.
        if not hasattr(self.glm, "predict_proba"):
            raise NotImplementedError

        if testset_path is not None:
            # Only the features are needed here; Y.npy is no longer read.
            Xtest = load_sparse_matrix(os.path.join(testset_path, "X.npz"))
        elif X is not None:
            Xtest = X
        else:
            raise ValueError("predict_f_prob requires either testset_path or X")
        return self.glm.predict_proba(Xtest)