# Example #1
class NaiveBayes:
    """Bernoulli Naive Bayes classifier with an optional search on alpha.

    Attributes:
        lissage: Smoothing value passed as ``alpha`` when no hyperparameter
            search is requested.
        classif: Estimator built by ``entrainement`` (``recherche_hyper``
            temporarily stores its ``GridSearchCV`` object here).
        t_p: Predictions produced by the most recent ``prediction`` call.
    """

    def __init__(self):
        """Set the default smoothing value."""
        self.lissage = 1.1  # alpha used when the grid search is skipped

    def recherche_hyper(self, x_tr, t_tr):
        """Grid-search the smoothing parameter of Bernoulli Naive Bayes.

        The candidates for ``alpha`` are 0.00, 0.01, ..., 0.99 and each one
        is scored with a shuffled 10-fold cross-validation.

        Args:
            x_tr: Numpy array with the training data.
            t_tr: Numpy array with the training targets.

        Returns:
            Dictionary with the best hyperparameters found.
        """
        valeurs_liss = np.arange(0.0, 1.0, 0.01)
        p_grid = [{'alpha': valeurs_liss}]

        # Keyword arguments are required here: recent scikit-learn releases
        # no longer accept ``shuffle`` as a positional argument.
        cross_v = KFold(n_splits=10, shuffle=True)

        self.classif = GridSearchCV(estimator=BernoulliNB(),
                                    param_grid=p_grid, cv=cross_v)
        self.classif.fit(x_tr, t_tr)

        return self.classif.best_params_

    def entrainement(self, x_train, t_train, cherche_hyp):
        """Train a Bernoulli Naive Bayes model.

        Args:
            x_train: Numpy array with the training data.
            t_train: Numpy array with the training targets.
            cherche_hyp: Whether to search for the best hyperparameters
                before training; otherwise ``self.lissage`` is used.

        Returns:
            The fitted estimator.
        """
        if cherche_hyp:
            print(
                'Debut de l\'entrainement NB avec recherche d\'hyperparamètres')
            parametres = self.recherche_hyper(x_train, t_train)
        else:
            print(
                'Debut de l\'entrainement NB sans recherche d\'hyperparamètres')
            parametres = {'alpha': self.lissage}

        self.classif = BernoulliNB(**parametres)

        print('Paramètres utilisés pour l\'entraînement NB :', parametres)

        return self.classif.fit(x_train, t_train)

    def prediction(self, x_p):
        """Predict the targets of new samples.

        Args:
            x_p: Numpy array with the samples to classify.

        Returns:
            The predicted targets for ``x_p`` (also stored in ``self.t_p``).
        """
        self.t_p = self.classif.predict(x_p)
        return self.t_p
# Example #2
class SubspaceAlignedClassifier(object):
    """Class of classifiers based on Subspace Alignment.

    Methods contain the alignment itself, classifiers and general utilities.

    ``is_trained`` is a boolean attribute (False until ``fit`` completes).
    It used to be shadowed by a method of the same name that could never be
    called on an instance, so only the attribute is kept.
    """

    # Loss identifiers for which a classifier is created in __init__
    _LOSSES = ('logistic', 'quadratic', 'hinge', 'dtree', 'berno')

    def __init__(self, loss='logistic', l2=1.0, num_components=1):
        """Select a particular type of subspace aligned classifier.

        INPUT   (1) str 'loss': loss function / model for the classifier,
                    options: 'logistic', 'quadratic', 'hinge', 'dtree',
                    'berno' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value; stored
                    but not forwarded to the classifier (def: 1.0)
                (3) int 'num_components': number of transfer components to
                    maintain (def: 1)
        RAISES  NotImplementedError for any other value of 'loss'
        """
        self.loss = loss
        self.l2 = l2
        self.num_components = num_components

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # Decision tree
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # Bernoulli naive Bayes
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Maintain target principal component coefficients
        self.CZ = ''

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    def subspace_alignment(self, X, Z, num_components=1):
        """Compute subspace and alignment matrix.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
                (3) int 'num_components': number of components (def: 1)
        OUTPUT  (1) array 'V': alignment matrix (num_components by
                    num_components)
                (2) array 'CX': source principal component coefficients
                    (D features by num_components)
                (3) array 'CZ': target principal component coefficients
                    (D features by num_components)
        """
        # Only the feature dimensions are needed
        _, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute principal components
        CX = PCA(n_components=num_components, whiten=True).fit(X).components_.T
        CZ = PCA(n_components=num_components, whiten=True).fit(Z).components_.T

        # Aligned source components
        V = np.dot(CX.T, CZ)

        # Return transformation matrix and principal component coefficients
        return V, CX, CZ

    def fit(self, X, y, Z):
        """Fit/train a classifier on source data aligned to the target.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        _, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Subspace alignment (store target subspace for prediction time)
        V, CX, self.CZ = self.subspace_alignment(
            X, Z, num_components=self.num_components)

        # Map source data onto source components, then align to the target
        X = np.dot(np.dot(X, CX), V)

        # Every supported model is trained the same way (no sample weights)
        if self.loss not in self._LOSSES:
            raise NotImplementedError
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def _project(self, Z_, whiten):
        """Check new data and map it onto the stored target subspace."""
        _, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Check for need to whiten data beforehand
        if whiten:
            Z_ = st.zscore(Z_)

        return np.dot(Z_, self.CZ)

    def predict(self, Z_, whiten=False):
        """Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': whether to whiten new data (def: false)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        preds = self.clf.predict(self._project(Z_, whiten))

        # The least-squares model outputs real values; map their sign onto
        # the [0, 1] range
        if self.loss == 'quadratic':
            preds = (np.sign(preds) + 1) / 2.

        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    # add. by pb
    def predict_proba(self, Z_, whiten=False):
        """Predict class probabilities on new dataset.

        Relies on the wrapped classifier's own ``predict_proba`` and is
        therefore only usable with models that provide one.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': whether to whiten new data (def: false)
        OUTPUT  (1) array 'preds': output of the classifier's predict_proba
        """
        return self.clf.predict_proba(self._project(Z_, whiten))
# Example #3
class NaiveBayesModel():
    """Bernoulli Naive Bayes model predicting a cuisine from ingredients.

    Every sample is a dict with an "ingredients" list and, for labelled
    data, a "cuisine" entry. Samples are encoded as binary vectors holding
    one position per distinct ingredient of the training data.
    """

    def __init__(self, trainFilePath, testFilePath):
        """Load both JSON data sets and encode the training samples.

        Args:
            trainFilePath: Path of the JSON file with the training samples.
            testFilePath: Path of the JSON file with the test samples.
        """
        with open(trainFilePath) as train:
            self.trainData = json.load(train)
        with open(testFilePath) as test:
            self.testData = json.load(test)
        self.uniqueIngredients, self.numUnique = self.getUniqueIngredients(
            self.trainData)
        self.trainVectors, self.trainLabels = self.getTrainVectors(
            self.trainData)
        # Test vectors are built on demand by predict(): getTestVectors
        # reads the "cuisine" key, which unlabelled test samples lack.
        self.model = BernoulliNB()

    def getInfo(self):
        """Return the hyperparameters of the underlying model."""
        return self.model.get_params()

    def cleanData(self, data):
        """Return only the entries listing more than 5 ingredients.

        NOTE(review): the original comment said "less than 5 ingredients"
        are removed, but the condition has always been ``> 5``; the code's
        behaviour is kept.
        """
        return [entry for entry in data if len(entry['ingredients']) > 5]

    def getUniqueIngredients(self, trainData):
        """Map each distinct ingredient to a column index.

        Indices are assigned from 0 in order of first appearance, so the
        returned count equals the length of a feature vector.

        Args:
            trainData: Iterable of samples with an "ingredients" list.

        Returns:
            Tuple ``(ingredient -> index dict, number of ingredients)``.
        """
        ingredientsDictionary = {}
        for item in trainData:
            for ingredient in item["ingredients"]:
                if ingredient not in ingredientsDictionary:
                    ingredientsDictionary[ingredient] = len(
                        ingredientsDictionary)
        return ingredientsDictionary, len(ingredientsDictionary)

    def _encode(self, ingredients):
        """Return the binary feature vector of one list of ingredients."""
        featureVector = [0.0] * self.numUnique
        for ingredient in ingredients:
            uniqueId = self.uniqueIngredients.get(ingredient)
            # Ingredients never seen during training have no column
            if uniqueId is not None:
                featureVector[uniqueId] = 1.0
        return featureVector

    def getTrainVectors(self, trainData):
        """Encode labelled samples.

        Returns:
            Tuple ``(feature vectors, cuisine labels)`` in sample order.
        """
        trainLabels = [item["cuisine"] for item in trainData]
        trainVectors = [self._encode(item["ingredients"])
                        for item in trainData]
        return trainVectors, trainLabels

    def getTestVectors(self, testData):
        """Encode labelled test samples, ignoring unknown ingredients.

        Returns:
            Tuple ``(feature vectors, cuisine labels)`` in sample order.
        """
        testLabels = [item["cuisine"] for item in testData]
        testVectors = [self._encode(item["ingredients"])
                       for item in testData]
        return testVectors, testLabels

    def trainModel(self):
        """Fit the model on the encoded training data."""
        self.model.fit(self.trainVectors, self.trainLabels)

    def predict(self):
        """Predict the test samples and print the resulting accuracy."""
        if not hasattr(self, 'testVectors'):
            self.testVectors, self.testLabels = self.getTestVectors(
                self.testData)
        predictions = self.model.predict(self.testVectors)
        totalSamples = len(self.testLabels)
        numCorrect = sum(
            1 for prediction, trueLabel in zip(predictions, self.testLabels)
            if prediction == trueLabel)
        print("Accuracy on validation set: %.2f%%" %
              (100 * (numCorrect / totalSamples)))

    def predictOnSample(self, testVector):
        """Predict the cuisine of a single list of ingredients.

        Args:
            testVector: Iterable of ingredient names.

        Returns:
            The model's prediction array for that one sample.
        """
        return self.model.predict([self._encode(testVector)])
# Example #4
class ImportanceWeightedClassifier(object):
    """Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.

    ``is_trained`` is a boolean attribute (False until ``fit`` completes).
    It used to be shadowed by a method of the same name that could never be
    called on an instance, so only the attribute is kept.
    """

    # Loss identifiers for which a classifier is created in __init__
    _LOSSES = ('logistic', 'quadratic', 'hinge', 'dtree', 'berno')

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """Select a particular type of importance-weighted classifier.

        INPUT   (1) str 'loss': loss function / model for the weighted
                    classifier, options: 'logistic', 'quadratic', 'hinge',
                    'dtree', 'berno' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) str 'iwe': importance weight estimator, options: 'lr',
                    'nn', 'rg', 'kmm', 'kde' (def: 'lr')
                (4) boolean 'smoothing': whether to apply Laplace smoothing to
                    the nearest-neighbour importance-weight estimator
                    (def: True)
                (5) float 'clip': maximum allowable importance-weight value
                    for the nearest-neighbour estimator; if set to -1, then
                    the weights are not clipped (def: -1)
                (6) str 'kernel_type': what type of kernel to use for kernel
                    mean matching, options: 'diste', 'rbf' (def: 'rbf')
                (7) float 'bandwidth': kernel bandwidth parameter value for
                    kernel-based weight estimators (def: 1)
        RAISES  NotImplementedError for any other value of 'loss'
        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # Decision tree
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # Bernoulli naive Bayes
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    @staticmethod
    def _density_ratio(pT, pS):
        """Return pT / pS after checking both density arrays."""
        # NaNs in either density make the weights meaningless
        assert not np.any(np.isnan(pT))
        # A zero source density would additionally divide by zero
        assert not (np.any(np.isnan(pS)) or np.any(pS == 0))
        return pT / pS

    def iwe_ratio_gaussians(self, X, Z):
        """Estimate importance weights based on a ratio of Gaussians.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        _, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances (kept 2-D so a single feature also works)
        Si_X = np.atleast_2d(np.cov(X.T))
        Si_Z = np.atleast_2d(np.cov(Z.T))

        # Both covariance matrices must be positive-definite; regularize
        # with a growing ridge as long as at least one of them is not
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariate matrices not PSD.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10**regct))

                # Add regularization
                Si_X += np.eye(DX) * 10.**regct
                Si_Z += np.eye(DZ) * 10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Return the ratio of probabilities
        return self._density_ratio(pT, pS)

    def iwe_kernel_densities(self, X, Z):
        """Estimate importance weights based on kernel density estimation.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        _, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Density of the source samples under each domain's estimate
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Return the ratio of probabilities
        return self._density_ratio(pT, pS)

    def iwe_logistic_discrimination(self, X, Z):
        """Estimate importance weights based on logistic regression.

        A domain classifier (source = 0, target = 1) is cross-validated on
        the pooled data; the weight of a source sample is its predicted
        probability of belonging to the target domain.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Make domain-label variable
        y = np.concatenate((np.zeros(N), np.ones(M)), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # NOTE(review): scikit-learn's C is an inverse regularization
        # strength, yet 'l2' is forwarded unchanged - confirm intent.
        lr = LogisticRegression(C=self.l2)

        # Probabilities are needed here; the default method returns hard
        # 0/1 labels. Column 1 is the target-domain class.
        preds = cross_val_predict(lr, XZ, y, method='predict_proba')[:, 1]

        # Return predictions for source samples
        return preds[:N]

    def iwe_nearest_neighbours(self, X, Z):
        """Estimate importance weights based on nearest-neighbours.

        The weight of a source sample is the number of target samples that
        have it as nearest source sample, optionally smoothed and clipped.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        N, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute Euclidean distance between samples (N by M)
        d = cdist(X, Z, metric='euclidean')

        # Index of the nearest source sample for every target sample
        ix = np.argmin(d, axis=0)

        # Count target samples within each source Voronoi cell
        iw, _ = np.histogram(ix, np.arange(N + 1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """Estimate importance weights based on kernel mean matching.

        Solves the quadratic program
            min_b  1/2 b' K b - kappa' b
            s.t.   b >= 0  and  |sum(b) - N| <= sqrt(N)
        with K the source kernel matrix and
        kappa_i = N/M * sum_j k(x_i, z_j).

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Assert non-negative distances
        assert np.all(KXX >= 0)
        assert np.all(KXZ >= 0)

        # Compute kernels ('diste' keeps the raw distances)
        if self.kernel_type == 'rbf':
            # Radial basis functions
            KXX = np.exp(-KXX / (2 * self.bandwidth**2))
            KXZ = np.exp(-KXZ / (2 * self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = N / M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT, which minimizes 1/2 x'Qx + p'x subject to
        # Gx <= h; the linear term of the objective is therefore -kappa
        Q = matrix(KXX, tc='d')
        p = matrix(-KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)),
                                   -1 * np.ones((1, N)),
                                   -1. * np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate(
            (np.array([N / np.sqrt(N) + N], ndmin=2),
             np.array([N / np.sqrt(N) - N], ndmin=2),
             np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """Fit/train an importance-weighted classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        RAISES  NotImplementedError for an unknown estimator or loss
        """
        _, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Find importance-weights
        estimators = {
            'lr': self.iwe_logistic_discrimination,
            'rg': self.iwe_ratio_gaussians,
            'nn': self.iwe_nearest_neighbours,
            'kde': self.iwe_kernel_densities,
            'kmm': self.iwe_kernel_mean_matching,
        }
        if self.iwe not in estimators:
            raise NotImplementedError
        w = estimators[self.iwe](X, Z)

        print("self.loss=", str(self.loss))

        # Every supported model takes the weights as sample_weight
        if self.loss not in self._LOSSES:
            raise NotImplementedError
        self.clf.fit(X, y, sample_weight=w)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        _, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # The least-squares model outputs real values; map their sign onto
        # the [0, 1] range
        if self.loss == 'quadratic':
            preds = (np.sign(preds) + 1) / 2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()
# Example #5
class TransferComponentClassifier(object):
    """Class of classifiers based on Transfer Component Analysis.

    Methods contain component analysis and general utilities.

    ``is_trained`` is a boolean attribute (False until ``fit`` completes).
    It used to be shadowed by a method of the same name that could never be
    called on an instance, so only the attribute is kept.
    """

    # Loss identifiers for which a classifier is created in __init__
    _LOSSES = ('logistic', 'quadratic', 'hinge', 'dtree', 'berno')

    def __init__(self, loss='logistic', l2=1.0, mu=1.0, num_components=1,
                 kernel_type='rbf', bandwidth=1.0, order=2.0):
        """Select a particular type of transfer component classifier.

        INPUT   (1) str 'loss': loss function / model for the classifier,
                    options: 'logistic', 'quadratic', 'hinge', 'dtree',
                    'berno' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value; stored
                    but not forwarded to the classifier (def: 1.0)
                (3) float 'mu': trade-off parameter (def: 1.0)
                (4) int 'num_components': number of transfer components to
                    maintain (def: 1)
                (5) str 'kernel_type': type of kernel to use, see 'kernel'
                    (def: 'rbf')
                (6) float 'bandwidth': kernel bandwidth for transfer component
                    analysis (def: 1.0)
                (7) float 'order': order of polynomial for kernel (def: 2.0)
        RAISES  NotImplementedError for any other value of 'loss'
        """
        self.loss = loss
        self.l2 = l2
        self.mu = mu
        self.num_components = num_components

        self.kernel_type = kernel_type
        self.bandwidth = bandwidth
        self.order = order

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        elif self.loss == 'dtree':
            # Decision tree
            self.clf = tree.DecisionTreeClassifier()
        elif self.loss == 'berno':
            # Bernoulli naive Bayes
            self.clf = BernoulliNB()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Maintain source and transfer data for computing kernels
        self.XZ = ''

        # Maintain transfer components
        self.C = ''

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    def kernel(self, X, Z, type='rbf', order=2, bandwidth=1.0):
        """Compute kernel for given data set.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'Z': data set (M samples by D features)
                (3) str 'type': type of kernel, options: 'linear',
                    'polynomial', 'rbf', 'sigmoid' (def: 'rbf')
                (4) float 'order': order of polynomial to use for the
                    polynomial kernel (def: 2)
                (5) float 'bandwidth': kernel bandwidth (def: 1.0)
        OUTPUT  (1) array: kernel matrix (N by M)
        RAISES  NotImplementedError for any other value of 'type'
        """
        _, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Select type of kernel to compute
        if type == 'linear':
            # Linear kernel is data outer product
            return np.dot(X, Z.T)
        elif type == 'polynomial':
            # Polynomial kernel is an exponentiated data outer product
            return (np.dot(X, Z.T) + 1)**order
        elif type == 'rbf':
            # Radial basis function kernel
            return np.exp(-cdist(X, Z) / (2.*bandwidth**2))
        elif type == 'sigmoid':
            # Sigmoidal kernel
            return 1./(1 + np.exp(np.dot(X, Z.T)))
        else:
            raise NotImplementedError

    def transfer_component_analysis(self, X, Z):
        """Transfer Component Analysis.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
        OUTPUT  (1) array 'C': transfer components (N+M samples
                    by num_components)
                (2) array 'K': kernel matrix of the pooled source and target
                    data (N+M by N+M)
        """
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute kernel matrix with the same settings predict() uses
        XZ = np.concatenate((X, Z), axis=0)
        K = self.kernel(XZ, XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Ensure positive-definiteness
        if not is_pos_def(K):
            print('Warning: covariate matrices not PSD.')

            regct = -6
            while not is_pos_def(K):
                print('Adding regularization: ' + str(10**regct))

                # Add regularization
                K += np.eye(N + M)*10.**regct

                # Increment regularization counter
                regct += 1

        # Normalization matrix
        L = np.vstack((np.hstack((np.ones((N, N))/N**2,
                                  -1*np.ones((N, M))/(N*M))),
                       np.hstack((-1*np.ones((M, N))/(N*M),
                                  np.ones((M, M))/M**2))))

        # Centering matrix
        H = np.eye(N + M) - np.ones((N + M, N + M)) / float(N + M)

        # Matrix Lagrangian objective function: (I + mu*K*L*K)^{-1}*K*H*K
        J = np.dot(np.linalg.inv(np.eye(N + M) +
                   self.mu*np.dot(np.dot(K, L), K)),
                   np.dot(np.dot(K, H), K))

        # Eigenvector decomposition as solution to trace minimization
        _, C = eigs(J, k=self.num_components)

        # Discard imaginary numbers (possible computation issue)
        return np.real(C), K

    def fit(self, X, y, Z):
        """Fit/train a classifier on data mapped onto transfer components.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Assert correct number of components for given dataset
        assert self.num_components <= N + M - 1

        # Maintain source and target data for later kernel computations
        self.XZ = np.concatenate((X, Z), axis=0)

        # Transfer component analysis
        self.C, K = self.transfer_component_analysis(X, Z)

        # Map source data onto transfer components
        X = np.dot(K[:N, :], self.C)
        print("self.loss:", str(self.loss))

        # Every supported model is trained the same way (no sample weights)
        if self.loss not in self._LOSSES:
            raise NotImplementedError
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def _map_to_components(self, Z_):
        """Check new data and map it onto the transfer components."""
        _, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D

        # Compute kernel between the new data and the training data
        K = self.kernel(Z_, self.XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        return np.dot(K, self.C)

    def predict(self, Z_):
        """Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        preds = self.clf.predict(self._map_to_components(Z_))

        # The least-squares model outputs real values; map their sign onto
        # the [0, 1] range
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    # add. by pb
    def predict_proba(self, Z_, whiten=False):
        """Predict class probabilities on new dataset.

        Relies on the wrapped classifier's own ``predict_proba`` and is
        therefore only usable with models that provide one.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
                (2) boolean 'whiten': unused, kept for call compatibility
        OUTPUT  (1) array 'preds': output of the classifier's predict_proba
        """
        return self.clf.predict_proba(self._map_to_components(Z_))
# Column 1 of the training matrix holds the class label, columns 2-12 the
# features. The labels stay a column vector here (as before) and are
# flattened only when handed to scikit-learn, which expects 1-D targets.
featureColumns = list(range(2, 13))
arrTrainLabels = arrTrain[:, 1:2].astype(int)
arrTrainFeatures = arrTrain[:, featureColumns]

# alpha=1.0 is scikit-learn's default smoothing, spelled out so that all
# four constructor options are visible in one place.
model = BernoulliNB(
    alpha=1.0,
    binarize=0.0,
    class_prior=None,
    fit_prior=True,
).fit(arrTrainFeatures, arrTrainLabels.ravel())

# DataFrame.as_matrix() no longer exists in current pandas; .values yields
# the same ndarray.
arrTest = dfTest.values.astype(int)

# Only the first 100 test rows are evaluated
arrTestLabels = arrTest[0:100, 1]
arrTestFeatures = arrTest[0:100, featureColumns]

pred_proba = model.predict_proba(arrTestFeatures)

pred = model.predict(arrTestFeatures)

# Print every true label next to its prediction
for trueLabel, predictedLabel in zip(arrTestLabels, pred):
    print(trueLabel, "-", predictedLabel)

pred_score = model.score(arrTestFeatures, arrTestLabels)
# Example #7
def _binary_metrics(cm):
    """Return (accuracy, precision, recall) in percent for a 2x2 matrix.

    Precision and recall are computed for the class in row/column 0, with
    rows read as true labels and columns as predictions (scikit-learn's
    ``confusion_matrix`` layout).
    """
    total = cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1]
    accuracy = ((cm[0, 0] + cm[1, 1]) / total) * 100
    precision = (cm[0, 0] / (cm[0, 0] + cm[1, 0])) * 100
    recall = (cm[0, 0] / (cm[0, 0] + cm[0, 1])) * 100
    return accuracy, precision, recall


# Report for the predictions computed just before this section
cm = confusion_matrix(Yset_test, y_pred)
Accuracy, Precision, Recall = _binary_metrics(cm)
print("Confusion Matrix: \n%s " % (cm))
print("Accuracy: %.2f%% ; Precision: %.2f%% ; Recall: %.2f%%" %
      (Accuracy, Precision, Recall))

# NOTE(review): a plotting call was truncated here; only its argument
# title='Confusion matrix, Bernoulli NB important features' survived.
# Restore the call once the helper it used is known.


rndF = RandomForestClassifier()
y_pred = rndF.fit(Xset_train, Yset_train).predict(Xset_test)
# Errors are counted on the test set, so the total is the test-set size
print(
    "RandomForestClassifier: Number of mislabeled points out of a total %d points : %d"
    % (Xset_test.shape[0], (Yset_test != y_pred).sum()))

cm = confusion_matrix(Yset_test, y_pred)
Accuracy, Precision, Recall = _binary_metrics(cm)
print("Confusion Matrix: \n%s " % (cm))
print("Accuracy: %.2f%% ; Precision: %.2f%% ; Recall: %.2f%%" %
      (Accuracy, Precision, Recall))
import numpy as np
from sklearn.naive_bayes import BernoulliNB
# Toy training set: three samples with four integer features each
X = np.array([[1, 2, 3, 4], [1, 3, 4, 4], [2, 4, 5, 5]])
y = np.array([1, 1, 2])

# Samples used for every prediction call below
new_samples = [[3, 4, 5, 4], [1, 3, 5, 6]]

clf = BernoulliNB(alpha=1, class_prior=None, binarize=2.0, fit_prior=False)

# fit(X, y, sample_weight): X holds the feature vectors, y the class labels
# and sample_weight an optional array of per-sample weights
clf.fit(X, y, sample_weight=None)

# class_count_: number of training samples seen for each class
print(clf.class_count_)

# feature_count_: per class, how often each feature occurred; array of
# shape (n_classes, n_features)
print(clf.feature_count_)

# get_params(deep=True): dict of the estimator's parameter names and values
print(clf.get_params(deep=True))

# predict_log_proba(X): log of the predicted probability of every class
print(clf.predict_log_proba(new_samples))

# predict_proba(X): predicted probability of every class
print(clf.predict_proba(new_samples))

# score(X, y, sample_weight=None): mean accuracy on the given samples
print(clf.score(new_samples, [1, 1]))

# set_params(**params): change estimator parameters
clf.set_params(alpha=2.0)