Example #1
0
class Classifier(object):
    """Wrap a scikit-learn estimator used as a classifier for codon reassignment.

    ``method`` selects the estimator: 'rf' (RandomForestClassifier), 'svc'
    (svm.SVC with probability estimates), 'etc' (ExtraTreesClassifier) or
    'gnb' (GaussianNB).  The wrapper adds optional feature scaling,
    persistence through joblib, scoring and plotting helpers.

    NOTE(review): when ``scale`` is true every matrix handed to the wrapper is
    standardized on its own with ``preprocessing.scale`` (its own mean and
    variance), not with statistics remembered from the training data.
    """

    def __init__(self, method, classifier_spec=None, scale=False, n_estimators=1000):
        """Build the underlying estimator.

        method: one of 'rf', 'svc', 'etc', 'gnb'.
        classifier_spec: extra keyword arguments forwarded to the estimator
            constructor (ignored for 'gnb'); ``None`` means no extras.
        scale: standardize feature matrices before fitting/predicting.
        n_estimators: number of trees for 'rf' and 'etc'.

        Raises NotImplementedError for any other ``method``.
        """
        # Use a fresh dict per call: a ``{}`` default would be shared by every call.
        spec = {} if classifier_spec is None else classifier_spec
        if method == 'rf':
            self.clf = RandomForestClassifier(
                n_estimators=n_estimators, n_jobs=-1, max_leaf_nodes=1000, **spec)
        elif method == 'svc':
            self.clf = svm.SVC(probability=True, **spec)
        elif method == 'etc':
            self.clf = ExtraTreesClassifier(
                n_estimators=n_estimators, **spec)
        elif method == 'gnb':
            self.clf = GaussianNB()
        else:
            raise NotImplementedError(
                "The method you chose (%s) is not implemented" % method)
        self.method = method
        self.trained = False
        self.scale = scale

    def _scaled(self, X):
        """Return X standardized when scaling is enabled, otherwise X itself."""
        if self.scale:
            return preprocessing.scale(X)
        return X

    def _check_trained(self):
        """Raise ValueError unless train() has been called on this instance."""
        if not self.trained:
            raise ValueError("Classifier is not trained")

    @classmethod
    def load_from_file(cls, loadfile):
        """Load a model from a file written by save_model().

        Returns the loaded object, or None (after printing a message) when the
        file cannot be opened.  Every other exception propagates unchanged.
        """
        try:
            # joblib.load unpickles the file: only load files from a trusted source.
            return joblib.load(loadfile)
        except IOError:
            print('Problem with file %s, can not open it' % loadfile)
        return None

    def save_model(self, outfile):
        """Save this wrapper (estimator and stored training data) to a file."""
        joblib.dump(self, outfile)

    def train(self, X=None, Y=None):
        """Fit the estimator on X, Y and remember the (scaled) training data."""
        X = self._scaled(X)
        self.clf.fit(X, Y)
        self.X = X
        self.y = Y
        self.trained = True

    @classmethod
    def from_classifier(cls, clfier):
        """Return a new wrapper sharing every attribute of ``clfier``."""
        newclf = cls(clfier.method, {}, clfier.scale)
        # The freshly built estimator is replaced by clfier's own state here.
        newclf.__dict__.update(clfier.__dict__)
        return newclf

    def get_score(self, X, Y):
        """Return the estimator's score() for X against the labels Y."""
        return self.clf.score(self._scaled(X), Y)

    def predict(self, X):
        """Predict values for X; raises ValueError when not trained."""
        self._check_trained()
        return self.clf.predict(self._scaled(X))

    def predict_proba(self, X):
        """Return the probability of each class for X.

        Applies the same trained check and scaling as predict(), so both
        methods see identically prepared features.
        """
        self._check_trained()
        return self.clf.predict_proba(self._scaled(X))

    def feature_importance(self, outfile="importance.png", features_list=None):
        """Plot the feature importances of a tree ensemble ('rf' or 'etc').

        Writes the per-tree importances to ``outfile + ".txt"`` and a bar
        chart (sorted by decreasing importance, with the standard deviation
        across trees as error bars) to ``outfile``.

        features_list: optional feature names used as tick labels; when given
            its length must equal the number of features.

        Raises NotImplementedError for other methods and ValueError when
        ``features_list`` has the wrong length.
        """
        features_list = [] if features_list is None else features_list
        if self.method not in ('rf', 'etc'):
            raise NotImplementedError(
                "Not supported for classifier other than Ensembl Tree")

        importances = self.clf.feature_importances_
        if len(features_list) > 0 and len(features_list) != len(importances):
            raise ValueError("Number of features does not fit!")

        order = np.argsort(importances)[::-1]
        # Size the chart from the importances, not from the optional names:
        # with no names the bar positions would otherwise be empty.
        n_feats = len(importances)
        per_tree = np.array(
            [tree.feature_importances_ for tree in self.clf.estimators_])
        np.savetxt(outfile + ".txt", per_tree, delimiter=',', fmt='%1.3e')
        std = np.std(per_tree, axis=0)

        plt.figure()
        plt.title("Feature importances")
        plt.bar(range(n_feats), importances[order], width=0.5, color="b",
                yerr=std[order], align="center")
        if len(features_list) > 0:
            ordered_names = np.asarray(features_list)[order]
            plt.xticks(range(n_feats), ordered_names, rotation='vertical')
        plt.xlim([-1, n_feats])
        plt.margins(0.2)

        plt.subplots_adjust(bottom=0.15)
        plt.savefig(outfile, bbox_inches='tight')

    def cross_validation(self, X, Y, X_test=None, Y_test=None, tsize=0.3):
        """Train on one part of the data and evaluate on a held-out part.

        When no test set is given, X/Y are split with train_test_split using
        ``tsize`` as the test fraction; otherwise X/Y are the training set.
        Prints the statistics of get_stat() and returns
        ``(Y_predicted, score)`` for the test set.
        """
        if X_test is None or Y_test is None:
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=tsize)
        else:
            X_train, Y_train = X, Y
        self.train(X_train, Y_train)
        Y_predicted = self.predict(X_test)
        self.get_stat(X_test, Y_test)
        return Y_predicted, self.get_score(X_test, Y_test)

    def plot_precision_recall(self, X_test, y_test, infos="", outfile="precision_recall.png"):
        """Plot the precision-recall curve for X_test/y_test into ``outfile``.

        Raises ValueError when the classifier is not trained.
        """
        self._check_trained()
        X_test = self._scaled(X_test)
        try:
            y_score = self.clf.decision_function(X_test)
        except AttributeError:
            # Estimators without decision_function: use the second
            # predict_proba column as the score.
            y_score = self.clf.predict_proba(X_test)[:, 1]
        precision, recall, _ = precision_recall_curve(y_test, y_score)
        average_precision = average_precision_score(
            y_test, y_score, average="micro")
        # Plot Precision-Recall curve
        plt.clf()
        plt.plot(recall, precision,
                 label='Average Precision-recall curve (area = {0:0.2f})'
                 ''.format(average_precision))
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall curve for %s (%s)' %
                  (self.method, infos))
        plt.legend(loc="lower right")
        plt.savefig(outfile)

    def get_stat(self, X_test, y_test):
        """Print a list of scores for the current classifier.

        Reports Brier loss, precision, recall, F1 and ROC AUC.  Predictions
        and scores are both computed from the same prepared features.
        Raises ValueError when the classifier is not trained.
        """
        self._check_trained()
        X_scaled = self._scaled(X_test)
        y_pred = self.clf.predict(X_scaled)
        if hasattr(self.clf, "predict_proba"):
            prob_pos = self.clf.predict_proba(X_scaled)[:, 1]
        else:  # use decision function, rescaled to [0, 1]
            prob_pos = self.clf.decision_function(X_scaled)
            prob_pos = (prob_pos - prob_pos.min()) / \
                (prob_pos.max() - prob_pos.min())

        clf_score = brier_score_loss(y_test, prob_pos)
        print("%s:" % self.method)
        print("\tBrier: %1.3f" % (clf_score))
        print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
        print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
        print("\tF1: %1.3f" % f1_score(y_test, y_pred))
        print("\tROC AUC score: %1.3f\n" % roc_auc_score(y_test, prob_pos))
# Fit the estimator bound to `clf` on X, y (all three names are defined
# outside this excerpt).
clf.fit(X, y)
# NOTE(review): this bare SVC(...) expression builds a second estimator and
# discards it; it reads like the pasted repr of the fitted model rather than
# intended code -- confirm before relying on it.
SVC(C=1.0,
    cache_size=200,
    class_weight=None,
    coef0=0.0,
    degree=3,
    gamma='auto',
    kernel='linear',
    max_iter=-1,
    probability=False,
    random_state=None,
    shrinking=True,
    tol=0.001,
    verbose=False)
# Predicted label and decision_function output for the single sample [-0.8, -1].
print(clf.predict([[-0.8, -1]]))
print(clf.decision_function([[-0.8, -1]]))
# Parameters reported by the estimator itself.
print(clf.get_params())
# score() of that same one-sample batch against label 1, then against label 2.
print(clf.score([[-0.8, -1]], [1]))
print(clf.score([[-0.8, -1]], [2]))

########################################################################################################################
########################################################################################################################

from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import mglearn

# Generate a synthetic dataset with make_blobs (fixed seed 42) and fit a
# LinearSVC with default settings on it.
X, y = make_blobs(random_state=42)
linear_svm = LinearSVC().fit(X, y)
# Scatter the first two feature columns; y is passed as the third argument
# (presumably the labels used to group markers -- confirm against mglearn).
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
# NOTE(review): `plt` is not imported in this excerpt -- presumably
# matplotlib.pyplot; confirm.
plt.xlabel("Feature 0")
Example #3
0
                         learning_rate_init=1e-3,
                         learning_rate='adaptive',
                         tol=1e-4,
                         max_iter=200)
 elif i == 4:
     clf = LinearSVC(penalty='l2', random_state=0, tol=1e-4)
 skf_accuracy1 = []
 skf_accuracy2 = []
 for train, test in skf.split(X, y):
     clf.fit(X[train], y[train])
     if n_classes.size < 3:
         skf_accuracy1.append(
             roc_auc_score(y[test],
                           clf.predict_proba(X[test])[:,
                                                      1] if i != 4
                           else clf.decision_function(X[test]),
                           average='micro'))
         clf.fit(X_new[train], y[train])
         skf_accuracy2.append(
             roc_auc_score(
                 y[test],
                 clf.predict_proba(X_new[test])[:, 1]
                 if i != 4 else clf.decision_function(X_new[test]),
                 average='micro'))
     else:
         ytest_one_hot = label_binarize(y[test], n_classes)
         skf_accuracy1.append(
             roc_auc_score(ytest_one_hot,
                           clf.predict_proba(X[test]) if i != 4 else
                           clf.decision_function(X[test]),
                           average='micro'))
Example #4
0
# Evaluate the classifier currently bound to `clf` (fitted outside this
# excerpt) on NMF1_test.
# NOTE(review): svc_score is produced by predict(), exactly like
# svc_prediction, so evaluate() receives hard labels here while the logistic
# regression sections below pass decision_function scores -- confirm intent.
svc_prediction = clf.predict(NMF1_test)
svc_score = clf.predict(NMF1_test)
evaluate(test_label, svc_score)

## Same evaluation with GaussianNB on the min_df=5 features (NMF2_*)
clf = GaussianNB().fit(NMF2_train, train_label)
svc_prediction = clf.predict(NMF2_test)
svc_score = clf.predict(NMF2_test)
evaluate(test_label, svc_score)

## (h) Logistic regression on the LSI features, for min_df=2 and min_df=5
print('(h)------------------------------------------')
print('Logistic Regression with min_df=2: \n')
clf = linear_model.LogisticRegression().fit(LSI1_train, train_label)
svc_prediction = clf.predict(LSI1_test)
# decision_function scores (not hard labels) are what evaluate() receives.
svc_score = clf.decision_function(LSI1_test)
evaluate(test_label, svc_score)

print('Logistic Regression with min_df=5: \n')
clf = linear_model.LogisticRegression().fit(LSI2_train, train_label)
svc_prediction = clf.predict(LSI2_test)
svc_score = clf.decision_function(LSI2_test)
evaluate(test_label, svc_score)

## (i) Sweep the regularization parameter C of an L1-penalized logistic regression
cross_list = []
for c in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    # Despite its name, linear_svc is a LogisticRegression model.
    linear_svc = linear_model.LogisticRegression(penalty='l1', C=c).fit(
        LSI1_train, train_label)
    # NOTE(review): the model is fitted a second time on the same data here;
    # this looks redundant.
    linear_svc.fit(LSI1_train, train_label)
    # NOTE(review): the excerpt ends here; the loop body may continue, and
    # cross_list is not filled within the visible lines.
    svc_prediction = linear_svc.predict(LSI1_test)
Example #5
0
class Learning(object):
    """
    Train, persist and evaluate a classifier (SVM, SGD or GaussianNB).

    usage:
        >> from feelit.features import Learning
        >> learner = Learning(loglevel=logging.INFO)
        >> learner.set(X_train, y_train, feature_name)
        >>
        >> scores = {}
        >> for C in Cs:
        >>  for gamma in gammas:
        >>      score = learner.kfold(kfolder, classifier='SVM',
        >>                          kernel='rbf', prob=False,
        >>                          C=C, scaling=True, gamma=gamma)
        >>      scores.update({(C, gamma): score})
        >>
        >> best_C, best_gamma = max(scores.items(), key=operator.itemgetter(1))[0]
        >> learner.train(classifier='SVM', kernel='rbf', prob=True, C=best_C, gamma=best_gamma,
        >>              scaling=True, random_state=np.random.RandomState(0))
        >> results = learner.predict(X_test, yb_test, weighted_score=True, X_predict_prob=True, auc=True)
    """

    def __init__(self, X=None, y=None, **kwargs):
        """
        options:
            loglevel: level passed to logging.basicConfig; default logging.ERROR
            scaling: True/False; initial value of the scaling flag
        """
        loglevel = kwargs.get('loglevel', logging.ERROR)
        logging.basicConfig(format='[%(levelname)s][%(name)s] %(message)s', level=loglevel)
        self.logger = logging.getLogger(__name__+'.'+self.__class__.__name__)

        self.X = X
        self.y = y
        self.kfold_results = []
        self.Xs = {}
        self.ys = {}
        self.scaling = kwargs.get('scaling', False)

    def set(self, X, y, feature_name):
        """Replace the stored training data and remember its feature name."""
        self.X = X
        self.y = y
        self.feature_name = feature_name

    def train(self, **kwargs):
        """Train on the stored X/y; options are the same as _train."""
        self._train(self.X, self.y, **kwargs)

    def _train(self, X_train, y_train, **kwargs):
        """
        required:
            X_train, y_train

        options:
            classifier: 'SVM', 'SGD', 'GaussianNB'
            with_mean: True/False
            with_std: True/False
            scaling: True/False
            prob: True/False. Estimate probability during training
            random_state: seed, RandomState instance or None; for probability estimation
            kernel: 'rbf', ...
            C: float; svm parameters
            gamma: float; svm parameter, default 1/num_features
            shuffle: True/False; for SGD

        raises:
            ValueError for any other classifier name
        """
        ## setup a classifier
        classifier = kwargs.get("classifier", "SVM")

        self.logger.debug("%d samples x %d features in X_train" % ( X_train.shape[0], X_train.shape[1] ))
        self.logger.debug("%d samples in y_train" % ( y_train.shape[0] ))

        with_mean = kwargs.get('with_mean', True)
        with_std = kwargs.get('with_std', True)

        # NOTE: sparse matrices cannot be centered; callers passing a sparse
        # X_train with scaling=True should also pass with_mean=False.
        self.scaling = kwargs.get('scaling', False)
        if self.scaling:
            self.scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
            ## apply scaling on X
            self.logger.debug("applying a standard scaling with_mean=%d, with_std=%d" % (with_mean, with_std))
            X_train = self.scaler.fit_transform(X_train)

        ## determine whether using predict or predict_proba
        self.prob = kwargs.get("prob", False)
        random_state = kwargs.get("random_state")

        if classifier == "SVM":
            ## setup a svm classifier
            kernel = kwargs.get("kernel", "rbf")
            ## cost: default 1
            C = kwargs.get("C", 1.0)
            ## gamma: default (1/num_features)
            num_features = X_train.shape[1]
            gamma = kwargs["gamma"] if "gamma" in kwargs else (1.0/num_features)
            self.clf = svm.SVC(C=C, gamma=gamma, kernel=kernel, probability=self.prob, random_state=random_state)
            self.params = "%s_%s C=%f gamma=%f probability=%d" % (classifier, kernel, C, gamma, self.prob)

        elif classifier == "SGD":
            shuffle = kwargs.get('shuffle', True)
            if self.prob:
                # the "log" loss is chosen when probabilities are requested
                self.clf = SGDClassifier(loss="log", shuffle=shuffle)
            else:
                self.clf = SGDClassifier(shuffle=shuffle)

            self.params = "%s_%s" % (classifier, 'linear')

        elif classifier == "GaussianNB":
            self.clf = GaussianNB()

            self.params = "%s_%s" % (classifier, 'NB')

        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("currently only support SVM, SGD and GaussianNB classifiers")

        self.logger.debug(self.params)
        self.clf.fit(X_train, y_train)

    def dump_model(self, file_name):
        """Pickle the current classifier to file_name."""
        try:
            # pickle needs a binary file; `with` closes it even when dumping fails
            with open(file_name, "wb") as handle:
                pickle.dump(self.clf, handle)
        except ValueError:
            self.logger.error("failed to dump %s" % (file_name))

    def dump_scaler(self, file_name):
        """Pickle the fitted scaler to file_name; only warns when scaling is off."""
        try:
            if self.scaling:
                with open(file_name, "wb") as handle:
                    pickle.dump(self.scaler, handle)
            else:
                self.logger.warning("scaler doesn't exist")
        except ValueError:
            self.logger.error("failed to dump %s" % (file_name))

    def load_model(self, file_name):
        """Replace the classifier with the object pickled in file_name."""
        try:
            # SECURITY: unpickling can execute arbitrary code; only load trusted files.
            with open(file_name, "rb") as handle:
                self.clf = pickle.load(handle)
        except ValueError:
            self.logger.error("failed to load %s" % (file_name))

    def load_scaler(self, file_name):
        """Load a pickled scaler from file_name and turn scaling on when it is truthy."""
        try:
            # SECURITY: unpickling can execute arbitrary code; only load trusted files.
            with open(file_name, "rb") as handle:
                self.scaler = pickle.load(handle)
            if self.scaler:
                self.scaling = True
        except ValueError:
            self.logger.error("failed to load %s" % (file_name))

    def _positive_proba(self, X_test):
        """Return the second predict_proba column for X_test.

        Raises ValueError when the classifier cannot estimate probabilities.
        """
        if not hasattr(self.clf, 'predict_proba'):
            raise ValueError("the classifier cannot estimate probabilities; "
                             "train it with prob=True")
        return self.clf.predict_proba(X_test)[:, 1]

    def predict(self, X_test, y_test, **kwargs):
        '''
        return dictionary of results

        required:
            X_test, y_test (y_test must offer .shape and .tolist())

        options (each adds the entry of the same name when truthy):
            score: classifier score on X_test/y_test
            weighted_score: class-balanced accuracy, see _weighted_score
            y_predict: predicted labels
            X_predict_prob: second column of predict_proba
            auc: area under the ROC curve built from that column
            decision_value: output of decision_function

        raises:
            ValueError when X_predict_prob or auc is requested from a
            classifier that cannot estimate probabilities
        '''

        if self.scaling:
            X_test = self.scaler.transform(X_test)

        self.logger.info('y_test = %s', str(y_test.shape))
        y_predict = self.clf.predict(X_test)

        # Probabilities are only computed when an option actually needs them.
        prob_pos = None
        if kwargs.get('X_predict_prob') or kwargs.get('auc'):
            prob_pos = self._positive_proba(X_test)

        results = {}
        if kwargs.get('score'):
            results.update({'score': self.clf.score(X_test, y_test.tolist())})
            self.logger.info('score = %f', results['score'])

        if kwargs.get('weighted_score'):
            results.update({'weighted_score': self._weighted_score(y_test.tolist(), y_predict)})
            self.logger.info('weighted_score = %f', results['weighted_score'])

        if kwargs.get('y_predict'):
            results.update({'y_predict': y_predict})
            # y_predict is an array, so it is logged with %s rather than %f
            self.logger.info('y_predict = %s', str(results['y_predict']))

        if kwargs.get('X_predict_prob'):
            results.update({'X_predict_prob': prob_pos})
            self.logger.info('X_predict_prob = %s', str(results['X_predict_prob']))

        if kwargs.get('auc'):
            fpr, tpr, thresholds = roc_curve(y_test, prob_pos)
            results.update({'auc': auc(fpr, tpr)})
            self.logger.info('auc = %f', results['auc'])

        if kwargs.get('decision_value'):
            results.update({'decision_value': self.clf.decision_function(X_test)})
            self.logger.debug('decision_value = %s', str(results['decision_value']))

        return results

    def _weighted_score(self, y_test, y_predict):
        """
        return:
            accuracy in which each class (+1 / -1) is re-weighted so that both
            classes carry the same total weight

        When only one of the two classes occurs in y_test the result is the
        plain accuracy over that class.

        raises:
            ValueError when y_test contains neither +1 nor -1
        """
        n_pos = sum(1 for val in y_test if val == 1)
        n_neg = sum(1 for val in y_test if val == -1)
        if n_pos == 0 and n_neg == 0:
            raise ValueError("y_test contains no +1/-1 labels")

        smaller = min(n_pos, n_neg)
        if smaller == 0:
            # a single class is present: nothing to balance
            weight_pos = weight_neg = 1.0
        else:
            # float() keeps this a true division on Python 2 as well
            weight_pos = 1.0/(float(n_pos)/smaller)
            weight_neg = 1.0/(float(n_neg)/smaller)

        # a correct prediction earns the weight of its true class
        weighted_sum = 0.0
        for truth, guess in zip(y_test, y_predict):
            if truth == guess:
                weighted_sum += weight_pos if truth == 1 else weight_neg

        return weighted_sum / (n_pos * weight_pos + n_neg * weight_neg)

    def kfold(self, kfolder, **kwargs):
        """
        return:
            mean score for kfold training

        required:
            kfolder: iterable of (train_index, test_index) pairs, e.g. a
                     sklearn.cross_validation.KFold object

        options:
            same as _train

        raises:
            ValueError when kfolder yields no folds
        """
        sum_score = 0.0
        n_folds = 0
        for (i, (train_index, test_index)) in enumerate(kfolder):

            self.logger.info("cross-validation fold %d: train=%d, test=%d" % (i, len(train_index), len(test_index)))

            X_train, X_test = self.X[train_index], self.X[test_index]
            y_train, y_test = self.y[train_index], self.y[test_index]
            self._train(X_train, y_train, **kwargs)

            score = self.predict(X_test, y_test, score=True)['score']
            self.logger.info('score = %.5f' % (score))
            sum_score += score
            # folds are counted here so that generators without len() work too
            n_folds += 1

        if n_folds == 0:
            raise ValueError("kfolder yielded no folds")

        mean_score = sum_score/n_folds
        if 'C' in kwargs:
            self.logger.info('*** C = %f, mean_score = %f' % (kwargs['C'], mean_score))
        else:
            # C is optional (e.g. for GaussianNB), so it may be absent
            self.logger.info('*** mean_score = %f' % (mean_score))
        return mean_score
Example #6
0
class SoundClassifier:
    """Uniform front end over several scikit-learn classifiers.

    The constructor picks the estimator from a short algorithm name; the
    remaining methods forward to that estimator for fitting, predicting,
    scoring and a few diagnostic print-outs.
    """

    def __init__(self, algorithm):
        """Create the estimator selected by ``algorithm``.

        Accepted names: 'knn', 'linear', 'linearMulti', 'sgd', 'decisionTree',
        'randomForest', 'gradientBoosting', 'svm', 'neuralNetworks', 'gmm',
        'gnb'.

        Raises ValueError for any other name, instead of leaving the instance
        without a classifier.
        """
        if algorithm == 'knn':
            self._classifier = KNeighborsClassifier(n_neighbors=6)
        elif algorithm == 'linear':
            self._classifier = LogisticRegression(
            )  # alternative => (C=100) / (C=0.01)
        elif algorithm == 'linearMulti':
            self._classifier = LinearSVC()
        elif algorithm == 'sgd':
            self._classifier = SGDClassifier(random_state=0)
        elif algorithm == 'decisionTree':
            self._classifier = DecisionTreeClassifier(
                random_state=0)  # alternative => max_depth=4
        elif algorithm == 'randomForest':
            self._classifier = RandomForestClassifier(n_estimators=10,
                                                      max_features=1300,
                                                      max_depth=8,
                                                      random_state=0)
        elif algorithm == 'gradientBoosting':
            self._classifier = GradientBoostingClassifier(
                random_state=0
            )  # alternative => max_depth=1, learning_rate=0.01
        elif algorithm == 'svm':
            self._classifier = SVC(
                C=1.3, kernel='rbf', gamma='scale'
            )  # alternative => C=1000, gamma=1000. Also pre-process data
        elif algorithm == 'neuralNetworks':
            self._classifier = MLPClassifier(
                random_state=0
            )  # alternative => max_iter=1000, alpha=1. Also pre-process data
        elif algorithm == 'gmm':
            # NOTE(review): despite the 'gmm' key this builds a Gaussian
            # *process* classifier -- confirm the name is intended.
            self._classifier = GaussianProcessClassifier(
                kernel=RationalQuadratic(alpha=1, length_scale=1),
                random_state=0)
        elif algorithm == 'gnb':
            self._classifier = GaussianNB()
        else:
            # Fail here: a bare message would leave `_classifier` undefined and
            # every later call would die with an unrelated AttributeError.
            raise ValueError('Algorithm not found: %r' % (algorithm,))

    def train_classifier(self, X_train, y_train):
        """Fit the estimator on the training data."""
        self._classifier.fit(X_train, y_train)

    def get_predictions(self, X_test):
        """Return the estimator's predictions for X_test."""
        return self._classifier.predict(X_test)

    def get_accuracy(self, X_test, y_test):
        """Return the estimator's score() on X_test/y_test."""
        return self._classifier.score(X_test, y_test)

    def show_feature_importance(self, data, target):
        """Only works with algorithms that have feature_importances_ attribute"""
        plt.plot(self._classifier.feature_importances_, 'o')
        # one tick per column of `data`, labelled with the entries of `target`
        plt.xticks(range(data.shape[1]), target, rotation=90)
        plt.show()

    @staticmethod
    def _class_indices(scores):
        """Return the class index implied by each row of decision scores.

        Two-dimensional scores use the arg-max per row; the one-dimensional
        scores of a binary model map to index 1 where the score is positive
        and to index 0 otherwise.
        """
        scores = np.asarray(scores)
        if scores.ndim == 1:
            return (scores > 0).astype(int)
        return np.argmax(scores, axis=1)

    def print_decision_function(self, X_test):
        """Only works with algorithms that have decision_function method"""
        scores = self._classifier.decision_function(X_test)
        print(scores)
        # We can recover the predictions (as class indices) from the scores
        print(self._class_indices(scores))
        print(self._classifier.predict(X_test))

    def print_prediction_probability(self, X_test):
        """Only works with algorithms that have predict_proba method"""
        probabilities = self._classifier.predict_proba(X_test)
        print(probabilities)
        # We can recover the predictions by computing the argmax of the
        # probabilities (not of decision_function, which many estimators
        # offering predict_proba do not have).
        print(np.argmax(probabilities, axis=1))
        print(self._classifier.predict(X_test))
Example #7
0
def plotshow(trainingSet, testSet, method,
             outfile="E:/Anaconda/Scripts/CorsApi/snippets/static/picture/bayes.jpg"):
    """Fit a naive Bayes model, save its calibration plot and return its score.

    trainingSet, testSet: sequences of rows; every row holds string fields,
        the last one being the label and the others the features.
    method: selects the model -- "高斯朴素贝叶斯" (GaussianNB),
        "多项式分布贝叶斯" (MultinomialNB) or "伯努利朴素贝叶斯" (BernoulliNB).
    outfile: path the figure is saved to; defaults to the historical location.

    Returns ``clf.score`` on the test set.  The fitted model is also published
    as the module-level global ``clf``.

    Raises ValueError when ``method`` is none of the three names, instead of
    silently reusing a model left in the global by an earlier call.
    """
    global clf

    # SECURITY: eval() executes whatever expression each field contains; only
    # feed this function data from a trusted source.
    data = [list(map(eval, row[:-1])) for row in trainingSet]
    # The label field is evaluated character by character, so every label
    # becomes a list (a one-character label yields a one-element list).
    label = [list(map(eval, row[-1])) for row in trainingSet]
    data_test = [list(map(eval, row[:-1])) for row in testSet]
    label_test = [list(map(eval, row[-1])) for row in testSet]
    # all labels, training first, used below to pick the positive class
    y = label + label_test

    if method == "高斯朴素贝叶斯":
        clf = GaussianNB(priors=None)
    elif method == "多项式分布贝叶斯":
        clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    elif method == "伯努利朴素贝叶斯":
        clf = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)
    else:
        raise ValueError("unknown method: %r" % (method,))

    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    name = "Naive Bayes"
    clf = clf.fit(data, label)

    if hasattr(clf, "predict_proba"):
        # second probability column is taken as the positive-class score
        prob_pos = clf.predict_proba(data_test)[:, 1]
    else:
        # no probabilities available: rescale the decision values to [0, 1]
        prob_pos = clf.decision_function(data_test)
        prob_pos = \
            (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

    # the largest label seen in either set is treated as the positive class
    clf_score = brier_score_loss(label_test, prob_pos, pos_label=np.array(y).max())

    fraction_of_positives, mean_predicted_value = \
        calibration_curve(label_test, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label="%s (%1.3f)" % (name, clf_score))

    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
             histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots  (reliability curve)')

    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="upper center", ncol=2)

    plt.tight_layout()
    plt.savefig(outfile)
    score = clf.score(data_test, label_test)

    return score