Code example #1
def classifyTestSamples(trainingFeatures, trainingCategories, testFeatures):
    clf = SGDClassifier()

    clf.fit(trainingFeatures, trainingCategories)
    predictedCategories = clf.predict(testFeatures)

    return predictedCategories
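
As a usage sketch (not part of the original source), the helper above can be exercised on scikit-learn's bundled iris data; the imports are assumptions, since the snippet does not show its own:

from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

# toy driver for classifyTestSamples
iris = load_iris()
Xtr, Xte, ytr, yte = train_test_split(iris.data, iris.target, random_state=0)
predicted = classifyTestSamples(Xtr, ytr, Xte)
print(predicted[:10])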
Code example #2
class MachineLearning:
    def __init__(self, Master_DF):
        self.Data_Frame = Master_DF

    def Encoder(self, df):
        encoder = LabelEncoder()
        print("Fitting")
        encoder.fit(df)
        return encoder.transform(df)

    def Perceptron_PreProcessing(self, x, y):
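        # NOTE (added): factorize()[0] already returns integer codes, so the
        # LabelEncoder pass in self.Encoder below is redundant, though harmless.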
        X = self.Encoder(self.Data_Frame[x].factorize()[0])
        Y = self.Encoder(self.Data_Frame[y].factorize()[0])
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=0)
        sc = StandardScaler()
        sc.fit(X_train)
        X_train_std = sc.transform(X_train)
        X_test_std = sc.transform(X_test)
        return X_train_std, Y_train, X_test_std, Y_test

    def ppn_model(self, n_iter, eta0, random_state):
        self.X_train_std_uri, self.Y_train_category, self.X_test_std_uri, self.Y_test_category = self.Perceptron_PreProcessing(
            'uri', 'category')
        ppn = Perceptron(n_iter=n_iter, eta0=eta0, random_state=random_state)
        ppn.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
                self.Y_train_category)
        y_pred = ppn.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != y_pred).sum(), accuracy_score(
            self.Y_test_category, y_pred), y_pred

    def lr_model(self, c, random_state):
        lr = LogisticRegression(C=c, random_state=random_state)
        lr.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
               self.Y_train_category)
        y_pred = lr.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != y_pred).sum(), accuracy_score(
            self.Y_test_category, y_pred), y_pred

    def ada_sd(self, n_iter, eta, random_state):
        self.ada = SGDClassifier(n_iter=n_iter,
                                 eta0=eta,  # SGDClassifier's learning-rate argument is eta0; eta is not a valid parameter
                                 random_state=random_state)
        self.ada.fit(
            self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
            self.Y_train_category)
        self.y_pred = self.ada.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != self.y_pred).sum(), accuracy_score(
            self.Y_test_category, self.y_pred), self.y_pred
Code example #3
def withoutPipeline(x_train, y_train, x_test, y_test):
    scaler = StandardScaler().fit(x_train) # for each x, (x - mean(all x))/std. dev. of x
                                           # this step computes the mean and std. dev.
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    clfer = SGDClassifier()
    clfer.fit(x_train, y_train) # this will try to separate the three classes based
                                # on the two features we gave it. Hence, we will get
                                # back three lines. I.e., three sets of coefficients
                                # and three intercepts
    if DEBUG:
        # print(clfer.coef_)
        # print(clfer.intercept_)
        # print(clfer.predict(scaler.transform([[4.7, 3.1]])))
        # print(clfer.decision_function(scaler.transform([[4.7, 3.1]])))
        # decision_function evaluates the distance from all three lines and
        # picks the largest one (in this case [0])
        pass

    # validate results
    y_predict_train = clfer.predict(x_train)
    print "% Correct results on training set:"
    print metrics.accuracy_score(y_train, y_predict_train)
    y_predict_test = clfer.predict(x_test)
    print "\n% Correct results on testing set:"
    print metrics.accuracy_score(y_test, y_predict_test)
    # Understanding the classification report:
    # Precision: TP/(TP + FP) - ideal 1 - all instances reported as x were x. In other words,
    #                                     there were no instances reported as x that were NOT x
    # Recall:    TP/(TP + FN) - ideal 1 - all instances OF x were reported as x
    # Although, accuracy does not appear in the report, it is important to know what it means:
    # Accuracy: (TP + TN) / (TP + TN + FP + FN)
    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)
    # Understanding the confusion matrix
    # how many of class i were predicted as j
    # ideal. an Identity matrix
    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
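
To make the precision/recall definitions in the comments above concrete, here is a small self-contained sketch (toy labels, not from the original) showing the report values match TP/(TP + FP) and TP/(TP + FN):

from sklearn import metrics

# toy binary labels: for class 1, TP=2, FP=1, FN=1, TN=2
y_true = [1, 1, 1, 0, 0, 0]
y_pred = [1, 1, 0, 1, 0, 0]
print(metrics.precision_score(y_true, y_pred))  # TP/(TP+FP) = 2/3
print(metrics.recall_score(y_true, y_pred))     # TP/(TP+FN) = 2/3
print(metrics.accuracy_score(y_true, y_pred))   # (TP+TN)/total = 4/6
print(metrics.confusion_matrix(y_true, y_pred))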
Code example #4
class SGDClassifierImpl():

    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight='balanced', warm_start=False, average=False):
        self._hyperparams = {
            'loss': loss,
            'penalty': penalty,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'shuffle': shuffle,
            'verbose': verbose,
            'epsilon': epsilon,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'learning_rate': learning_rate,
            'eta0': eta0,
            'power_t': power_t,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'class_weight': class_weight,
            'warm_start': warm_start,
            'average': average}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes=None):
        if not hasattr(self, "_wrapped_model"):
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
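
A minimal usage sketch for the wrapper above, assuming SKLModel is scikit-learn's SGDClassifier imported under that alias (the constructor arguments suggest as much; treat the import as an assumption):

from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier as SKLModel  # assumed alias

iris = load_iris()
clf = SGDClassifierImpl(max_iter=1000, tol=1e-3)  # wraps SKLModel internally
clf.fit(iris.data, iris.target)
print(clf.predict(iris.data[:5]))
print(clf.decision_function(iris.data[:5]).shape)  # one score column per class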
Code example #5
def stochastic_descent(xtrain, ytrain, xtest):
    clf = SGDClassifier(loss="hinge",
                        penalty="l2",
                        max_iter=10,
                        random_state=42,
                        alpha=1e-3,
                        tol=None)
    print("SGD Fitting")
    clf.fit(xtrain, ytrain)
    # Saving the model with pickle
    with open(base_dir + "Model", 'wb') as f:
        pickle.dump(clf, f)
    print("SGD Predicting")
    ytest = clf.predict(xtest)

    return ytest
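
The snippet above pickles the fitted model; the complementary load step is sketched below (base_dir and xtest are the same assumed names used in the snippet):

import pickle

# reload the model saved by stochastic_descent
with open(base_dir + "Model", 'rb') as f:
    clf = pickle.load(f)
ytest = clf.predict(xtest)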
Code example #6
# NOTE: the opening of this snippet was truncated in the source; the imports,
# chunksize, and the estimator construction below are reconstructed
# assumptions so that the loop runs.
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

chunksize = 100  # assumed value; the original was lost in truncation
estimator = SGDClassifier(loss="log",
                          n_iter=10)
trainloss = []
testloss = []
for i, chunk in enumerate(
        pd.read_csv("cancer2.csv",
                    chunksize=chunksize,
                    header=None,
                    iterator=True)):
    X = chunk.iloc[:, :-1]
    y = chunk.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
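    # NOTE (added): partial_fit below trains on the full chunk (X, y), so this
    # chunk's held-out X_test/y_test have already been seen by the model and
    # the reported test loss is optimistic.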
    estimator.partial_fit(X, y, classes=np.unique(y))
    train_mse = mean_squared_error(y_train, estimator.predict(X_train))
    test_mse = mean_squared_error(y_test, estimator.predict(X_test))
    trainloss.append(train_mse)
    testloss.append(test_mse)
    print("trainloss:{:.4f},testloss:{:.4f} ".format(trainloss[-1],
                                                     testloss[-1]))
    if i > 3:
        break


import matplotlib.pyplot as plt
plt.plot(trainloss)
plt.plot(testloss)
plt.legend(('train', 'test'))
plt.show()
Code example #7
# NOTE: the opening of this snippet was truncated in the source; the
# train_test_split call is reconstructed on the assumption that the features
# live in columns 1-9 and the label in column 10 of `data`.
X_train, X_test, y_train, y_test = train_test_split(data[column_names[1:10]],
                                                    data[column_names[10]],
                                                    test_size=0.25,
                                                    random_state=33)
# Inspect the number and class distribution of the training samples
print(y_train.value_counts())
# Inspect the number and class distribution of the test samples
print(y_test.value_counts())

# Use linear classification models for the benign/malignant breast-tumor prediction task
# Standardize the data so every feature dimension has variance 1 and mean 0;
# this keeps predictions from being dominated by features with large values
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Initialize LogisticRegression and SGDClassifier
lr = LogisticRegression()
sgdc = SGDClassifier()
# Call fit on LogisticRegression to train the model parameters
lr.fit(X_train, y_train)
# Use the trained model lr to predict X_test; the result is stored in lr_y_predict
lr_y_predict = lr.predict(X_test)
# Call fit on SGDClassifier to train the model parameters
sgdc.fit(X_train, y_train)
# Use the trained model sgdc to predict X_test; the result is stored in sgdc_y_predict
sgdc_y_predict = sgdc.predict(X_test)

# Performance analysis of the linear classifiers on the benign/malignant tumor prediction task
# Use the logistic regression model's built-in score function to get its accuracy on the test set
print('Accuracy of the LR classifier:', lr.score(X_test, y_test))
# Use classification_report to get the other three metrics for LogisticRegression
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
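
As a quick illustration of the standardization step described above (a toy sketch, not part of the original), StandardScaler removes each column's mean and scales it to unit variance:

import numpy as np
from sklearn.preprocessing import StandardScaler

demo = np.array([[1.0, 100.0], [2.0, 200.0], [3.0, 300.0]])
scaled = StandardScaler().fit_transform(demo)
print(scaled.mean(axis=0))  # approximately [0. 0.]
print(scaled.std(axis=0))   # approximately [1. 1.]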
Code example #8
File: sgd.py Project: Bryan-LL/auto-sklearn
class SGD(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter("loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1,  log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1,
                                             default_value=0.5)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")
        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                tol, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")

        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling",
                                                            "constant"])
        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                           eta0_in_inv_con])

        return cs
Code example #9
# Reference: https://blog.csdn.net/quiet_girl/article/details/72517053

ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # fit to the data, then transform it into standardized form
X_test = ss.transform(X_test)  # perform standardization by centering and scaling

# Initialize the Stochastic Gradient Descent Classifier & Logistic Regression
lr = LogisticRegression()
sgdc = SGDClassifier()
lr.fit(X_train, y_train)  # train the LR classifier
lr_y_predict = lr.predict(X_test)  # predict on X_test

sgdc.fit(X_train, y_train)  # train the stochastic gradient descent classifier
sgdc_y_predict = sgdc.predict(X_test)  # predict on X_test

# Performance analysis
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(
    classification_report(y_test,
                          lr_y_predict,
                          target_names=['Benign', 'Malignant']))

print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(
    classification_report(y_test,
                          sgdc_y_predict,
                          target_names=['Benign', 'Malignant']))

# sklearn classification: https://blog.csdn.net/u012526003/article/details/79054012
Code example #10
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

#Initialize the classifiers
lr = LogisticRegression()
sgdc = SGDClassifier()
#Call fit on LogisticRegression to train the model parameters
lr.fit(x_train, y_train)
#Use the trained model lr to predict x_test; the result is stored in lr_y_predict
lr_y_predict = lr.predict(x_test)
#Call fit on SGDClassifier to train the model parameters
sgdc.fit(x_train, y_train)
#Use the trained model sgdc to predict x_test; the result is stored in sgdc_y_predict
sgdc_y_predict = sgdc.predict(x_test)

from sklearn.metrics import classification_report

#Use the score function to get the model's accuracy on the test set
print('Accuracy of LR Classifier:', lr.score(x_test, y_test))
#Use the classification_report module to get the other three metrics for LogisticRegression
print(
    classification_report(y_test,
                          lr_y_predict,
                          target_names=['Benign', 'Malignant']))
print("\n")
print('Accuracy of SGD Classifier:', sgdc.score(x_test, y_test))
print(
    classification_report(y_test,
                          sgdc_y_predict,
                          target_names=['Benign', 'Malignant']))
Code example #11
File: sgd.py Project: wanghia/pyinstaller_example
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 n_iter,
                 learning_rate,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 average=False,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X,
                           y,
                           n_iter=1,
                           sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:

            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=n_iter,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)
        else:
            self.estimator.n_iter += n_iter

        self.estimator.partial_fit(X,
                                   y,
                                   classes=np.unique(y),
                                   sample_weight=sample_weight)

        if self.estimator.n_iter >= self.n_iter:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        alpha = UniformFloatHyperparameter("alpha",
                                           10e-7,
                                           1e-1,
                                           log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio",
                                              1e-9,
                                              1,
                                              log=True,
                                              default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter",
                                              5,
                                              1000,
                                              log=True,
                                              default=20)
        epsilon = UniformFloatHyperparameter("epsilon",
                                             1e-5,
                                             1e-1,
                                             default=1e-4,
                                             log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default="False")
        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, n_iter, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimial, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
Code example #12
# NOTE: the header and the opening of this snippet (imports and the first
# rows of the feature list X) were truncated in the source; the lines below
# are a reconstruction and should be treated as an assumption.
from sklearn import tree, svm
from sklearn.linear_model import SGDClassifier

# [height, weight, shoe size]
X = [[181,80,44],[177,70,43],[160,60,38],[154,54,37],[166,65,40],
		[190,90,47],[175,64,39],[177,70,40],[159,55,37],
		[171,75,42],[181,85,43]]

#Corresponding gender tags
Y = ['male','female','female','female','male','male',
		'male','female','male','female','male']

# Decision Tree classifier- takes in the input data to predict whether male or female
classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(X,Y)

#Support Vector Machine Classifier
classifier1 = svm.SVC()
classifier1 = classifier1.fit(X,Y)

#Stochastic Gradient Descent 
clf = SGDClassifier()
clf = clf.fit(X,Y)

#Prediction step
#prediction for decision Trees
prediction = classifier.predict([[172,75,35]])
print(prediction)

#Prediction for Support Vector Machines
prediction1 =classifier1.predict([[177,70,43]])
print(prediction1)

#Prediction for Stochastic Gradient Descent
pred = clf.predict([[172,75,35]])
print(pred)
Code example #13
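# NOTE: this snippet is a fragment from inside a larger batch loop; names such
# as X_batch, y_batch, batch_no, encode, one_hot_encoder, and the test-set
# variables are defined earlier in the original file.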
    X_batch_kernel_approx, y_batch_onehot = encode(X_batch, y_batch,
                                                   one_hot_encoder,
                                                   column_transformer,
                                                   rbf_sampler)

    # make one pass of stochastic gradient descent over the batch.
    sgd_classifier.partial_fit(X_batch_kernel_approx, y_batch, classes=[0, 1])

    # print train/validation average-precision metrics every 5 batches
    if (batch_no % 5) == 0:
        message = "batch {:>4} ".format(batch_no)
        for origin, X, y_true_onehot in zip(
            ('train', 'val'), (X_batch_kernel_approx, X_test_kernel_approx),
            (y_batch_onehot, y_true_test_onehot)):

            y_pred = sgd_classifier.predict(X)

            # preprocess the labels and predictions to match what
            # average_precision_score expects
            y_pred_onehot = one_hot_encoder.transform(y_pred.reshape(-1, 1))

            score = average_precision_score(y_true_onehot, y_pred_onehot)
            message += "{} precision: {:.4f}  ".format(origin, score)
            if origin == 'val':
                test_scores_rbf.append(score)
                train_times_rbf.append(time.perf_counter() - t0)
                online_train_set_sizes.append((batch_no + 1) * batchsize)

        print(message)

###############################################################################
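
The fragment above is the inside of that loop; a minimal self-contained sketch of the same partial_fit pattern (synthetic data standing in for the original's encoded batches) looks like this:

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
sgd = SGDClassifier()
for batch_no in range(10):
    X_batch = rng.randn(32, 5)                 # stand-in for encoded features
    y_batch = (X_batch[:, 0] > 0).astype(int)  # stand-in binary labels
    # one pass of stochastic gradient descent over the batch;
    # the full set of classes must be declared up front
    sgd.partial_fit(X_batch, y_batch, classes=[0, 1])
    if batch_no % 5 == 0:
        print("batch {:>4} train acc: {:.4f}".format(
            batch_no, sgd.score(X_batch, y_batch)))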
Code example #14
def run(keyn, nPart):
    all_classes = np.array([0, 1])
    allKeys = [l.split()[0] for l in open('keywordsAll.txt').readlines()]
    keyFreqs = [
        float(l.split()[1]) / 4205907
        for l in open('keywordsAll.txt').readlines()
    ]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]

    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'
    #testopt = 'c'
    #testopt = 'w'
    #testopt = 'l2'
    testopt = 'l1'

    if testopt == 'c':
        cls = SGDClassifier(loss='hinge',
                            learning_rate="constant",
                            alpha=1e-6,
                            eta0=1e-2,
                            penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = 'key_' + str(
        keyn) + '_SGDtune_' + opt + '_partialfit_' + testopt + '.txt'
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2**20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv),
                                   norm=normOpt(nv),
                                   non_negative=str2bool(nneg))

    X_test = vectorizer.transform(body_test)
    #print 'test case:', len(y_test), 'positive', tot_pos, 'key:', key, 'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg
    if n0 >= 2:
        cls = joblib.load(pklName)
    for n in range(n0, 10):
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        minibatch_size = len(data) // nPart + 1
        for i in range(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            for _ in range(5):
                X_train, y_train = shuffle(X_train, y_train)
            cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write(
                "%3d %8d %.4f %.3f %.3f %.3f %.3f %5d  %5d\n" %
                (n, ntrain, accu, f1t, f1, p, r, sum(y_pred), tot_pos))
        _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
Code example #15
File: sgd.py Project: automl/paramsklearn
class SGD(ParamSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return self.estimator.n_iter >= self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype' : None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = cs.add_hyperparameter(CategoricalHyperparameter("loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge"))
        penalty = cs.add_hyperparameter(CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2"))
        alpha = cs.add_hyperparameter(UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001))
        l1_ratio = cs.add_hyperparameter(UniformFloatHyperparameter(
            "l1_ratio", 0, 1, default=0.15))
        fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter(
            "fit_intercept", "True"))
        n_iter = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "n_iter", 5, 1000, default=20))
        epsilon = cs.add_hyperparameter(UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True))
        learning_rate = cs.add_hyperparameter(CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal"))
        eta0 = cs.add_hyperparameter(UniformFloatHyperparameter(
            "eta0", 10**-7, 0.1, default=0.01))
        power_t = cs.add_hyperparameter(UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default=0.25))
        average = cs.add_hyperparameter(CategoricalHyperparameter(
            "average", ["False", "True"], default="False"))

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimial, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate, "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "ParamSklearn StochasticGradientClassifier"
Code example #16
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 n_iter,
                 learning_rate,
                 class_weight,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, Y):
        # TODO: maybe scale training data that its norm becomes 1?
        # http://scikit-learn.org/stable/modules/sgd.html#id1
        self.alpha = float(self.alpha)
        self.fit_intercept = bool(self.fit_intercept)
        self.n_iter = int(self.n_iter)
        if self.class_weight == "None":
            self.class_weight = None
        self.l1_ratio = float(self.l1_ratio)
        self.epsilon = float(self.epsilon)
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t)

        self.estimator = SGDClassifier(loss=self.loss,
                                       penalty=self.penalty,
                                       alpha=self.alpha,
                                       fit_intercept=self.fit_intercept,
                                       n_iter=self.n_iter,
                                       learning_rate=self.learning_rate,
                                       class_weight=self.class_weight,
                                       l1_ratio=self.l1_ratio,
                                       epsilon=self.epsilon,
                                       eta0=self.eta0,
                                       power_t=self.power_t,
                                       shuffle=True,
                                       random_state=self.random_state)
        self.estimator.fit(X, Y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties():
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_missing_values': False,
            'handles_nominal_values': False,
            'handles_numerical_features': True,
            'prefers_data_scaled': True,
            'prefers_data_normalized': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'handles_sparse': True,
            # TODO find out what is best used here!
            'preferred_dtype': None
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        alpha = UniformFloatHyperparameter("alpha",
                                           10**-7,
                                           10**-1,
                                           log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 0, 1, default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20)
        epsilon = UniformFloatHyperparameter("epsilon",
                                             1e-5,
                                             1e-1,
                                             default=1e-4,
                                             log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.5)
        # This does not allow for other resampling methods!
        class_weight = CategoricalHyperparameter("class_weight",
                                                 ["None", "auto"],
                                                 default="None")
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(penalty)
        cs.add_hyperparameter(alpha)
        cs.add_hyperparameter(l1_ratio)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(epsilon)
        cs.add_hyperparameter(learning_rate)
        cs.add_hyperparameter(eta0)
        cs.add_hyperparameter(power_t)
        cs.add_hyperparameter(class_weight)

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimial, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "AutoSklearn StochasticGradientClassifier"
Code example #17
Listx = [[188, 57, 30], [167, 32, 22], [193, 65, 29], [185, 53, 27],
         [164, 45, 22], [157, 38, 24], [179, 52, 27], [175, 68, 26],
         [167, 39, 24], [178, 62, 27], [158, 46, 26]]

#List of labels Y. Gender female or male

Listy = [
    'male', 'female', 'male', 'male', 'female', 'female', 'male', 'male',
    'female', 'male', 'female'
]

#Classifier objects: decision tree, stochastic gradient descent, and naive Bayes

Classifier_tree = tree.DecisionTreeClassifier()
Classifier_Sgradient = SGDClassifier()
Classifier_naive = GaussianNB()
#Training stage
Classifier_tree = Classifier_tree.fit(Listx, Listy)
Classifier_Sgradient = Classifier_Sgradient.fit(Listx, Listy)
Classifier_naive = Classifier_naive.fit(Listx, Listy)

#Test stage
Listz = [[150, 35, 21]]
Prediction_tree = Classifier_tree.predict(Listz)
Prediction_Gradient = Classifier_Sgradient.predict(Listz)
Prediction_naive = Classifier_naive.predict(Listz)

print(Prediction_tree)
print(Prediction_Gradient)
print(Prediction_naive)
Code example #18
class SGD:
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 tol,
                 learning_rate,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 average=False,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X,
                           y,
                           n_iter=2,
                           refit=True,
                           sample_weight=sample_weight)
        iteration = 2
        while not self.configuration_fully_fitted():
            n_iter = int(2**iteration / 2)
            self.iterative_fit(X,
                               y,
                               n_iter=n_iter,
                               sample_weight=sample_weight)
            iteration += 1
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X,
                y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)
Code example #19
File: sgd.py Project: goldenair/to-know-autoML
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 tol,
                 learning_rate,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 average=False,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        n_iter = 2
        self.iterative_fit(X,
                           y,
                           n_iter=n_iter,
                           sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            n_iter *= 2
            self.iterative_fit(X,
                               y,
                               n_iter=n_iter,
                               sample_weight=sample_weight)

        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:

            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X,
                y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 1000 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default_value="l2")
        alpha = UniformFloatHyperparameter("alpha",
                                           1e-7,
                                           1e-1,
                                           log=True,
                                           default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio",
                                              1e-9,
                                              1,
                                              log=True,
                                              default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol",
                                         1e-5,
                                         1e-1,
                                         log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter("epsilon",
                                             1e-5,
                                             1e-1,
                                             default_value=1e-4,
                                             log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter("eta0",
                                          1e-7,
                                          1e-1,
                                          default_value=0.01)
        power_t = UniformFloatHyperparameter("power_t",
                                             1e-5,
                                             1,
                                             default_value=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default_value="False")
        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, tol, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
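
The fully_fit_ bookkeeping above relies on scikit-learn reporting how many epochs were actually run. As a standalone illustration of that convergence check, here is a minimal sketch on toy data, assuming a recent scikit-learn (this is not part of the component itself):

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
clf = SGDClassifier(max_iter=100, tol=1e-3, random_state=0)
clf.fit(X, y)

# n_iter_ < max_iter means early stopping fired because the loss stopped
# improving; n_iter_ == max_iter leaves convergence undetermined.
print("epochs used:", clf.n_iter_, "of", clf.max_iter)
print("converged early:", clf.n_iter_ < clf.max_iter)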
Code example #20
# Assumed imports for this fragment (vec, train_data and test_data come from
# earlier, elided context)
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

X_train_feature = vec.fit_transform(train_data['word_seg'])
X_test_feature = vec.transform(test_data['word_seg'])

# -------------- sentiment-value prediction starts here ------------------
y_train_sent = train_data['sentiment_value'].astype(int)
X_train_sent, X_test_sent, y_train_sent, y_test_sent = \
    train_test_split(X_train_feature, y_train_sent, test_size=0.1, random_state=42)
# clf = LogisticRegression(C=4, dual=True)
# clf = svm.LinearSVC()
# clf = RandomForestClassifier()
clf = SGDClassifier(max_iter=80)  # max_iter replaced the deprecated n_iter parameter
# tune_params(X_train_sent, y_train_sent)
clf.fit(X_train_sent, y_train_sent)

# Evaluate the model on the held-out split
pred_test_sent = clf.predict(X_test_sent)
# precision = TP / (TP + FP)
precision = precision_score(y_test_sent,
                            pred_test_sent,
                            pos_label=None,
                            average='weighted')
# recall = TP / (TP + FN)
recall = recall_score(y_test_sent,
                      pred_test_sent,
                      pos_label=None,
                      average='weighted')
# F1
f1 = f1_score(y_test_sent, pred_test_sent, pos_label=None, average='weighted')
# accuracy
accuracy = accuracy_score(y_test_sent, pred_test_sent)
print("precision:{:.4f}-recall:{:.4f}-f1:{:.4f}-accuracy:{:.4f}".format(
    precision, recall, f1, accuracy))
Code example #21

# sklearn implementation
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.metrics import accuracy_score


# Fit an sklearn Perceptron and an SGDClassifier with the perceptron loss
# (configured this way, the two should be identical)
clf = Perceptron(random_state=None, eta0=0.1, shuffle=False, penalty=None,
                 class_weight=None, fit_intercept=False)
clf2 = SGDClassifier(loss="perceptron", eta0=0.1, learning_rate="constant",
                     penalty=None, random_state=None, shuffle=False,
                     fit_intercept=False, warm_start=False, average=False,
                     max_iter=1000)  # max_iter replaced the deprecated n_iter parameter
clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)

y_predict = clf.predict(x_test)
y_preSGD = clf2.predict(x_test)

print("sklearn Perceptron accuracy:")
print(accuracy_score(y_test, y_predict))

print("sklearn SGDClassifier accuracy:")
print(accuracy_score(y_test, y_preSGD))

print("my perceptron accuracy:")  # y_pred comes from the custom perceptron defined earlier
print(accuracy_score(y_test, y_pred))
print("\n")
# print(clf.coef_)

def x22(x1, w):
    # With fit_intercept=False the decision boundary is w0*x1 + w1*x2 = 0,
    # so the second coordinate on the boundary is x2 = -(w0 / w1) * x1
    w0 = clf.coef_[0][0]
    w1 = clf.coef_[0][1]
    return -(w0 / w1) * x1
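
To verify the "should be identical" claim without the elided x_train/x_test arrays, a self-contained sketch on synthetic data (make_classification is an assumption here, not the original dataset) can compare the learned weights directly:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron, SGDClassifier

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
p = Perceptron(eta0=0.1, shuffle=False, penalty=None, fit_intercept=False,
               max_iter=1000, tol=None)
s = SGDClassifier(loss="perceptron", eta0=0.1, learning_rate="constant",
                  penalty=None, shuffle=False, fit_intercept=False,
                  max_iter=1000, tol=None)
p.fit(X, y)
s.fit(X, y)
# With identical step size, sample order and regularization, the two models
# make the same updates, so the weight vectors should coincide (expected: True).
print(np.allclose(p.coef_, s.coef_))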
Code example #22
class SGD(
    IterativeComponentWithSampleWeight,
    BaseClassificationModel,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None
        self.time_limit = None
        self.start_time = time.time()

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model import SGDClassifier  # public import path

        # Need to fit at least two iterations, otherwise early stopping will
        # not work: the only way to detect convergence is to check whether the
        # SGD stopped after fewer iterations than max_iter. If max_iter == 1,
        # the estimator always runs exactly one iteration, so convergence can
        # never be detected.

        if refit:
            self.estimator = None

        if self.estimator is None:
            if isinstance(self.loss, tuple):
                nested_loss = self.loss
                self.loss = nested_loss[0]
                if self.loss == 'modified_huber':
                    self.epsilon = nested_loss[1]['epsilon']

            if isinstance(self.penalty, tuple):
                nested_penalty = self.penalty
                self.penalty = nested_penalty[0]
                if self.penalty == "elasticnet":
                    self.l1_ratio = nested_penalty[1]['l1_ratio']

            if isinstance(self.learning_rate, tuple):
                nested_learning_rate = self.learning_rate
                self.learning_rate = nested_learning_rate[0]
                if self.learning_rate == 'invscaling':
                    self.eta0 = nested_learning_rate[1]['eta0']
                    self.power_t = nested_learning_rate[1]['power_t']
                elif self.learning_rate == 'constant':
                    self.eta0 = nested_learning_rate[1]['eta0']

            # a freshly created estimator starts out not fully fitted
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0) if self.eta0 is not None else 0.01
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        if optimizer == 'smac':
            cs = ConfigurationSpace()

            loss = CategoricalHyperparameter("loss",
                                             ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
                                             default_value="log")
            penalty = CategoricalHyperparameter(
                "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
            alpha = UniformFloatHyperparameter(
                "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
            l1_ratio = UniformFloatHyperparameter(
                "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
            fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
            tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                             default_value=1e-4)
            epsilon = UniformFloatHyperparameter(
                "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
            learning_rate = CategoricalHyperparameter(
                "learning_rate", ["optimal", "invscaling", "constant"],
                default_value="invscaling")
            eta0 = UniformFloatHyperparameter(
                "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
            power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, log=True,
                                                 default_value=0.5)
            average = CategoricalHyperparameter(
                "average", ["False", "True"], default_value="False")
            cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                    tol, epsilon, learning_rate, eta0, power_t,
                                    average])

            # TODO add passive/aggressive here, although not properly documented?
            elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
            epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")

            power_t_condition = EqualsCondition(power_t, learning_rate,
                                                "invscaling")

            # eta0 is only relevant if learning_rate!='optimal' according to code
            # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
            # linear_model/sgd_fast.pyx#L603
            eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling",
                                                                "constant"])
            cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                               eta0_in_inv_con])

            return cs
        elif optimizer == 'tpe':
            eta0 = hp.loguniform('sgd_eta0', np.log(1e-7), np.log(1e-1))
            space = {
                'loss': hp.choice('sgd_loss', [
                    ("modified_huber", {'epsilon': hp.loguniform('sgd_epsilon', np.log(1e-5), np.log(1e-1))}),
                    ("hinge", {}),
                    ("log", {}),
                    ("squared_hinge", {}),
                    ("perceptron", {})]),
                'penalty': hp.choice('sgd_penalty',
                                     [("elasticnet",
                                       {'l1_ratio': hp.loguniform('sgd_l1_ratio', np.log(1e-9), np.log(1))}),
                                      ("l1", None),
                                      ("l2", None)]),
                'alpha': hp.loguniform('sgd_alpha', np.log(1e-7), np.log(1e-1)),
                'fit_intercept': hp.choice('sgd_fit_intercept', ["True"]),
                'tol': hp.loguniform('sgd_tol', np.log(1e-5), np.log(1e-1)),
                'learning_rate': hp.choice('sgd_learning_rate', [("optimal", {}),
                                                                 ("invscaling",
                                                                  {'power_t': hp.loguniform('sgd_power_t', np.log(1e-5),
                                                                                            np.log(1)),
                                                                   'eta0': eta0}),
                                                                 ("constant", {'eta0': eta0})]),

                'average': hp.choice('sgd_average', ["True", "False"])}

            init_trial = {'loss': ("log", {}),
                          'penalty': ("l2", {}),
                          'alpha': 1e-4,
                          'fit_intercept': "True",
                          'tol': 1e-4,
                          'learning_rate': ("invscaling", {'power_t': 0.5, 'eta0': 0.01}),
                          'average': "False"}

            return space
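
As a side note, a space built from hp.choice/hp.loguniform like the 'tpe' branch above can be sampled directly for inspection; a minimal sketch assuming the hyperopt package is installed (the sub-space shown is an abridged version of the one above):

import numpy as np
from hyperopt import hp
from hyperopt.pyll import stochastic

space = {
    'alpha': hp.loguniform('sgd_alpha', np.log(1e-7), np.log(1e-1)),
    'penalty': hp.choice('sgd_penalty', [
        ("elasticnet", {'l1_ratio': hp.loguniform('sgd_l1_ratio',
                                                  np.log(1e-9), np.log(1))}),
        ("l1", {}),
        ("l2", {})]),
}
# Draw one random configuration; conditional sub-parameters (here l1_ratio)
# only appear when the matching branch of the choice is drawn, mirroring the
# (name, {params}) tuple convention unpacked in iterative_fit above.
print(stochastic.sample(space))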
Code example #23
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        # Reading the CSV file and converting it into a pandas data-frame
        df = pd.read_csv(path, encoding="ISO-8859-1")

        # Reading the name for the file for the model that will be saved
        filename = request.form['filename']

        # Reading the names of the feature and label as strings
        str1 = request.form['feature']
        str2 = request.form['label']

        # Assigning the feature and label variables to the respective columns
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')
        '''
        # Removing the punctuations and HTTP links in the feature text input
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        '''
        X = X.str.lower()

        # Optional tokenization and lemmatization using spaCy
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """

        # Splitting the data-set into 2 parts : Training data and Test data
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            shuffle=True)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        # Fitting all the classification models one by one and recording their accuracies and execution times

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf, y_train)
        pred = clf11.predict(tfidfvect.transform(X_test))
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        clf12 = SGDClassifier(n_jobs=-1)

        clf12.fit(X_train_tfidf, y_train)
        pred = clf12.predict(tfidfvect.transform(X_test))
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        # Compare the accuracies of all the models, then save (pickle) the one
        # with the highest accuracy for later use.

        acu_list = [a1, a2, a3, a4, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html",
                               ac1=a1,
                               ac2=a2,
                               ac3=a3,
                               ac4=a4,
                               ac11=a11,
                               ac12=a12)
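
For completeness, the artifacts pickled above would be loaded back for inference roughly as follows; a sketch with a hypothetical filename ('spam' stands in for whatever the form submitted):

import pickle

with open('spam_model', 'rb') as f:        # filename + '_model'
    model = pickle.load(f)
with open('spam_tfidfVect', 'rb') as f:    # filename + '_tfidfVect'
    vect = pickle.load(f)

# Vectorize raw text with the saved TfidfVectorizer, then classify it
print(model.predict(vect.transform(["example text to classify"])))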
Code example #24
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        df = pd.read_csv(path, encoding="ISO-8859-1")

        filename = request.form['filename']

        str1 = request.form['feature']
        str2 = request.form['label']

        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)

        X = X.str.lower()
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()

        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()

        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)

        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))
        start = time()
        clf10 = RidgeClassifierCV()

        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        clf12 = SGDClassifier(n_jobs=-1)

        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html",
                               ac1=a1,
                               ac2=a2,
                               ac3=a3,
                               ac4=a4,
                               ac5=a5,
                               ac6=a6,
                               ac7=a7,
                               ac8=a8,
                               ac9=a9,
                               ac10=a10,
                               ac11=a11,
                               ac12=a12)
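
The repeated fit/score/time blocks in the two routes above all follow one pattern; a hedged sketch of how they could be folded into a single helper (the names here are illustrative, not from the original code):

from time import time
from sklearn.metrics import accuracy_score

def benchmark(models, X_train, y_train, X_test, y_test):
    """Fit each model, print its accuracy and wall-clock time, return scores."""
    scores = {}
    for name, clf in models.items():
        start = time()
        clf.fit(X_train, y_train)
        scores[name] = accuracy_score(y_test, clf.predict(X_test))
        print("accuracy {}: {} and time: {}".format(name, scores[name],
                                                    time() - start))
    return scores

# Usage (with the vectorized matrices from the route above):
#   scores = benchmark({'SVC': LinearSVC(), 'MNB': MultinomialNB()},
#                      X_train_tfidf, y_train,
#                      tfidfvect.transform(X_test), y_test)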