Example 1
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

class kernelsvm():
    def __init__(self, theta0, alpha, loss_metric):
        self.theta0 = theta0            # RBF gamma for the Nystroem feature map
        self.alpha = alpha              # regularization strength for SGD
        self.loss_metric = loss_metric  # loss passed through to SGDClassifier
    def fit(self, X, y, idx_SR):
        # Build an approximate RBF feature map on the subsampled rows idx_SR.
        # General_Nystroem is a project-specific class defined elsewhere.
        n_SR = len(idx_SR)
        self.feature_map_nystroem = General_Nystroem(kernel='rbf', gamma=self.theta0, n_components=n_SR)
        X_features = self.feature_map_nystroem.fit_transform(X, idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric, alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")
    def predict(self, X):
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        return self.clf.predict(X_transform), X_transform
    def decision_function(self, X):
        # X should be the transformed input!
        return self.clf.decision_function(X)
    def err_rate(self, y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        err_rate = 1.0-acc
        return err_rate
    def get_params(self):
        return self.clf.get_params()
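If General_Nystroem is available on the path, a usage sketch might look like the following (synthetic data; the theta0/alpha values are illustrative, not taken from the original):

import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
# Subsample 50 rows to anchor the Nystroem approximation.
idx_SR = np.random.RandomState(0).choice(len(X), size=50, replace=False)

model = kernelsvm(theta0=0.1, alpha=1e-4, loss_metric='hinge')
model.fit(X, y, idx_SR)
y_pred, X_transform = model.predict(X)
print("error rate:", model.err_rate(y, y_pred))
# decision_function expects the already-transformed features
print("decision scores:", model.decision_function(X_transform)[:5])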
Example 2
import logging

import mlflow
import numpy as np
import pandas as pd
from joblib import dump
from mlflow.tracking import MlflowClient
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


def main(input_path, output_path):
    client = MlflowClient()
    experiment = client.get_experiment_by_name("iris")
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        # Import dataset
        logging.info(f"reading {input_path}")
        mlflow.log_artifact(input_path)
        iris = pd.read_csv(input_path)
        X = iris.drop("Species", axis=1)
        y = iris.Species
        # Instantiate PCA
        pca = PCA()
        # Instantiate logistic regression (SGD with log loss)
        logistic = SGDClassifier(loss='log', penalty='l2', max_iter=100, tol=1e-3, random_state=0)
        mlflow.log_params(logistic.get_params())
        # Parameters grid to try
        param_grid = {
            'pca__n_components': [2, 3],
            'logistic__alpha': np.logspace(-4, 4, 5),
        }
        mlflow.log_params(param_grid)
        # Define training pipeline
        pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
        # Training
        logging.info("beginning training")
        search = GridSearchCV(pipe, param_grid, cv=3, return_train_score=False)
        search.fit(X, y)
        print(f"Best parameter (CV score={search.best_score_}):")
        print(search.best_params_)
        mlflow.log_params(search.best_params_)
        mlflow.log_metric("best_score", search.best_score_)
        # Save best model
        logging.info("saving best model")
        dump(search.best_estimator_, output_path)
        mlflow.log_artifact(output_path)
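A hypothetical invocation (the paths are placeholders, and the "iris" experiment must already exist in the MLflow tracking store for get_experiment_by_name to return it):

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main("data/iris.csv", "models/best_model.joblib")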
Example 3
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression, SGDClassifier


class PlattScaledSVM(BaseEstimator, ClassifierMixin):
    def __init__(self, **svm_kwargs):
        self.svm_kwargs = svm_kwargs
        # Hinge-loss SGD is a linear SVM; its decision scores get calibrated below.
        self.svm = SGDClassifier(loss="hinge", **self.svm_kwargs)
        # Logistic regression on the decision scores implements Platt scaling.
        self.lr = LogisticRegression()

    def __repr__(self):
        param_str = ', '.join(
            "{0}={1}".format(k, v) for k, v in self.svm.get_params().items()
        )
        return "PlattScaledSVM({})".format(param_str)

    __str__ = __repr__

    def fit(self, X, y):
        self.svm.fit(X, y)
        dists = self.svm.decision_function(X)
        self.lr.fit(dists.reshape(-1, 1), y)
        return self

    def predict(self, X, y=None):
        dists = self.svm.decision_function(X)
        return self.lr.predict(dists.reshape(-1, 1))

    def predict_proba(self, X, y=None):
        dists = self.svm.decision_function(X)
        probs = self.lr.predict_proba(dists.reshape(-1, 1))
        return probs

    def get_params(self, deep=True):
        return self.svm_kwargs

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
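A usage sketch on synthetic data, showing the two-stage fit (hinge-loss SGD, then logistic calibration of its decision scores); the hyperparameter values are illustrative only:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

clf = PlattScaledSVM(alpha=1e-4, max_iter=1000, tol=1e-3)
clf.fit(X_tr, y_tr)
print(clf.predict(X_te)[:5])
print(clf.predict_proba(X_te)[:3])  # calibrated probabilities from the logistic layer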
Example 4
from sklearn.linear_model import SGDClassifier


def SVM(train_bow_tf_idf, train_labels, bow_test_tf_idf, test_labels):
    # Train a linear SVM (support vector machine) with SGD on tf-idf features.

    model = SGDClassifier(loss='squared_hinge',
                          average=100,
                          penalty='l2',
                          alpha=0.0001,
                          random_state=None,
                          max_iter=100,
                          tol=None,
                          n_jobs=-1)
    model.fit(train_bow_tf_idf, train_labels)

    print()
    print('------- Support Vector Machine (SVM) -------')
    # evaluate the model
    print('Hyperparameters:')
    print(model.get_params())
    train_pred = model.predict(train_bow_tf_idf)
    print('SVM train accuracy = {}'.format(
        (train_pred == train_labels).mean()))
    test_pred = model.predict(bow_test_tf_idf)
    print('SVM test accuracy = {}'.format((test_pred == test_labels).mean()))

    # # gridsearch for best Hyperparameter
    # parameters = {'alpha': (1, 0.1, 0.01, 0.001, 0.0001 ),
    #               'loss': ('squared_hinge', 'hinge' )
    #               }
    # gs_clf = GridSearchCV(model, parameters, n_jobs=-1)
    # gs_clf = gs_clf.fit(train_bow_tf_idf, train_labels)
    #
    # best_parameters = gs_clf.best_estimator_.get_params()
    # print('Best params using gridSearch:')
    # print(best_parameters)
    # gstrain_pred = gs_clf.predict(train_bow_tf_idf)
    # print('New hyperparameters SVM train accuracy = {}'.format((gstrain_pred == train_labels).mean()))
    # gstest_pred = gs_clf.predict(bow_test_tf_idf)
    # print('New hyperparameters SVM test accuracy = {}'.format((gstest_pred == test_labels).mean()))
    # print('---------------------------------------')
    # print()

    return model, test_pred
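A usage sketch, with scikit-learn's TfidfVectorizer and the 20 newsgroups corpus standing in for the project's bag-of-words pipeline (the data wiring here is an assumption, not the original's):

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')
vec = TfidfVectorizer()
train_bow = vec.fit_transform(train.data)
test_bow = vec.transform(test.data)

model, test_pred = SVM(train_bow, train.target, test_bow, test.target)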
Example 5
from sklearn.linear_model import SGDClassifier


def get_sgd(x_train, t_train, x_val, t_val, search=False):
    # param_sel and validate are helpers defined elsewhere in this project.
    # Results recorded from earlier runs:
    # {'alpha': 0.1, 'loss': 'hinge', 'penalty': 'l2'}
    # params {'alpha': 0.1, 'loss': 'squared_hinge', 'penalty': 'l2'}
    # sgd validated at (array([0.83904737, 0.77597488, 0.63281863, 0.64005236, 0.78926702]), 0.5431523356769534)
    # SGD tested at (array([0.5211474 , 0.75460637, 0.42106365, 0.84335079, 0.82848168]), 0.5295225644352521)
    if search:
        sgd_params = param_sel(
            x_train, t_train, SGDClassifier(max_iter=2000), {
                'alpha': [0.01, 0.06, 0.1, 0.6, 1],
                'loss': ['hinge', 'log', 'squared_hinge', 'modified_huber'],
                'penalty': ['l2']
            })
    else:
        sgd_params = {'alpha': 0.1, 'loss': 'squared_hinge', 'penalty': 'l2'}

    sgd_classifier = SGDClassifier(**sgd_params, max_iter=2000)
    sgd_classifier.fit(x_train, t_train)
    print("SGD params:", sgd_classifier.get_params())
    print("SGD validated at", validate(sgd_classifier, x_val, t_val))
    return sgd_classifier
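param_sel is not shown in this example; a minimal stand-in, assuming it grid-searches the estimator and returns the best parameter dict, could look like this (a sketch, not the original helper):

from sklearn.model_selection import GridSearchCV

def param_sel(X, y, estimator, grid, cv=5):
    # Exhaustive grid search; return the best hyperparameters as a dict
    # ready to be splatted back into SGDClassifier(**sgd_params).
    search = GridSearchCV(estimator, grid, cv=cv)
    search.fit(X, y)
    return search.best_params_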
Example 6
import numpy as np
from sklearn.linear_model import SGDClassifier


def train_and_predict(train_X, train_y, test_X, test_y, coco, prefix):
    """
    Trains an SVM classifier using the given training dataset.
    Then, makes predictions with the given test dataset.
    """

    # train (the default hinge loss makes SGDClassifier a linear SVM)
    sgd_clf = SGDClassifier(random_state=42, max_iter=1000, tol=1e-3)
    sgd_clf.fit(train_X, train_y)
    print(sgd_clf.get_params(deep=True))

    # test
    print("Predicting with trained SVM...")
    y_predictions = sgd_clf.predict(test_X)
    print('Percentage correct: ',
          100 * np.sum(y_predictions == test_y) / len(test_y))

    # plot confusion matrix (build_confusion_matrix and results_folder are
    # defined elsewhere in the original module)
    unique_labels = np.unique(list(test_y) + list(y_predictions))
    categories = coco.loadCats(unique_labels)
    category_names = [cat["name"] for cat in categories]
    build_confusion_matrix(
        sgd_clf, test_X, test_y, category_names,
        "{0}/{1}_confusion_matrix.png".format(results_folder, prefix))
Example 7
import zipfile

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# TempTrain and TrainTransaction are built earlier in the original script.
X_train, X_test, y_train, y_test = train_test_split(TempTrain, TrainTransaction['isFraud'], test_size=0.1, random_state=42)

# Set up SGD model
SGDModel = SGDClassifier(loss="log", penalty="l2", max_iter=1000)
SGDModel.fit(X_train, y_train)

# Predict values
PredictedValues = SGDModel.predict(X_test)

# Metrics
print(confusion_matrix(y_test, PredictedValues))
print(classification_report(y_test, PredictedValues))

# Save parameters and metrics
with open("Params_V3.txt", "w") as text_file:
    text_file.write("%s\n" % SGDModel.get_params())
    text_file.write("%s\n" % confusion_matrix(y_test, PredictedValues))
    text_file.write("%s\n" % classification_report(y_test, PredictedValues))

# Try with the held-out test file
TestSet_dev = pd.read_csv(zipfile.ZipFile('Data/test_transaction.csv.zip').open("test_transaction.csv"))
X_test_dev = TestSet_dev[np.concatenate((["C" + str(X) for X in [1, 2, 3, 5, 6, 7, 11, 12]],
                                         ["D" + str(X) for X in range(1, 16)]))]
print(X_test_dev.shape)
print(X_test_dev.dropna().shape)  # how many rows would survive dropping NaNs
X_test_dev = X_test_dev.fillna(0)  # keep all rows; fill missing values with 0

##################
# Submit predictions
PredictedValues_Dev = SGDModel.predict(X_test_dev)
Example 8
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics.pairwise import euclidean_distances, rbf_kernel
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

# learners (providing BaseLearner) and MAX_ITER come from the surrounding project.


class RBFSVMLearner(learners.BaseLearner):
    def __init__(self,
                 loss="hinge",
                 penalty='l2',
                 alpha=1e-9,
                 l1_ratio=0,
                 fit_intercept=True,
                 max_iter=MAX_ITER,
                 tol=None,
                 shuffle=True,
                 verbose=False,
                 epsilon=0.1,  # scikit-learn's default epsilon
                 n_jobs=1,
                 random_state=None,
                 learning_rate="optimal",
                 eta0=0.0,
                 power_t=0.5,
                 class_weight=None,
                 warm_start=False,
                 average=False,
                 n_iter=2000,
                 gamma_frac=0.1,
                 use_linear=False):
        super().__init__(verbose)
        self._alpha = alpha
        self._gamma_frac = gamma_frac
        self._n_iter = n_iter
        self._use_linear = use_linear
        self._learner = SGDClassifier(loss=loss,
                                      penalty=penalty,
                                      alpha=self._alpha,
                                      l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter,
                                      tol=tol,
                                      shuffle=shuffle,
                                      verbose=verbose,
                                      epsilon=epsilon,
                                      n_jobs=n_jobs,
                                      average=average,
                                      learning_rate=learning_rate,
                                      eta0=eta0,
                                      power_t=power_t,
                                      class_weight=class_weight,
                                      warm_start=warm_start,
                                      # n_iter is no longer an SGDClassifier
                                      # argument; epochs are set via max_iter
                                      random_state=random_state)

        self.gamma = None
        self.X_ = None
        self.classes_ = None
        self.kernels_ = None
        self.y_ = None

    def learner(self):
        return self._learner

    def fit(self, training_data, classes):
        if self._use_linear:
            return self._learner.fit(training_data, classes)
        # Validate training_data and classes
        training_data, classes = check_X_y(training_data, classes)

        # Median heuristic for the RBF width: gamma is the inverse of the
        # median squared pairwise distance, scaled by gamma_frac.
        dist = euclidean_distances(training_data, squared=True)
        median = np.median(dist)
        del dist  # the full distance matrix can be large; free it early
        self.gamma = 1 / (median * self._gamma_frac)
        kernels = rbf_kernel(training_data, None, self.gamma)

        self.X_ = training_data
        self.classes_ = unique_labels(classes)
        self.kernels_ = kernels
        self.y_ = classes
        self._learner.fit(self.kernels_, self.y_)

        # Return the classifier
        return self

    def predict(self, data):
        if self._use_linear:
            return self._learner.predict(data)

        # Check that fit has been called
        check_is_fitted(self, ['X_', 'y_', '_learner', 'kernels_'])
        # Input validation
        data = check_array(data)
        new_kernels = rbf_kernel(data, self.X_, self.gamma)
        pred = self._learner.predict(new_kernels)
        return pred

    # We pass gamma_frac around
    def get_params(self, deep=True):
        """
        Get the current parameters for the learner. This passes the call back to the learner from learner()

        :param deep: If true, fetch deeply
        :return: The parameters
        """
        extra_params = {
            'gamma_frac': self._gamma_frac,
            'use_linear': self._use_linear
        }
        params = self._learner.get_params(deep)

        return {k: v for d in (params, extra_params) for k, v in d.items()}

    def set_params(self, **params):
        """
        Set the current parameters for the learner. This passes the call back to the learner from learner()

        :param params: The params to set
        :return: self
        """
        if 'gamma_frac' in params:
            self._gamma_frac = params.pop('gamma_frac', None)
        if 'use_linear' in params:
            self._use_linear = params.pop('use_linear', None)

        return self._learner.set_params(**params)
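The gamma computed in fit() is the classic median heuristic, scaled by gamma_frac; a standalone illustration on random data:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
# gamma = 1 / (gamma_frac * median of the squared pairwise distances)
median_sq = np.median(euclidean_distances(X, squared=True))
gamma = 1.0 / (0.1 * median_sq)  # gamma_frac = 0.1, as in the class default
print(gamma)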
Example 9
import time

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support


class MiniBatchSGD(BaseEstimator):
    def __init__(self,
                 verbose=False,
                 test=None,
                 batch_size=100,
                 *args,
                 **kwargs):
        # Remaining positional/keyword arguments configure the wrapped SGDClassifier.
        self.classifier = SGDClassifier(*args, **kwargs)
        self.verbose = verbose
        self.test = test
        self.batch_size = batch_size

    def fit(self, X, y):
        indices = np.arange(X.shape[0])
        # Ceiling division: an extra batch covers any remainder.
        num_batches = X.shape[0] // self.batch_size + (
            (X.shape[0] % self.batch_size) != 0)
        classes = np.unique(y)

        # n_iter no longer exists on SGDClassifier; reuse max_iter as the
        # number of epochs over the shuffled data.
        for i in range(self.classifier.max_iter):
            if self.verbose:
                print('epoch {}'.format(i))

            np.random.shuffle(indices)

            for j in range(num_batches):
                X_batch = X[indices[j * self.batch_size:(j + 1) *
                                    self.batch_size]]
                y_batch = y[indices[j * self.batch_size:(j + 1) *
                                    self.batch_size]]

                start = time.time()
                # Update the model on the current mini-batch only.
                self.classifier.partial_fit(X_batch, y_batch, classes=classes)
                y_pred = self.classifier.predict(X)
                p, r, f, s = precision_recall_fscore_support(y, y_pred)
                cost = f  # per-class F-scores, reported as the "cost"

                if self.verbose:
                    print("epoch: {} batch: {} cost: {} time: {}".format(
                        i, j, cost,
                        time.time() - start))
                if j % 10 == 0 and self.verbose and self.test is not None:
                    for index in range(len(self.test)):
                        y_pred = self.classifier.predict(self.test[index][0])
                        p, r, f, s = precision_recall_fscore_support(
                            self.test[index][1], y_pred)
                        print("precision: {} recall: {} ".format(p, r))

    def predict(self, X):
        return self.classifier.predict(X)

    def decision_function(self, X):
        return self.classifier.decision_function(X)

    def get_params(self, deep=True):
        params = self.classifier.get_params()
        params.update({
            'verbose': self.verbose,
            'test': self.test,
            'batch_size': self.batch_size
        })
        return params
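A usage sketch (synthetic data): arguments after batch_size are forwarded to the wrapped SGDClassifier, so max_iter doubles here as the epoch count:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, random_state=0)
clf = MiniBatchSGD(verbose=False, batch_size=200, loss='hinge', max_iter=5)
clf.fit(X, y)
print(clf.predict(X[:5]))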
Example 10
from sklearn.linear_model import SGDClassifier

linear_name = 'SGDClassifier'

linear_params_grid = {
    'alpha': [0.0001],
    'loss': ['log'],
    # 'loss': ['hinge', 'log'],
    'max_iter': [1000],
    'tol': [0.001, 0.0001, 0.01, 0.1]
}

linear = SGDClassifier(random_state=42)

if __name__ == "__main__":
    print(linear.get_params())
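A sketch of how this module's grid might be consumed downstream (the GridSearchCV wiring and the iris data are assumptions, not shown in the original):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV

X, y = load_iris(return_X_y=True)
# note: in scikit-learn >= 1.1, the 'log' loss was renamed 'log_loss'
search = GridSearchCV(linear, linear_params_grid, cv=3)
search.fit(X, y)
print(search.best_params_)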
Example 11
# experiment, experiment_name, and input_data_path are defined earlier in the
# original script (the snippet is truncated at both ends).
with mlflow.start_run(experiment_id=experiment.experiment_id,
                      run_name=f"run_{experiment_name}"):

    #-------Load data -----------#
    iris = pd.read_csv(input_data_path)
    X = iris.drop("Species", axis=1)
    y = iris.Species

    #-------Define model and parameters----------#

    pca = PCA()
    logistic = SGDClassifier(loss='log',
                             penalty='l2',
                             max_iter=200,
                             tol=1e-3,
                             random_state=0)
    mlflow.log_params(logistic.get_params())
    param_grid = {
        'pca__n_components': [2],
        'logistic__alpha': np.logspace(-2, 1, 2),
    }
    mlflow.log_params(param_grid)
    pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

    #--------Training ----------#

    logging.info("beginning training")
    search = GridSearchCV(pipe, param_grid, cv=2, return_train_score=False)
    search.fit(X, y)
    logging.info(f"Best parameter (CV score={search.best_score_}):")

    best_param_renamed = {