def SGD_c_fit(X, y):
    """Fit a logistic-loss, L2-penalised SGD classifier on ``(X, y)``.

    The classifier runs 5 shuffled passes with ``alpha=1e-3``.

    Returns:
        Whatever ``SGDClassifier.fit`` returns for the given data.
    """
    sgd_settings = dict(loss='log', penalty='l2', alpha=1e-3, n_iter=5,
                        shuffle=True)
    model = SGDClassifier(**sgd_settings)
    return model.fit(X, y)
def classifyTestSamples(trainingFeatures, trainingCategories, testFeatures):
    """Train a default ``SGDClassifier`` and predict the test samples.

    Args:
        trainingFeatures: feature data passed to ``fit``.
        trainingCategories: target labels passed to ``fit``.
        testFeatures: feature data passed to ``predict``.

    Returns:
        The predictions produced for ``testFeatures``.
    """
    model = SGDClassifier()
    model.fit(trainingFeatures, trainingCategories)
    return model.predict(testFeatures)
def fit(self, X, Y):
    """Coerce the stored hyperparameters, build an SGDClassifier and fit it.

    The hyperparameters are read from (and written back to) attributes of
    ``self``; the fitted model is stored in ``self.estimator``.

    Returns:
        ``self``.
    """
    # TODO: maybe scale training data that its norm becomes 1?
    # http://scikit-learn.org/stable/modules/sgd.html#id1
    self.alpha = float(self.alpha)
    # NOTE(review): bool() maps every non-empty string, including "False",
    # to True -- confirm fit_intercept never arrives as a string here.
    self.fit_intercept = bool(self.fit_intercept)
    self.n_iter = int(self.n_iter)
    # The literal string "None" stands for "no class weighting".
    if self.class_weight == "None":
        self.class_weight = None
    for float_attr in ("l1_ratio", "epsilon", "eta0", "power_t"):
        setattr(self, float_attr, float(getattr(self, float_attr)))

    sgd_settings = dict(
        loss=self.loss,
        penalty=self.penalty,
        alpha=self.alpha,
        fit_intercept=self.fit_intercept,
        n_iter=self.n_iter,
        learning_rate=self.learning_rate,
        class_weight=self.class_weight,
        l1_ratio=self.l1_ratio,
        epsilon=self.epsilon,
        eta0=self.eta0,
        power_t=self.power_t,
        shuffle=True,
        random_state=self.random_state,
    )
    self.estimator = SGDClassifier(**sgd_settings)
    self.estimator.fit(X, Y)
    return self
def iterative_fit(self, X, y, n_iter=1, refit=False):
    """Fit the SGD estimator, adding ``n_iter`` to its iteration budget.

    On the first call (or when ``refit`` is true) the hyperparameters stored
    on ``self`` are coerced to their Python types and a fresh SGDClassifier
    is created. On every call the estimator's ``n_iter`` is raised by
    ``n_iter`` and ``fit`` is run on ``(X, y)``.

    Returns:
        ``self``.
    """
    def _float_or(value, fallback):
        # None means "not set": use the fallback instead of converting.
        return fallback if value is None else float(value)

    if refit:
        self.estimator = None

    if self.estimator is None:
        self.alpha = float(self.alpha)
        # Boolean flags are compared against the string 'True'.
        self.fit_intercept = self.fit_intercept == 'True'
        self.n_iter = int(self.n_iter)
        if self.class_weight == "None":
            self.class_weight = None
        self.l1_ratio = _float_or(self.l1_ratio, 0.15)
        self.epsilon = _float_or(self.epsilon, 0.1)
        self.eta0 = float(self.eta0)
        self.power_t = _float_or(self.power_t, 0.25)
        self.average = self.average == 'True'

        sgd_settings = dict(
            loss=self.loss,
            penalty=self.penalty,
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            n_iter=self.n_iter,
            learning_rate=self.learning_rate,
            class_weight=self.class_weight,
            l1_ratio=self.l1_ratio,
            epsilon=self.epsilon,
            eta0=self.eta0,
            power_t=self.power_t,
            shuffle=True,
            average=self.average,
            random_state=self.random_state,
        )
        self.estimator = SGDClassifier(**sgd_settings)

    self.estimator.n_iter += n_iter
    self.estimator.fit(X, y)
    return self
def training(processed_train_csv_file):
    """Train and save random-forest, gradient-boosting and SGD classifiers.

    The CSV is loaded, infinities and NaNs are replaced by 0, the rows are
    shuffled, and the ``booking_bool`` column is used as the label while all
    remaining columns are used as features. Each trained model is handed to
    ``data_io.save_model``.

    Args:
        processed_train_csv_file: path (or buffer) accepted by
            ``pandas.read_csv``.
    """
    samples = pd.read_csv(processed_train_csv_file)
    samples = samples.replace([np.inf, -np.inf], np.nan)
    samples = samples.fillna(value=0)

    shuffled_index = samples.index.tolist()
    random.shuffle(shuffled_index)
    # .loc is the label-based replacement for the removed .ix indexer.
    shuffled_train_samples = samples.loc[shuffled_index]

    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print("Training Random Forest Classifier")
    # RandomForestClassifier has no learning_rate parameter; passing one
    # raises TypeError, so it is not forwarded here.
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print("Training SGD Classifier")
    # The loss name is "modified_huber" ("modifier_huber" is rejected).
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
def SGD(self, train_features, test_features):
    """Cross-validate, fit and pickle one SGD classifier per target class.

    For every name in ``class_names`` a modified-Huber SGDClassifier is
    scored with 3-fold ROC-AUC cross-validation, fitted on the full training
    features, appended to ``SGD.pckl`` and used to fill the matching column
    of the submission frame, which is finally written to ``SGD.csv``.

    Relies on the names ``train``, ``test`` and ``class_names`` being
    defined outside this method.

    Args:
        train_features: feature data used for cross-validation and fitting.
        test_features: feature data passed to ``predict_proba``.
    """
    print("in SGD")
    self.train_features = train_features
    self.test_features = test_features

    scores = []
    submission = pd.DataFrame.from_dict({'id': test['Id']})
    SGD_file = 'SGD.pckl'

    # The context manager closes the pickle file even when cross-validation
    # or fitting raises part-way through the loop.
    with open(SGD_file, 'wb') as SGD_model_pkl:
        for class_name in class_names:
            train_target = train[class_name]
            classifier = SGDClassifier(loss='modified_huber', penalty='l2',
                                       alpha=0.001, random_state=42,
                                       max_iter=200, tol=0.20,
                                       learning_rate='optimal')

            cv_score = np.mean(
                cross_val_score(classifier, train_features, train_target,
                                cv=3, scoring='roc_auc'))
            scores.append(cv_score)
            print('CV score for class {} is {}'.format(class_name, cv_score))

            classifier.fit(train_features, train_target)
            pickle.dump(classifier, SGD_model_pkl)
            # Column 1 of predict_proba is used as the submitted score.
            submission[class_name] = classifier.predict_proba(
                test_features)[:, 1]

    print('Total CV score is {}'.format(np.mean(scores)))
    submission.to_csv('SGD.csv', index=False)
def fit(self, data, args):
    """Fit a logistic-loss SGDClassifier on the training split of ``data``.

    The model is stored in ``self.model``. ``args`` is accepted but not used
    by this method.

    Returns:
        The ``interval`` attribute of the ``Timer`` that wrapped the fit.
    """
    self.model = SGDClassifier(loss="log")
    with Timer() as stopwatch:
        self.model.fit(data.X_train, data.y_train)
    return stopwatch.interval
def ada_sd(self, n_iter, eta, random_state):
    """Fit an SGDClassifier on the standardised ``uri`` feature and score it.

    Reads ``self.X_train_std_uri``, ``self.Y_train_category``,
    ``self.X_test_std_uri`` and ``self.Y_test_category``, which must already
    be set on the instance. The classifier is stored in ``self.ada`` and the
    predictions in ``self.y_pred``.

    Args:
        n_iter: number of passes forwarded to ``SGDClassifier``.
        eta: initial learning rate, forwarded as ``eta0``.
        random_state: seed forwarded to ``SGDClassifier``.

    Returns:
        Tuple ``(misclassified_count, accuracy, predictions)``.
    """
    # SGDClassifier's keyword is ``eta0``; ``eta`` is not accepted.
    self.ada = SGDClassifier(n_iter=n_iter, eta0=eta,
                             random_state=random_state)

    # The single feature is reshaped into a column before fit/predict.
    train_column = self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1)
    test_column = self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1)

    self.ada.fit(train_column, self.Y_train_category)
    self.y_pred = self.ada.predict(test_column)

    misclassified = (self.Y_test_category != self.y_pred).sum()
    accuracy = accuracy_score(self.Y_test_category, self.y_pred)
    return misclassified, accuracy, self.y_pred
def test_label_binarizer_iris():
    """Compare per-class binary SGD classifiers with one multiclass SGD fit.

    One SGDClassifier is trained per binarised label column; their decision
    values are mapped back to labels with ``inverse_transform`` and the
    resulting accuracy must almost equal that of a single SGDClassifier
    trained on the original iris targets.
    """
    binarizer = LabelBinarizer()
    Y = binarizer.fit_transform(iris.target)

    decision_rows = []
    for k in range(len(binarizer.classes_)):
        binary_clf = SGDClassifier().fit(iris.data, Y[:, k])
        decision_rows.append(binary_clf.decision_function(iris.data))
    # Transpose so that each row holds one sample's per-class decisions.
    Y_pred = np.array(decision_rows).T

    y_pred = binarizer.inverse_transform(Y_pred)
    accuracy = np.mean(iris.target == y_pred)

    multiclass_clf = SGDClassifier().fit(iris.data, iris.target)
    y_pred2 = multiclass_clf.predict(iris.data)
    accuracy2 = np.mean(iris.target == y_pred2)

    assert_almost_equal(accuracy, accuracy2)
class MachineLearning:
    """Train simple linear classifiers on two encoded columns of a frame."""

    def __init__(self, Master_DF):
        """Keep a reference to the data frame the models are built from."""
        self.Data_Frame = Master_DF

    def Encoder(self, df):
        """Fit a ``LabelEncoder`` on ``df`` and return the encoded values."""
        encoder = LabelEncoder()
        print("Fitting")
        encoder.fit(df)
        return encoder.transform(df)

    def Perceptorn_PreProcessing(self, x, y):
        """Encode columns ``x`` and ``y``, split 70/30 and standardise X.

        Args:
            x: name of the feature column in ``self.Data_Frame``.
            y: name of the target column in ``self.Data_Frame``.

        Returns:
            Tuple ``(X_train_std, Y_train, X_test_std, Y_test)``.
        """
        X = self.Encoder(self.Data_Frame[x].factorize()[0])
        Y = self.Encoder(self.Data_Frame[y].factorize()[0])
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=0)
        # The scaler is fitted on the training part only and then applied
        # to both parts.
        sc = StandardScaler()
        sc.fit(X_train)
        X_train_std = sc.transform(X_train)
        X_test_std = sc.transform(X_test)
        return X_train_std, Y_train, X_test_std, Y_test

    def ppn_model(self, n_iter, eta0, random_state):
        """Prepare the ``uri``/``category`` data and evaluate a Perceptron.

        Stores the prepared splits on the instance; ``lr_model`` and
        ``ada_sd`` read them from there.

        Returns:
            Tuple ``(misclassified_count, accuracy, predictions)``.
        """
        (self.X_train_std_uri, self.Y_train_category,
         self.X_test_std_uri, self.Y_test_category) = \
            self.Perceptorn_PreProcessing('uri', 'category')
        ppn = Perceptron(n_iter=n_iter, eta0=eta0, random_state=random_state)
        ppn.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
                self.Y_train_category)
        y_pred = ppn.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return ((self.Y_test_category != y_pred).sum(),
                accuracy_score(self.Y_test_category, y_pred),
                y_pred)

    def lr_model(self, c, random_state):
        """Evaluate a LogisticRegression on the splits stored on ``self``.

        Returns:
            Tuple ``(misclassified_count, accuracy, predictions)``.
        """
        lr = LogisticRegression(C=c, random_state=random_state)
        lr.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
               self.Y_train_category)
        y_pred = lr.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return ((self.Y_test_category != y_pred).sum(),
                accuracy_score(self.Y_test_category, y_pred),
                y_pred)

    def ada_sd(self, n_iter, eta, random_state):
        """Evaluate an SGDClassifier on the splits stored on ``self``.

        The classifier is kept in ``self.ada`` and its predictions in
        ``self.y_pred``.

        Args:
            n_iter: number of passes forwarded to ``SGDClassifier``.
            eta: initial learning rate, forwarded as ``eta0``.
            random_state: seed forwarded to ``SGDClassifier``.

        Returns:
            Tuple ``(misclassified_count, accuracy, predictions)``.
        """
        # SGDClassifier's keyword is ``eta0``; ``eta`` is not accepted.
        self.ada = SGDClassifier(n_iter=n_iter, eta0=eta,
                                 random_state=random_state)
        self.ada.fit(
            self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
            self.Y_train_category)
        self.y_pred = self.ada.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return ((self.Y_test_category != self.y_pred).sum(),
                accuracy_score(self.Y_test_category, self.y_pred),
                self.y_pred)
def do_training(processed_train_csv_file):
    """Train and save random-forest, gradient-boosting and SGD classifiers.

    The CSV is loaded, infinities and NaNs are replaced by 0, the rows are
    shuffled, and the ``booking_bool`` column is used as the label while all
    remaining columns are used as features. Each trained model is handed to
    ``data_io.save_model``.

    Args:
        processed_train_csv_file: path (or buffer) accepted by
            ``pandas.read_csv``.
    """
    # Read the saved, processed training samples from the given csv file.
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    # inf -> nan
    processed_train_samples = processed_train_samples.replace(
        [np.inf, -np.inf], np.nan)
    # nan -> 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # The samples were sorted earlier; shuffling them here gives better
    # results.
    random.shuffle(processed_train_samples_index_lst)

    # Organise the shuffled training samples and their targets.
    # .loc is the label-based replacement for the removed .ix indexer.
    shuffled_train_samples = processed_train_samples.loc[
        processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # 1 Random Forest Classifier
    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           n_jobs=-1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
def train_model(texts, points, num_classses, model_dir, text_encoding='utf-8'):
    """Cluster points into labels, train a text classifier and save it.

    The (lat, lon) points are clustered with k-means into ``num_classses``
    clusters whose ids serve as labels. Unigram TF-IDF features are then
    extracted from the texts, an SGD classifier is trained on them, and the
    classifier, vectorizer and cluster coordinates are dumped to
    ``model_dir``.

    Args:
        texts -- an iterable (e.g. a list) of texts, e.g.
            ['this is the first text', 'this is the second text'].
        points -- an iterable (e.g. a list) of (lat, lon) tuples of floats,
            e.g. [(1.2343, -10.239834), (5.634534, -12.47563)].
        num_classses -- the number of desired clusters/labels/classes of the
            model.
        model_dir -- the directory the model is saved in; it must not exist
            yet. If it does, an error is logged and the process exits with
            status -1.
        text_encoding -- encoding handed to the TF-IDF vectorizer.
    """
    if os.path.exists(model_dir):
        logging.error("Model directory " + model_dir + " already exists, please try another address.")
        sys.exit(-1)
    os.mkdir(model_dir)

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model.stochastic_gradient import SGDClassifier

    kmeans = KMeans(n_clusters=num_classses, random_state=0)
    points_arr = numpy.array(points)
    # Only the fitted centres and labels are needed, so fit() suffices; the
    # transformed distances were previously computed and discarded.
    kmeans.fit(points_arr)
    cluster_centers = kmeans.cluster_centers_
    sample_clusters = kmeans.labels_

    # Map every cluster id to the (lat, lon) of its centre.
    label_coordinate = {}
    for i in range(cluster_centers.shape[0]):
        lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
        label_coordinate[i] = (lat, lon)

    logging.info('extracting features from text...')
    vectorizer = TfidfVectorizer(encoding=text_encoding, stop_words='english',
                                 ngram_range=(1, 1), max_df=0.5, min_df=0,
                                 binary=True, norm='l2', use_idf=True,
                                 smooth_idf=True, sublinear_tf=True)
    X_train = vectorizer.fit_transform(texts)
    Y_train = sample_clusters
    # NOTE(review): presumably cleared to keep the pickled vectorizer small
    # -- confirm.
    vectorizer.stop_words_ = None
    logging.info('the number of samples is %d and the number of features is %d' % (X_train.shape[0], X_train.shape[1]))

    logging.info('training the classifier...')
    # logging.warn is a deprecated alias of logging.warning.
    logging.warning('Note that alpha (regularisation strength) should be tuned based on the performance on validation data.')
    clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5,
                        l1_ratio=0.9, fit_intercept=True, n_iter=5, n_jobs=2,
                        random_state=0, learning_rate="optimal")
    clf.fit(X_train, Y_train)
    # Replace the coefficient matrix with a sparse copy before dumping.
    clf.coef_ = csr_matrix(clf.coef_)

    logging.info('retrieving address of the given points using geopy (requires internet access).')
    coordinate_address = retrieve_location_from_coordinates(label_coordinate.values())

    logging.info('dumping the the vectorizer, clf (trained model), label_coordinates and coordinate_locations into pickle files in ' + model_dir)
    dump_model(clf, vectorizer, coordinate_address, label_coordinate, model_dir)
def main():
    """Train an SGD classifier on stdin data and print its coefficients.

    Each input row holds the label in column 0 followed by the raw features,
    which are mapped through ``transform`` into a 401-wide row. The output
    line is ``_KEY``, a tab, then the flattened coefficients separated by
    spaces.
    """
    raw_data = np.loadtxt(sys.stdin)
    samples = raw_data.shape[0]
    X = np.empty((samples, 401))
    Y = raw_data[:, 0]
    for row_index, row in enumerate(raw_data):
        X[row_index] = transform(row[1:])

    clf = SGDClassifier(loss=_LOSS,
                        penalty=_PENALTY,
                        fit_intercept=False,
                        shuffle=True,
                        alpha=_REGULARIZATION)
    clf.fit(X, Y)

    sys.stdout.write('%s\t' % _KEY)
    sys.stdout.write("".join("%f " % coeff for coeff in clf.coef_.flatten()))
    sys.stdout.write("\n")
def train():
    """Fit several classifiers on ``df_train`` and print hold-out metrics.

    The columns ``cust_id``, ``y`` and ``cust_group`` are dropped from the
    features, ``y`` is the target, and 20% of the rows are held out. For
    every classifier the mean squared error, log loss and ROC AUC of the
    second ``predict_proba`` column are printed.
    """
    feature_frame = df_train.drop(['cust_id', 'y', 'cust_group'], axis=1,
                                  inplace=False)
    target = df_train['y']
    X_train, X_test, y_train, y_test = train_test_split(
        feature_frame, target, test_size=0.2, random_state=42)
    print(X_train.shape, X_test.shape)

    candidates = [
        ("KNeighborsClassifier", KNeighborsClassifier()),
        ("SGDClassifier", SGDClassifier(loss='log')),
        ("LogisticRegression", LogisticRegression(C=4.0)),
        ("RandomForestClassifier", RandomForestClassifier(oob_score=True)),
        ("GradientBoostingClassifier", GradientBoostingClassifier()),
        ("AdaBoostClassifier", AdaBoostClassifier()),
        ("DecisionTreeClassifier", DecisionTreeClassifier(max_depth=3)),
    ]

    for name, clf in candidates:
        print("====" * 20)
        print("traing..." + name)
        clf.fit(X_train, y_train)
        prob = clf.predict_proba(X_test).astype(float)
        # Column 1 is used as the score for all three metrics.
        positive_prob = prob[:, 1]
        print("mean_squared_error:", mean_squared_error(y_test, positive_prob))
        print("log_loss:", log_loss(y_test.astype(int), positive_prob))
        print("roc_auc_score:", roc_auc_score(y_test, positive_prob))
def stochastic_descent(xtrain, ytrain, xtest):
    """Fit a hinge-loss SGD classifier, pickle it and predict ``xtest``.

    The fitted model is written to the file ``base_dir + "Model"``.

    Returns:
        The predictions for ``xtest``.
    """
    sgd_settings = dict(loss="hinge", penalty="l2", max_iter=10,
                        random_state=42, alpha=1e-3, tol=None)
    model = SGDClassifier(**sgd_settings)

    print("SGD Fitting")
    model.fit(xtrain, ytrain)

    # Persist the fitted model with pickle.
    model_path = base_dir + "Model"
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    print("SGD Predicting")
    return model.predict(xtest)
def fit(self, X, y=None):
    """Create a fresh ``SKLModel`` from the stored hyperparameters and fit it.

    ``y`` is forwarded to the model's ``fit`` only when it is not ``None``.

    Returns:
        ``self``.
    """
    self._sklearn_model = SKLModel(**self._hyperparams)
    fit_args = (X,) if y is None else (X, y)
    self._sklearn_model.fit(*fit_args)
    return self
def begin_test(train_x, test_x, train_y, test_y):
    """Print 10-fold accuracy and F-measure for several classifiers.

    The train and test parts are combined with ``+`` and every classifier,
    plus a hard-voting ensemble of six of them, is scored with
    ``cross_val_score`` on the combined data.
    """
    x = train_x + test_x
    y = train_y + test_y

    clf1 = LinearRegression()  # created, but not scored below
    clf2 = LogisticRegression()
    clf3 = SGDClassifier()
    clf4 = SVC()
    clf5 = KNeighborsClassifier()
    clf6 = MLPClassifier()
    clf7 = DecisionTreeClassifier()
    clf8 = MultinomialNB()

    # MultinomialNB is scored on its own but is not part of the ensemble.
    ensemble_members = [('logr', clf2), ('sgd', clf3), ('svm', clf4),
                        ('kn', clf5), ('nn', clf6), ('dt', clf7)]
    eclf = VotingClassifier(estimators=ensemble_members, voting='hard')

    labelled_models = [
        ('LogisticRegressionClassifier', clf2),
        ('SGDClassifierClassifier', clf3),
        ('SVCClassifier', clf4),
        ('NearestNeighbourClassifier', clf5),
        ('NeuralNetworkClassifier', clf6),
        ('DecisionTreeClassifier', clf7),
        ('MultinomialNB', clf8),
        ('EnsembleClassifier', eclf),
    ]
    for label, clf in labelled_models:
        scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
        f_measure = cross_val_score(clf, x, y, cv=10, scoring='f1')
        print(label, "Accuracy: ", scores.mean(), "+/- ", scores.std())
        print(label, "F-measure: ", f_measure.mean())
def initialize_algorithm(self, hyperparameters):
    """Build an SGDClassifier from a hyperparameter mapping.

    The mapping is first passed through ``self.initialize_hyperparameters``;
    its values are then coerced to float/int/bool as required.

    Returns:
        Tuple ``(self.get_full_name(), classifier)``.
    """
    from sklearn.linear_model.stochastic_gradient import SGDClassifier

    params = self.initialize_hyperparameters(hyperparameters)
    # NOTE(review): bool() maps every non-empty string, including "False",
    # to True -- confirm the flags never arrive as strings.
    sgd_settings = dict(
        loss=params["loss"],
        penalty=params["penalty"],
        alpha=float(params["alpha"]),
        fit_intercept=bool(params["fit_intercept"]),
        n_iter=int(params["n_iter"]),
        learning_rate=params["learning_rate"],
        l1_ratio=float(params["l1_ratio"]),
        epsilon=float(params["epsilon"]),
        eta0=float(params["eta0"]),
        power_t=float(params["power_t"]),
        shuffle=True,
        average=bool(params["average"]),
        random_state=None,
    )
    sgd = SGDClassifier(**sgd_settings)
    return (self.get_full_name(), sgd)
def iterative_fit(self, X, y, n_iter=1, refit=False, sample_weight=None):
    """Run ``partial_fit`` on the SGD estimator and track the iterations.

    On the first call (or when ``refit`` is true) the hyperparameters stored
    on ``self`` are coerced to their Python types, the iteration counter is
    reset and a fresh SGDClassifier is created. Every call sets the
    estimator's ``n_iter`` to ``n_iter`` and calls ``partial_fit``.

    ``self.fully_fit_`` is set once the counter, as it stood before this
    call, has reached ``self.n_iter``.

    Returns:
        ``self``.
    """
    from sklearn.linear_model.stochastic_gradient import SGDClassifier

    def _float_or(value, fallback):
        # None means "not set": use the fallback instead of converting.
        return fallback if value is None else float(value)

    if refit:
        self.estimator = None

    if self.estimator is None:
        self._iterations = 0

        self.alpha = float(self.alpha)
        # Boolean flags are compared against the string 'True'.
        self.fit_intercept = self.fit_intercept == 'True'
        self.n_iter = int(self.n_iter)
        self.l1_ratio = _float_or(self.l1_ratio, 0.15)
        self.epsilon = _float_or(self.epsilon, 0.1)
        self.eta0 = float(self.eta0)
        self.power_t = _float_or(self.power_t, 0.25)
        self.average = self.average == 'True'

        sgd_settings = dict(
            loss=self.loss,
            penalty=self.penalty,
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            n_iter=1,
            learning_rate=self.learning_rate,
            l1_ratio=self.l1_ratio,
            epsilon=self.epsilon,
            eta0=self.eta0,
            power_t=self.power_t,
            shuffle=True,
            average=self.average,
            random_state=self.random_state,
        )
        self.estimator = SGDClassifier(**sgd_settings)

    self.estimator.n_iter = n_iter
    self.estimator.partial_fit(X, y, classes=np.unique(y),
                               sample_weight=sample_weight)

    # The check uses the counter value from before this call's increment.
    if self._iterations >= self.n_iter:
        self.fully_fit_ = True
    self._iterations += n_iter
    return self
def CPLELearningWrapper(X_train, y_train, X_test):
    """Train a CPLE semi-supervised model and predict the test samples.

    The test samples are appended to the training data as unlabeled points
    (label -1) before fitting, with an L1-penalised logistic-loss
    SGDClassifier as the base model.

    Args:
        X_train: labeled feature rows.
        y_train: labels for ``X_train``.
        X_test: unlabeled feature rows to predict.

    Returns:
        The predictions of the fitted model for ``X_test``.
    """
    from frameworks.CPLELearning import CPLELearningModel
    # Import from the public package: the private
    # ``sklearn.linear_model.stochastic_gradient`` module path is not
    # available in later scikit-learn releases.
    from sklearn.linear_model import SGDClassifier

    base_model = SGDClassifier(loss='log', penalty='l1')
    ssmodel = CPLELearningModel(base_model)

    # -1 marks the test rows as unlabeled.
    unlabeled = -np.ones(len(X_test))
    newlabels = np.concatenate((np.array(y_train), unlabeled))
    ssmodel.fit(np.concatenate((X_train, X_test)), newlabels)
    return ssmodel.predict(X_test)
def demo(instances=2000):
    """ _test_comparison_prequential

    Runs a prequential evaluation with two learners (an SGDClassifier and a
    KNNAdwin pipeline), which makes it a comparison task.

    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.

    """
    # Stream setup
    stream = FileStream("../datasets/covtype.csv", -1, 1)
    stream.prepare_for_use()

    # First learner: a plain SGD classifier.
    clf = SGDClassifier()

    # Second learner: KNNAdwin behind a transform that is given two groups
    # of column indices, 10-13 and 14-53.
    clf_one = KNNAdwin(k=8, max_window_size=1000, leaf_size=30)
    column_groups = [[10, 11, 12, 13], list(range(14, 54))]
    t_one = OneHotToCategorical(column_groups)
    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNN', clf_one)])

    classifier = [clf, pipe_one]

    # Setup the evaluator
    evaluator_settings = dict(pretrain_size=2000,
                              output_file='teste.csv',
                              max_samples=instances,
                              batch_size=1,
                              n_wait=200,
                              max_time=1000,
                              show_plot=True,
                              metrics=['performance', 'kappa_t'])
    evaluator = EvaluatePrequential(**evaluator_settings)

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
class SGDClassifierImpl():
    """Thin wrapper that forwards to an ``SKLModel`` instance."""

    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1, n_jobs=None,
                 random_state=None, learning_rate='optimal', eta0=0.0,
                 power_t=0.5, early_stopping=False, validation_fraction=0.1,
                 n_iter_no_change=5, class_weight='balanced',
                 warm_start=False, average=False):
        """Record the hyperparameters and create the wrapped model."""
        self._hyperparams = dict(
            loss=loss,
            penalty=penalty,
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            epsilon=epsilon,
            n_jobs=n_jobs,
            random_state=random_state,
            learning_rate=learning_rate,
            eta0=eta0,
            power_t=power_t,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            class_weight=class_weight,
            warm_start=warm_start,
            average=average,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when given."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` output for ``X``."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Return the wrapped model's ``decision_function`` output."""
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes = None):
        """Run ``partial_fit`` on the wrapped model, creating it if absent."""
        try:
            model = self._wrapped_model
        except AttributeError:
            model = self._wrapped_model = SKLModel(**self._hyperparams)
        model.partial_fit(X, y, classes = classes)
        return self
def tune_params(X, y):
    """Grid-search ``n_iter`` of an SGDClassifier and print the outcome.

    Candidates are 10, 20, ..., 90, scored by accuracy with 5-fold
    cross-validation. The CV results, best parameters and best score are
    printed.
    """
    n_iter_grid = {'n_iter': range(10, 100, 10)}
    base_estimator = SGDClassifier(random_state=10, n_iter=50)
    search = GridSearchCV(estimator=base_estimator,
                          param_grid=n_iter_grid,
                          scoring='accuracy',
                          cv=5)
    search.fit(X, y)
    print(search.cv_results_, search.best_params_, search.best_score_)
def get_models():
    """Return a dict of the unfitted models, keyed ``'NN'`` and ``'sgdc'``."""
    neural_net = MLPClassifier(solver='lbfgs', alpha=1e-5,
                               hidden_layer_sizes=(40, 10), random_state=1)
    sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5,
                        random_state=42)
    return {'NN': neural_net, 'sgdc': sgd}
def __init__(self, positive_class):
    """Set up the classifier, the positive class and the stats record.

    Args:
        positive_class: the class learned against all other documents.
    """
    # An online classifier, i.e. one supporting `partial_fit()`.
    self.classifier = SGDClassifier(loss = 'log')

    # The task is a binary classification: the positive class versus all
    # other documents.
    self.positive_class = positive_class

    # Structure tracking the accuracy history; 't0' is the creation time.
    self.stats = dict(
        n_train=0,
        n_train_pos=0,
        accuracy=0.0,
        accuracy_history=[(0, 0)],
        t0=time.time(),
        runtime_history=[(0, 0)],
    )
def online_model():
    """Build an ``OnlinePipeline`` of a hashing vectorizer and an SGD model.

    Returns:
        The pipeline with steps ``'vectorizer'`` and ``'classifier'``.
    """
    vectorizer_settings = dict(preprocessor=preprocessor,
                               tokenizer=lemmatizer,
                               alternate_sign=False,
                               ngram_range=(1, 2),
                               stop_words=STOP_WORD_lemma)
    steps = [
        ('vectorizer', HashingVectorizer(**vectorizer_settings)),
        ('classifier', SGDClassifier(loss='log', max_iter=5)),
    ]
    return OnlinePipeline(steps)
def train(self, dataset):
    """Train the model with a dataset

    Features of every line whose grade is a multiple of 5 are appended to
    the existing training set, which is then saved and used to train a new
    SGDClassifier; the trained classifier is saved as well.

    Args:
        dataset (list): List of training files
    """
    config = self.config
    algo = self.model["algo"]

    # Start from the training set the model already holds.
    training_set = algo.training_set

    # Append the new data to it.
    for text in dataset:
        self.logger.debug("Processing " + text.filename + "...")
        unigrams_path = join(
            config["root"],
            config["dirs"]["models_root"],
            config["dirs"]["models"]["inline"],
            config["models"]["inline"]["unigrams"],
        )
        unigrams = Unigrams(unigrams_path)

        for paragraph in text.text:
            for line in paragraph:
                # Unclassified lines are useless for the training, so only
                # grades that are a multiple of 5 are kept.
                if line.grade % 5 == 0:
                    extractor = MachineLearningFeatures()
                    features = extractor.extract_features(
                        line, unigrams.ngrams, text.stats)
                    result = int(line.grade / 5)
                    training_set["features"].append(features)
                    training_set["results"].append(result)

    self.logger.debug("Saving training set...")
    training_set_path = join(config["dirs"]["models_root"],
                             config["dirs"]["models"]["learning"],
                             config["models"]["learning"]["training_set"])
    save(training_set, training_set_path)

    self.logger.debug("Training model...")
    ml_classifier = SGDClassifier(loss="log", class_weight="auto")
    self.model["algo"].set_classifier(ml_classifier)
    self.model["algo"].set_training_set(training_set["features"],
                                        training_set["results"])
    self.model["algo"].train()

    classifier_path = join(config["dirs"]["models_root"],
                           config["dirs"]["models"]["learning"],
                           config["models"]["learning"]["classifier"])
    save(self.model["algo"].classifier, classifier_path)
def demo():
    """Run ``demo_parameterized`` three times with the same four learners."""
    # The classifiers under comparison.
    learners = [HoeffdingTree(), SAMKNN(), LeverageBagging(), SGDClassifier()]

    runs = (
        (),                         # Demo 1 -- plot should not fail
        ("sea_stream.csv", False),  # Demo 2 -- csv output should look nice
        # Demo 3 -- should not give "'NoneType' object is not iterable" error
        ("covtype.csv", False),
    )
    for extra_args in runs:
        demo_parameterized(learners, *extra_args)
def withoutPipeline():
    """Standardise the data, fit an SGDClassifier and print its metrics.

    Reads ``x_train``, ``x_test``, ``y_train``, ``y_test`` and ``DEBUG``
    from the enclosing module. Prints the accuracy on the training and test
    sets, the classification report and the confusion matrix.
    """
    # For each x, (x - mean(all x)) / std. dev. of x; fitting computes the
    # mean and std. dev. from the training data.
    scaler = StandardScaler().fit(x_train)
    # The scaled data gets its own names: assigning to x_train / x_test
    # inside the function would make them local and the read above would
    # raise UnboundLocalError.
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    clfer = SGDClassifier()
    # This tries to separate the three classes based on the two features it
    # is given. Hence three lines come back, i.e. three sets of
    # coefficients and three intercepts.
    clfer.fit(x_train_scaled, y_train)

    if DEBUG:
        # print(clfer.coef_)
        # print(clfer.intercept_)
        # print(clfer.predict(scaler.transform([[4.7, 3.1]])))
        # print(clfer.decision_function(scaler.transform([[4.7, 3.1]])))
        # The algorithm evaluates the distance from all three lines and
        # picks the largest one (in this case [0]).
        pass

    # Validate results.
    y_predict_train = clfer.predict(x_train_scaled)
    print("% Correct results on training set:")
    print(metrics.accuracy_score(y_train, y_predict_train))

    y_predict_test = clfer.predict(x_test_scaled)
    print("\n% Correct results on testing set:")
    print(metrics.accuracy_score(y_test, y_predict_test))

    # Understanding the classification report:
    # Precision: TP/(TP + FP) - ideal 1 - all instances reported as x were
    #   x. In other words, no instances reported as x were NOT x.
    # Recall: TP/(TP + FN) - ideal 1 - all instances OF x were reported as x.
    # Accuracy does not appear in the report, but for reference:
    # Accuracy: (TP + TN) / (TP + TN + FP + FN)
    print("\nClassification Report:")
    print(metrics.classification_report(y_test, y_predict_test))

    # Understanding the confusion matrix: how many of class i were
    # predicted as j. Ideal: an identity matrix.
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_predict_test))
def __init__(self, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
             fit_intercept=True, max_iter=None, tol=None, shuffle=True,
             verbose=0, epsilon=0.1, n_jobs=None, random_state=None,
             learning_rate='optimal', eta0=0.0, power_t=0.5,
             early_stopping=False, validation_fraction=0.1,
             n_iter_no_change=5, class_weight='balanced', warm_start=False,
             average=False, n_iter=None):
    """Record every hyperparameter and create the wrapped ``SKLModel``.

    All arguments are stored unchanged in ``self._hyperparams`` and passed
    to ``SKLModel`` as keyword arguments.
    """
    self._hyperparams = dict(
        loss=loss,
        penalty=penalty,
        alpha=alpha,
        l1_ratio=l1_ratio,
        fit_intercept=fit_intercept,
        max_iter=max_iter,
        tol=tol,
        shuffle=shuffle,
        verbose=verbose,
        epsilon=epsilon,
        n_jobs=n_jobs,
        random_state=random_state,
        learning_rate=learning_rate,
        eta0=eta0,
        power_t=power_t,
        early_stopping=early_stopping,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        class_weight=class_weight,
        warm_start=warm_start,
        average=average,
        n_iter=n_iter,
    )
    self._wrapped_model = SKLModel(**self._hyperparams)
def demo(output_file=None, instances=40000):
    """ _test_holdout

    Runs a holdout evaluation task with one learner: an SGDClassifier
    wrapped in a Pipeline, fed by a WaveformGenerator stream.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the stream
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier inside a single-step pipeline
    pipe = Pipeline([('Classifier', SGDClassifier())])

    # Setup the evaluator
    holdout_settings = dict(pretrain_size=10000,
                            test_size=2000,
                            dynamic_test_set=True,
                            max_instances=instances,
                            batch_size=1,
                            n_wait=15000,
                            max_time=1000,
                            output_file=output_file,
                            task_type='classification',
                            show_plot=True,
                            plot_options=['kappa', 'kappa_t', 'performance'])
    evaluator = EvaluateHoldout(**holdout_settings)

    # Evaluate
    evaluator.eval(stream=stream, classifier=pipe)
def demo(output_file=None, instances=40000):
    """ _test_comparison_holdout

    Runs a holdout evaluation with two learners (an SGDClassifier and a
    KNNAdwin), which makes it a comparison task.

    Parameters
    ----------
    output_file: string, optional
        If passed this parameter indicates the output file name. If left
        blank, no output file will be generated.

    instances: int (Default: 40000)
        The evaluation's maximum number of instances.

    """
    # Setup the stream
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the learners under comparison
    classifier = [SGDClassifier(), KNNAdwin(k=8, max_window_size=2000)]

    # Setup the evaluator
    holdout_settings = dict(pretrain_size=2000,
                            test_size=2000,
                            dynamic_test_set=True,
                            max_instances=instances,
                            batch_size=1,
                            n_wait=5000,
                            max_time=1000,
                            output_file=output_file,
                            task_type='classification',
                            show_plot=True,
                            plot_options=['kappa'])
    evaluator = EvaluateHoldout(**holdout_settings)

    # Evaluate
    evaluator.eval(stream=stream, classifier=classifier)
def demo(output_file=None, instances=40000):
    """ _test_prequential_mol

    Shows the evaluation process of a MOL classifier, initialized with
    sklearn's SGDClassifier.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the stream
    stream = MultilabelGenerator(n_samples=instances)
    stream.prepare_for_use()

    # Setup the classifier inside a single-step pipeline
    base_learner = SGDClassifier(n_iter=100)
    pipe = Pipeline([('Classifier', MultiOutputLearner(base_learner))])

    # Setup the evaluator
    prequential_settings = dict(
        pretrain_size=5000,
        max_instances=instances - 10000,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        task_type='multi_output',
        show_plot=True,
        plot_options=['hamming_score', 'j_index', 'exact_match'])
    evaluator = EvaluatePrequential(**prequential_settings)

    # Evaluate
    evaluator.eval(stream=stream, classifier=pipe)
def begin_test(train_x, test_x, train_y, test_y):
    """Print the test accuracy of decision trees for fixed settings.

    For each ``[min_samples_split, max_depth]`` pair a DecisionTreeClassifier
    is fitted on the training data and the fraction of correct predictions
    on the test data is printed.
    """
    x = train_x + test_x
    y = train_y + test_y

    # These models are created but not used by the loop below.
    clf1 = LinearRegression()
    clf2 = LogisticRegression()
    clf3 = SGDClassifier()
    clf4 = SVC()
    clf5 = KNeighborsClassifier()
    clf6 = MLPClassifier()
    clf7 = DecisionTreeClassifier()
    clf8 = MultinomialNB()
    ensemble_members = [('logr', clf2), ('sgd', clf3), ('svm', clf4),
                        ('kn', clf5), ('nn', clf6), ('dt', clf7)]
    eclf = VotingClassifier(estimators=ensemble_members, voting='hard')

    # Pairs of [min_samples_split, max_depth].
    arr = [[16, 54], [18, 45], [23, 54], [33, 54], [37, 45]]
    for k, j in arr:
        clf = DecisionTreeClassifier(splitter='best', max_depth=j,
                                     min_samples_split=k)
        clf.fit(train_x, train_y)
        y_pred = clf.predict(test_x)

        correct_count = 0
        for position, predicted in enumerate(y_pred):
            if predicted == test_y[position]:
                correct_count += 1

        # NOTE(review): `q` is not defined anywhere in this function --
        # confirm that it exists at module level, otherwise this line
        # raises NameError.
        report_fields = [str(q), str(k), str(j),
                         str(correct_count / len(y_pred))]
        print(" ".join(report_fields))
def demo(output_file=None, instances=40000):
    """Run a prequential evaluation of a multi-output learner.

    The learner is a MultiOutputLearner built around sklearn's
    SGDClassifier, wrapped in a one-step Pipeline and evaluated on a
    MultilabelGenerator stream.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        Number of samples requested from the generator; the evaluator is
        capped at ``instances - 10000`` samples.

    """
    # Stream of synthetic multi-label samples.
    stream = MultilabelGenerator(n_samples=instances)

    # One SGDClassifier-based learner, wrapped in a single-step pipeline.
    base_estimator = SGDClassifier(n_iter=100)
    learner = MultiOutputLearner(base_estimator)
    pipeline = Pipeline([('Classifier', learner)])

    # Evaluator configuration, including the metrics to report.
    prequential = EvaluatePrequential(
        pretrain_size=5000,
        max_samples=instances - 10000,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['hamming_score', 'j_index', 'exact_match'],
    )

    # Run the evaluation.
    prequential.evaluate(stream=stream, model=pipeline)
tfile.extractall(self.data_path) print("done !") def iterdocs(self): """Iterate doc by doc, yield a dict.""" for root, _dirnames, filenames in os.walk(self.data_path): for filename in fnmatch.filter(filenames, '*.sgm'): path = os.path.join(root, filename) parser = ReutersParser() for doc in parser.parse(open(path)): yield doc hasher = HashingVectorizer(decode_error='ignore', n_features=2 ** 18) classifier = SGDClassifier() data_streamer = ReutersStreamReader('reuters').iterdocs() all_classes = np.array([0, 1]) positive_class = 'acq' def get_minibatch(doc_iter, size, transformer=hasher, pos_class=positive_class): """Extract a minibatch of examples, return a tuple X, y. Note: size is before excluding invalid docs with no topics assigned. """ data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
# Semi-supervised learning demo: compare a supervised baseline against
# self-learning and CPLE wrappers when only a few points are labeled.
cancer = fetch_mldata("Lung cancer (Ontario)")
# NOTE(review): features are taken from `.target` and labels from `.data`;
# presumably this data set stores them swapped -- confirm.
X = cancer.target.T
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1  # binarize the labels

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# Draw labeled_N // 2 points from each class. Floor division and list()
# keep random.sample happy on Python 3 (it needs an int count and a real
# sequence) while selecting the same points as before on Python 2.
random_labeled_points = (
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2)
    + random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
# Single-argument print(...) produces the same output on Python 2 and 3.
print("supervised log.reg. score %s" % (basemodel.score(X, ytrue),))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score %s" % (ssmodel.score(X, ytrue),))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score %s" % (ssmodel.score(X, ytrue),))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
class SGD(ParamSklearnClassificationAlgorithm):
    """Classification component wrapping sklearn's ``SGDClassifier``.

    Hyperparameters may arrive as strings (e.g. ``"True"``, ``"None"``)
    and are coerced to their proper types right before the estimator is
    built in :meth:`iterative_fit`.
    """

    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        # Created lazily by iterative_fit().
        self.estimator = None

    def fit(self, X, y):
        """Call iterative_fit until the configuration is fully fitted."""
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        """Build the estimator if needed, then fit it on ``X``, ``y``.

        Parameters
        ----------
        X, y:
            Training data and labels, passed to ``SGDClassifier.fit``.
        n_iter: int
            Added to the estimator's ``n_iter`` before fitting.
        refit: bool
            If True, discard the current estimator and build a new one.

        Returns
        -------
        self
        """
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            # Accept real booleans as well as the "True"/"False" strings of
            # the search space. A plain `== 'True'` turned a boolean True
            # into False, which happened on every refit because these
            # attributes had already been coerced to bool.
            self.fit_intercept = self.fit_intercept in (True, 'True')
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.25
            self.average = self.average in (True, 'True')

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        # NOTE(review): the estimator already starts at self.n_iter, so this
        # increment pushes it past the target on the first call -- confirm
        # that this is the intended iteration budget.
        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        """Return True once the estimator's n_iter has reached self.n_iter."""
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        """Predict labels for ``X``; raises NotImplementedError if unfitted."""
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class scores for ``X``.

        Uses the estimator's ``predict_proba`` for the "log" and
        "modified_huber" losses; for the other losses the decision function
        is passed through ``softmax``. Raises NotImplementedError if no
        estimator has been fitted.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static property dictionary of this component."""
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype' : None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build the ConfigurationSpace of hyperparameters and conditions."""
        cs = ConfigurationSpace()

        loss = cs.add_hyperparameter(CategoricalHyperparameter("loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge"))
        penalty = cs.add_hyperparameter(CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2"))
        # NOTE(review): 10e-7 equals 1e-6; confirm the intended lower bound.
        alpha = cs.add_hyperparameter(UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001))
        l1_ratio = cs.add_hyperparameter(UniformFloatHyperparameter(
            "l1_ratio", 0, 1, default=0.15))
        fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter(
            "fit_intercept", "True"))
        n_iter = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "n_iter", 5, 1000, default=20))
        epsilon = cs.add_hyperparameter(UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True))
        learning_rate = cs.add_hyperparameter(CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal"))
        eta0 = cs.add_hyperparameter(UniformFloatHyperparameter(
            "eta0", 10**-7, 0.1, default=0.01))
        power_t = cs.add_hyperparameter(UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default=0.25))
        average = cs.add_hyperparameter(CategoricalHyperparameter(
            "average", ["False", "True"], default="False"))

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "ParamSklearn StochasticGradientClassifier"
class PositiveClassClassifier(object):
    """Online one-vs-all classifier for a single positive class.

    Documents are vectorized with a shared HashingVectorizer and fed in
    mini-batches to an SGDClassifier through ``partial_fit``; accuracy and
    runtime history are recorded in ``self.stats``.
    """

    # Vectorizer and label set shared by all instances.
    hvectorizer = HashingVectorizer(tokenizer = LemmaTokenizer(),
                                    n_features = 2 ** 15,
                                    stop_words = 'english',
                                    lowercase = True,
                                    non_negative = True)

    all_classes = np.array([0, 1])

    def __init__(self, positive_class):
        # Create an online classifier i.e. supporting `partial_fit()`
        self.classifier = SGDClassifier(loss = 'log')

        # Here we propose to learn a binary classification of the positive class
        # and all other documents
        self.positive_class = positive_class

        # structure to track accuracy history
        self.stats = {'n_train': 0, 'n_train_pos': 0, 'accuracy': 0.0,
                      'accuracy_history': [(0, 0)], 't0': time.time(),
                      'runtime_history': [(0, 0)]}

    def progress(self):
        """Report progress information, return a string."""
        duration = time.time() - self.stats['t0']
        # Guard against a zero elapsed time (coarse clocks, immediate call).
        docs_per_second = self.stats['n_train'] / duration if duration > 0 else 0
        s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % self.stats
        s += "accuracy: %(accuracy).6f " % self.stats
        s += "in %.2fs (%5d docs/s)" % (duration, docs_per_second)
        return s

    def train(self):
        """Train on every mini-batch of OVA_TRAIN_FILE, scoring after each.

        After each batch the accuracy from ``score()`` is appended to
        ``accuracy_history`` (paired with the number of training docs) and
        to ``runtime_history`` (paired with the elapsed time).
        """
        minibatch_iterator = iter_minibatchs(OVA_TRAIN_FILE, self.hvectorizer,
                                             self.positive_class)

        # Main loop : iterate on mini-batchs of examples
        for x_train, y_train in minibatch_iterator:
            # update estimator with examples in the current mini-batch
            self.classifier.partial_fit(x_train, y_train,
                                        classes=self.all_classes)

            # accumulate test accuracy stats
            self.stats['n_train'] += x_train.shape[0]
            self.stats['n_train_pos'] += sum(y_train)
            self.stats['accuracy'] = self.score()
            self.stats['accuracy_history'].append(
                (self.stats['accuracy'], self.stats['n_train']))
            self.stats['runtime_history'].append(
                (self.stats['accuracy'], time.time() - self.stats['t0']))

    def score(self):
        """Return the mean accuracy over at most 20 test mini-batches.

        The mean is taken over the batches actually scored, so a test file
        with fewer than 20 batches is not under-reported. Returns 0.0 when
        the test iterator yields no batch.
        """
        TEST_BATCHES_NO = 20
        minibatch_iterator = iter_minibatchs(TEST_FILE, self.hvectorizer,
                                             self.positive_class)
        total = 0.0
        batches_scored = 0

        for x_test, y_test in minibatch_iterator:
            y_test = np.asarray(y_test)
            total += self.classifier.score(x_test, y_test)
            batches_scored += 1
            if batches_scored >= TEST_BATCHES_NO:
                break

        if batches_scored == 0:
            return 0.0
        return total / batches_scored
def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
    """Fit the wrapped SGDClassifier for ``n_iter`` more iterations.

    The first call (or a call with ``refit=True``) coerces the stored
    hyperparameters and builds a warm-started estimator; later calls raise
    ``max_iter`` (capped at 512) and continue training. ``fully_fit_`` is
    set once the cap is reached or the estimator used fewer iterations
    than requested.

    Returns
    -------
    self
    """
    from sklearn.linear_model.stochastic_gradient import SGDClassifier

    # Need to fit at least two iterations, otherwise early stopping will not
    # work because we cannot determine whether the algorithm actually
    # converged. The only way of finding this out is if the sgd spends less
    # iterations than max_iter. If max_iter == 1, it has to spend at least
    # one iteration and will always spend at least one iteration, so we
    # cannot know about convergence.

    if refit:
        self.estimator = None

    if self.estimator is not None:
        # Continue training the existing estimator.
        sgd = self.estimator
        sgd.max_iter = min(sgd.max_iter + n_iter, 512)
        sgd._validate_params()
        sgd._partial_fit(
            X, y,
            alpha=sgd.alpha,
            C=1.0,
            loss=sgd.loss,
            learning_rate=sgd.learning_rate,
            max_iter=n_iter,
            sample_weight=sample_weight,
            classes=None,
            coef_init=None,
            intercept_init=None
        )
    else:
        # First fit: coerce hyperparameters, then build the estimator.
        self.fully_fit_ = False

        self.alpha = float(self.alpha)
        self.l1_ratio = (0.15 if self.l1_ratio is None
                         else float(self.l1_ratio))
        self.epsilon = (0.1 if self.epsilon is None
                        else float(self.epsilon))
        self.eta0 = float(self.eta0)
        self.power_t = (0.5 if self.power_t is None
                        else float(self.power_t))
        self.average = check_for_bool(self.average)
        self.fit_intercept = check_for_bool(self.fit_intercept)
        self.tol = float(self.tol)

        self.estimator = SGDClassifier(
            loss=self.loss,
            penalty=self.penalty,
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            max_iter=n_iter,
            tol=self.tol,
            learning_rate=self.learning_rate,
            l1_ratio=self.l1_ratio,
            epsilon=self.epsilon,
            eta0=self.eta0,
            power_t=self.power_t,
            shuffle=True,
            average=self.average,
            random_state=self.random_state,
            warm_start=True,
        )
        self.estimator.fit(X, y, sample_weight=sample_weight)

    reached_cap = self.estimator._max_iter >= 512
    if reached_cap or n_iter > self.estimator.n_iter_:
        self.fully_fit_ = True

    return self
def SGD_c_fit(X, y):
    """Fit an SGDClassifier with fixed settings and return the fit result.

    The classifier uses log loss with an l2 penalty, alpha=1e-3, five
    iterations and shuffling.
    """
    settings = {
        'loss': 'log',
        'penalty': 'l2',
        'alpha': 1e-3,
        'n_iter': 5,
        'shuffle': True,
    }
    model = SGDClassifier(**settings)
    return model.fit(X, y)
class SGD(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    """Iteratively fitted classification component around ``SGDClassifier``."""

    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        # Hyperparameters are stored as given; they are coerced to their
        # final types when the estimator is built in iterative_fit().
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        """Fit the wrapped SGDClassifier for ``n_iter`` more iterations.

        The first call (or a call with ``refit=True``) builds a
        warm-started estimator; later calls raise ``max_iter`` (capped at
        512) and continue training. ``fully_fit_`` is set once the cap is
        reached or the estimator used fewer iterations than requested.
        Returns ``self``.
        """
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm
        # actually converged. The only way of finding this out is if the sgd
        # spends less iterations than max_iter. If max_iter == 1, it has to
        # spend at least one iteration and will always spend at least one
        # iteration, so we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is not None:
            # Continue training the existing estimator.
            sgd = self.estimator
            sgd.max_iter = min(sgd.max_iter + n_iter, 512)
            sgd._validate_params()
            sgd._partial_fit(
                X, y,
                alpha=sgd.alpha,
                C=1.0,
                loss=sgd.loss,
                learning_rate=sgd.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )
        else:
            # First fit: coerce hyperparameters, then build the estimator.
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = (0.15 if self.l1_ratio is None
                             else float(self.l1_ratio))
            self.epsilon = (0.1 if self.epsilon is None
                            else float(self.epsilon))
            self.eta0 = float(self.eta0)
            self.power_t = (0.5 if self.power_t is None
                            else float(self.power_t))
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(
                loss=self.loss,
                penalty=self.penalty,
                alpha=self.alpha,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                learning_rate=self.learning_rate,
                l1_ratio=self.l1_ratio,
                epsilon=self.epsilon,
                eta0=self.eta0,
                power_t=self.power_t,
                shuffle=True,
                average=self.average,
                random_state=self.random_state,
                warm_start=True,
            )
            self.estimator.fit(X, y, sample_weight=sample_weight)

        reached_cap = self.estimator._max_iter >= 512
        if reached_cap or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return ``fully_fit_``; False if nothing has been fitted yet."""
        if self.estimator is None:
            return False
        return getattr(self, 'fully_fit_', False)

    def predict(self, X):
        """Predict labels for ``X``; raises NotImplementedError if unfitted."""
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class scores for ``X``.

        Uses the estimator's ``predict_proba`` for the "log" and
        "modified_huber" losses; otherwise the decision function is passed
        through ``softmax``. Raises NotImplementedError if unfitted.
        """
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)

        decision_values = self.estimator.decision_function(X)
        return softmax(decision_values)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static property dictionary of this component."""
        properties = {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS,),
        }
        return properties

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build the ConfigurationSpace of hyperparameters and conditions."""
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log",
        )
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter(
            "tol", 1e-5, 1e-1, log=True, default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
        power_t = UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default_value=0.5)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")

        all_hyperparameters = [loss, penalty, alpha, l1_ratio, fit_intercept,
                               tol, epsilon, learning_rate, eta0, power_t,
                               average]
        cs.add_hyperparameters(all_hyperparameters)

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate,
                                      ["invscaling", "constant"])

        all_conditions = [elasticnet, epsilon_condition, power_t_condition,
                          eta0_in_inv_con]
        cs.add_conditions(all_conditions)

        return cs
# Plot the samples (feature columns 1 and 2, colored by label) together
# with the line given by x1 / x_2.
plt.figure(2)
plt.scatter(features[:,1], features[:,2], c = labels)
plt.plot(x1, x_2)

# sklearn implementation
from sklearn.linear_model import Perceptron
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.metrics import accuracy_score

# Fitting an sklearn Perceptron and SGDClassifier with perceptron loss function (these should be identical)
# NOTE(review): the Perceptron is left at its default iteration count while
# the SGDClassifier is given n_iter=1000 -- confirm the comparison is fair.
clf = Perceptron(random_state=None, eta0= 0.1, shuffle=False, penalty=None, class_weight=None, fit_intercept=False)
clf2 = SGDClassifier(loss="perceptron",eta0=0.1,learning_rate="constant",penalty=None,random_state=None,shuffle=False,fit_intercept=False,warm_start=False,average=False,n_iter=1000)
clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)

y_predict = clf.predict(x_test)
y_preSGD = clf2.predict(x_test)

# Single-argument print(...) gives the same output on Python 2 and is
# valid syntax on Python 3.
print("sklearn Perceptron accuracy:")
print(accuracy_score(y_test, y_predict))

print("sklearn SGDClassifier accuracy:")
print(accuracy_score(y_test, y_preSGD))

print("my perceptron accuracy:")
print(accuracy_score(y_test, y_pred))

print("\n")