class LogReg:
    """Class-balanced logistic regression over the CSV files in ``./data``.

    Constructing an instance loads the data, fits the model and stores the
    predicted class probabilities for the test rows in ``self.test_Y``.
    """

    def __init__(self):
        self.load_data()
        self.clf = LogisticRegression(class_weight='balanced')
        self.train()
        self.predict()

    def load_data(self):
        """Read both CSVs; column 0 is split off from the remaining columns.

        For the training file column 0 becomes ``train_Y``; for the test file
        it becomes ``test_ID``.
        """
        train_values = pd.read_csv('./data/train.csv', header=0).values
        test_values = pd.read_csv('./data/test.csv', header=0).values

        self.train_X = train_values[:, 1:]
        self.train_Y = train_values[:, 0]
        self.test_X = test_values[:, 1:]
        self.test_ID = test_values[:, 0]

    def train(self):
        """Fit the classifier on the training split."""
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        """Store ``predict_proba`` output for the test rows in ``self.test_Y``."""
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        """Return the classifier's ``score`` on the data it was trained on."""
        return self.clf.score(self.train_X, self.train_Y)

    def store_result(self):
        """Write the ids and the second probability column to the results CSV."""
        df_out = pd.DataFrame(
            {'Id': self.test_ID, 'Action': self.test_Y[:, 1]},
            columns=['Id', 'Action'],
        )
        df_out.to_csv('./data/results/c1_result.csv', index=False)
def main():
    """Build three feature sets for the image classes and fit a classifier.

    ``log_classify`` receives every feature set and returns the one used to
    fit the final logistic regression.
    """
    classes = ['chimp', 'corvette', 'tokyo', 'goldengatebridge']
    images, labels = get_labels(classes)

    # Second argument of get_visual_words (presumably the vocabulary size).
    k = 256
    feature_dict = {
        'Std': get_standard_features(images),
        'SURF': get_visual_words(images, k),
        'TAS': get_tas_features(images),
    }

    best_features = log_classify(feature_dict, labels)
    classifier = LogisticRegression()
    classifier.fit(best_features, labels)
def test_warm_start():
    # Refitting for a single iteration on the same data must barely move the
    # coefficients when warm starting, and move them a lot otherwise.
    # The liblinear solver does not support warm starting, so it is skipped.
    X, y = iris.data, iris.target

    solvers = ['newton-cg', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    configurations = [
        (warm_start, fit_intercept, solver, multi_class)
        for warm_start in (True, False)
        for fit_intercept in (True, False)
        for solver in solvers
        for multi_class in ('ovr', 'multinomial')
    ]

    for warm_start, fit_intercept, solver, multi_class in configurations:
        clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                                 warm_start=warm_start, solver=solver,
                                 random_state=42, max_iter=100,
                                 fit_intercept=fit_intercept)
        clf.fit(X, y)
        converged_coef = clf.coef_

        # Second fit is capped at one iteration; it is expected to warn.
        clf.max_iter = 1
        with ignore_warnings():
            clf.fit(X, y)

        cum_diff = np.abs(converged_coef - clf.coef_).sum()
        msg = ("Warm starting issue with %s solver in %s mode "
               "with fit_intercept=%s and warm_start=%s"
               % (solver, multi_class, fit_intercept, warm_start))
        if warm_start:
            assert_greater(2.0, cum_diff, msg)
        else:
            assert_greater(cum_diff, 2.0, msg)
def test_write_parameters():
    # coef_ and intercept_ must be writable in place: after zeroing both,
    # the decision function has to be zero everywhere.
    clf = LogisticRegression(random_state=0)
    clf.fit(X, Y1)
    clf.coef_.fill(0)
    clf.intercept_.fill(0)
    assert_array_almost_equal(clf.decision_function(X), 0)
def mlogistic():
    """Fit a TF-IDF + logistic-regression toy classifier and print each step.

    Prints the training TF-IDF matrix, the test TF-IDF matrix and finally the
    predicted labels for the two test sentences.
    """
    # The first three sentences are the training samples,
    # the last two are the test samples.
    X = [
        "f**k you",
        "f**k you all",
        "hello everyone",
        "f**k me",
        "hello boy",
    ]
    # y holds the labels of the three training sentences.
    y = [1, 1, 0]

    vectorizer = TfidfVectorizer()
    # Fit the TF-IDF vocabulary on the first three sentences only.
    X_train = vectorizer.fit_transform(X[:-2])
    print(X_train)
    # Transform the last two sentences with the vocabulary fitted above.
    X_test = vectorizer.transform(X[-2:])
    print(X_test)

    # Train the logistic regression model.
    classifier = LogisticRegression()
    classifier.fit(X_train, y)

    # Predict the labels of the test sentences.
    predictions = classifier.predict(X_test)
    print(predictions)
def test_consistency_path():
    """Check that the path solver agrees with individually fitted models"""
    rng = np.random.RandomState(0)
    shifted_blob = rng.randn(100, 2) + [1, 1]
    centered_blob = rng.randn(100, 2)
    X = np.concatenate((shifted_blob, centered_blob))
    y = [1] * 100 + [-1] * 100
    Cs = np.logspace(0, 4, 10)

    quiet_path = ignore_warnings(logistic_regression_path)
    methods = ('lbfgs', 'newton-cg', 'liblinear')

    # can't test with fit_intercept=True since LIBLINEAR
    # penalizes the intercept
    for method in methods:
        coefs, Cs = quiet_path(X, y, Cs=Cs, fit_intercept=False, tol=1e-16,
                               solver=method)
        for C, path_coef in zip(Cs, coefs):
            lr = LogisticRegression(C=C, fit_intercept=False, tol=1e-16)
            lr.fit(X, y)
            assert_array_almost_equal(lr.coef_.ravel(), path_coef, decimal=4)

    # test for fit_intercept=True
    for method in methods:
        Cs = [1e3]
        coefs, Cs = quiet_path(X, y, Cs=Cs, fit_intercept=True, tol=1e-4,
                               solver=method)
        lr = LogisticRegression(C=Cs[0], fit_intercept=True, tol=1e-4,
                                intercept_scaling=10000)
        lr.fit(X, y)
        # The path returns the intercept appended to the coefficients.
        lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_])
        assert_array_almost_equal(lr_coef, coefs[0], decimal=4)
def test_regularization_path(self):
    # Compare LogisticRegressionL1 with scikit-learn at the end of an
    # L1 regularization path.
    num_samples = 10
    num_feat = 5
    X, y = make_classification(n_samples=num_samples,
                               n_features=num_feat,
                               n_informative=3,
                               n_classes=2,
                               random_state=0,
                               weights=[0.5, 0.5])

    # Layout handed to LogisticRegressionL1: features, a column of ones,
    # then the target in the last column.
    matrix = np.column_stack((X, np.ones(num_samples), y))

    # Betas to test
    logitfitL1 = LogisticRegressionL1()
    lambda_grid = np.exp(-np.linspace(1, 17, 200))
    logitfitL1.fit(matrix, lambda_grid)

    # Sklearn: refit along an increasing grid of C; only the coefficients
    # of the last (largest) C are compared below.
    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
    clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)

    skbetas = np.append(clf.intercept_[0], clf.coef_)
    np.testing.assert_almost_equal(skbetas, logitfitL1.coef_, 1)
def test_consistency_path():
    # The regularization path must agree with models fitted one C at a time.
    rng = np.random.RandomState(0)
    shifted_blob = rng.randn(100, 2) + [1, 1]
    centered_blob = rng.randn(100, 2)
    X = np.concatenate((shifted_blob, centered_blob))
    y = [1] * 100 + [-1] * 100
    Cs = np.logspace(0, 4, 10)

    quiet_path = ignore_warnings(logistic_regression_path)
    solvers = ("lbfgs", "newton-cg", "liblinear", "sag")

    # can't test with fit_intercept=True since LIBLINEAR
    # penalizes the intercept
    for solver in solvers:
        coefs, Cs, _ = quiet_path(X, y, Cs=Cs, fit_intercept=False, tol=1e-5,
                                  solver=solver, random_state=0)
        for C, path_coef in zip(Cs, coefs):
            lr = LogisticRegression(C=C, fit_intercept=False, tol=1e-5,
                                    random_state=0)
            lr.fit(X, y)
            assert_array_almost_equal(lr.coef_.ravel(), path_coef, decimal=4,
                                      err_msg="with solver = %s" % solver)

    # test for fit_intercept=True
    for solver in solvers:
        Cs = [1e3]
        coefs, Cs, _ = quiet_path(X, y, Cs=Cs, fit_intercept=True, tol=1e-6,
                                  solver=solver, intercept_scaling=10000.0,
                                  random_state=0)
        lr = LogisticRegression(C=Cs[0], fit_intercept=True, tol=1e-4,
                                intercept_scaling=10000.0, random_state=0)
        lr.fit(X, y)
        # The path returns the intercept appended to the coefficients.
        lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_])
        assert_array_almost_equal(lr_coef, coefs[0], decimal=4,
                                  err_msg="with solver = %s" % solver)
def predictWithThreshold(datadir, threshold, penalty_type='l2'):
    """Train/test once per sub-directory of ``datadir`` and print mean scores.

    Each sub-directory supplies ``train.conll`` and ``all.conll``; the rows of
    ``all.conll`` beyond the length of ``train.conll`` are used as test set.
    ``datadir`` is concatenated directly with the entry name, so it must end
    with a path separator.
    """
    metric_names = ("F1", "Recall", "Accuracy", "Precision")
    maxent = LogisticRegression(penalty=penalty_type)
    scores = defaultdict(list)

    for entry in sorted(os.listdir(datadir), reverse=True):
        prefix = datadir + entry
        trainfeatures, _, _ = feats_and_classify.collect_features(
            prefix + '/train.conll')
        features, labels, _ = feats_and_classify.collect_features(
            prefix + '/all.conll')

        n_train = len(trainfeatures)
        train_idx = np.array(range(n_train))
        test_idx = np.array(range(n_train, len(features)))

        maxent.fit(features[train_idx], labels[train_idx])
        _, score = pred_for_threshold(maxent, features[test_idx],
                                      labels[test_idx], threshold)
        # pred_for_threshold's score is indexed in metric_names order.
        for position, name in enumerate(metric_names):
            scores[name].append(score[position])

    print("\n--")
    for key in sorted(scores):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)"
              % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
def main():
    """Train on one annotator's labels and write thresholded test predictions.

    Command-line options: ``--threshold`` (probability cut-off for label
    "1"), ``--annotator`` (two-digit annotator id) and ``--penalty``
    ("l1" or "l2"). Predictions are written one per line to
    ``<annotator>.pred`` in the current directory; the process then exits
    with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--annotator', type=str, default="03")
    parser.add_argument('--penalty', type=str, choices=["l1", "l2"], default="l1")
    args = parser.parse_args()

    current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_" + args.annotator + ".lbl.conll"
    testfile = scriptdir + "/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X__dict_train, y_train, v_train = feats_and_classify.collect_features(current_single_ann, vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(testfile, vectorize=False)

    # Vectorize train and test together so both share one feature space,
    # then split the matrix back at the train/test boundary.
    featdicts = list(X__dict_train + X_dict_test)
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train = X[:len(y_train)]
    X_test = X[len(y_train):]

    maxent = LogisticRegression(penalty=args.penalty)
    maxent.fit(X_train, y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    # Column 1 is compared against the threshold to decide on label "1".
    ypred_i = ["1" if pair[1] >= args.threshold else "0" for pair in y_pred_proba]

    # The context manager closes the file even if writing fails.
    with open(args.annotator + ".pred", mode="w") as fout:
        print("\n".join(ypred_i), file=fout)
    sys.exit(0)
def cvWithThreshold(X, y_current_tr, y_current_te, threshold, regularization='l2'):
    """Run 10-fold stratified CV at a fixed threshold and summarize the scores.

    Folds are stratified on ``y_current_tr``, which also supplies the training
    labels; test labels are taken from ``y_current_te``.

    Returns a dict mapping each metric name to ``(mean, std)`` over the folds.
    """
    metric_names = ("F1", "Recall", "Accuracy", "Precision")
    scores = defaultdict(list)
    maxent = LogisticRegression(penalty=regularization)

    splitter = cross_validation.StratifiedKFold(
        y_current_tr, n_folds=10, shuffle=False, random_state=None)
    for fold, (train_idx, test_idx) in enumerate(splitter, 1):
        # Carriage return keeps the fold counter on a single console line.
        print('\r' + str(fold), end="")
        maxent.fit(X[train_idx], y_current_tr[train_idx])
        _, score = pred_for_threshold(maxent, X[test_idx],
                                      y_current_te[test_idx], threshold)
        # pred_for_threshold's score is indexed in metric_names order.
        for position, name in enumerate(metric_names):
            scores[name].append(score[position])

    out_dict = {}
    print("\n--")
    for key in sorted(scores):
        currentmetric = np.array(scores[key])
        mean, std = currentmetric.mean(), currentmetric.std()
        out_dict[key] = (mean, std)
        print("%s : %0.2f (+/- %0.2f)" % (key, mean, std))
    print("--")
    return out_dict
def test_liblinear_random_state():
    # Two estimators sharing a random_state must learn the same coefficients.
    X, y = make_classification(n_samples=20)
    first, second = [LogisticRegression(random_state=0).fit(X, y)
                     for _ in range(2)]
    assert_array_almost_equal(first.coef_, second.coef_)
def test_warm_start(solver, warm_start, fit_intercept, multi_class):
    # Refitting for a single iteration on the same data must barely move the
    # coefficients when warm starting, and move them a lot otherwise.
    # The liblinear solver does not support warm starting.
    X, y = iris.data, iris.target

    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                             warm_start=warm_start, solver=solver,
                             random_state=42, max_iter=100,
                             fit_intercept=fit_intercept)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        converged_coef = clf.coef_

        # Second fit is capped at one iteration.
        clf.max_iter = 1
        clf.fit(X, y)

    cum_diff = np.abs(converged_coef - clf.coef_).sum()
    msg = ("Warm starting issue with %s solver in %s mode "
           "with fit_intercept=%s and warm_start=%s"
           % (solver, multi_class, fit_intercept, warm_start))
    if warm_start:
        assert_greater(2.0, cum_diff, msg)
    else:
        assert_greater(cum_diff, 2.0, msg)
def test_logistic_regression_solvers():
    # All solvers must reach the same coefficients on a binary problem.
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    models = {
        'ncg': LogisticRegression(solver='newton-cg', fit_intercept=False),
        'lbf': LogisticRegression(solver='lbfgs', fit_intercept=False),
        'lib': LogisticRegression(fit_intercept=False),
        'sag': LogisticRegression(solver='sag', fit_intercept=False,
                                  random_state=42),
        'saga': LogisticRegression(solver='saga', fit_intercept=False,
                                   random_state=42),
    }
    for name in ('ncg', 'lbf', 'sag', 'saga', 'lib'):
        models[name].fit(X, y)

    compared_pairs = (
        ('ncg', 'lib'), ('lib', 'lbf'), ('ncg', 'lbf'),
        ('sag', 'lib'), ('sag', 'ncg'), ('sag', 'lbf'),
        ('saga', 'sag'), ('saga', 'lbf'), ('saga', 'ncg'), ('saga', 'lib'),
    )
    for left, right in compared_pairs:
        assert_array_almost_equal(models[left].coef_, models[right].coef_,
                                  decimal=3)
def test_logistic_regression_solvers_multiclass():
    # All solvers must reach the same coefficients on a 3-class problem.
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-7

    models = {
        'ncg': LogisticRegression(solver='newton-cg', fit_intercept=False,
                                  tol=tol),
        'lbf': LogisticRegression(solver='lbfgs', fit_intercept=False,
                                  tol=tol),
        'lib': LogisticRegression(fit_intercept=False, tol=tol),
        'sag': LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                                  max_iter=1000, random_state=42),
        'saga': LogisticRegression(solver='saga', fit_intercept=False,
                                   tol=tol, max_iter=10000, random_state=42),
    }
    for name in ('ncg', 'lbf', 'sag', 'saga', 'lib'):
        models[name].fit(X, y)

    compared_pairs = (
        ('ncg', 'lib'), ('lib', 'lbf'), ('ncg', 'lbf'),
        ('sag', 'lib'), ('sag', 'ncg'), ('sag', 'lbf'),
        ('saga', 'sag'), ('saga', 'lbf'), ('saga', 'ncg'), ('saga', 'lib'),
    )
    for left, right in compared_pairs:
        assert_array_almost_equal(models[left].coef_, models[right].coef_,
                                  decimal=4)
def test_logreg_l1():
    # liblinear penalizes the intercept while saga does not, so neither model
    # fits one; that keeps their coefficients comparable at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    X_noise = rng.normal(size=(n_samples, 3))
    X_constant = np.ones(shape=(n_samples, 2))
    X = np.hstack((X, X_noise, X_constant))

    shared = dict(penalty="l1", C=1.0, fit_intercept=False, tol=1e-10)
    lr_liblinear = LogisticRegression(solver='liblinear', **shared)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(solver='saga', max_iter=1000, **shared)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)

    # The five appended noise/constant columns must be shrunk to zero by
    # the l1 penalty.
    no_weight = np.zeros(5)
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], no_weight)
    assert_array_almost_equal(lr_saga.coef_[0, -5:], no_weight)
def test_logreg_cv_penalty():
    # The CV estimator must use the requested penalty for its final fit:
    # with a single C it has to be as sparse as the plain estimator.
    X, y = make_classification(n_samples=50, n_features=20, random_state=0)

    lr_cv = LogisticRegressionCV(penalty="l1", Cs=[1.0], solver='liblinear')
    lr_cv.fit(X, y)
    lr = LogisticRegression(penalty="l1", C=1.0, solver='liblinear')
    lr.fit(X, y)

    nonzero_cv = np.count_nonzero(lr_cv.coef_)
    nonzero_plain = np.count_nonzero(lr.coef_)
    assert_equal(nonzero_cv, nonzero_plain)
def classify_logistic(train_features, train_labels, test_features):
    """Fit a logistic regression and return its predictions for the test set.

    The fitted model is pickled under the name "logistic" when the
    module-level flags say so (``SAVE`` set and ``TEST`` unset).
    """
    clf = LogisticRegression()
    clf.fit(train_features, train_labels)

    persist_model = not TEST and SAVE
    if persist_model:
        save_pickle("logistic", clf)

    return clf.predict(test_features)
def main():
    """Cross-validate against labels sampled at random from all annotators.

    Labels of the twenty per-annotator files are pooled per instance. For
    ``--iterations`` rounds of shuffled 10-fold CV, every instance's train
    and test label is drawn at random from its pooled annotations. Mean and
    standard deviation of each metric are printed and the process exits
    with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    training_prefix = scriptdir + "/../data/cwi_training/cwi_training_"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--iterations', type=int, default=5)
    args = parser.parse_args()

    # Two-digit annotator ids "01" .. "20".
    annotator_ids = ["%02d" % number for number in range(1, 21)]
    all_labels = defaultdict(list)
    scores = defaultdict(list)

    for idx in annotator_ids:
        current_single_ann = training_prefix + idx + ".lbl.conll"
        _, labels_current, _ = feats_and_classify.collect_features(
            current_single_ann, vectorize=False, generateFeatures=False)
        for instance_index, l in enumerate(labels_current):
            all_labels[instance_index].append(l)

    # Features are generated once, from the first annotator's file.
    current_single_ann = training_prefix + "01.lbl.conll"
    feats, _, _ = feats_and_classify.collect_features(
        current_single_ann, vectorize=True, generateFeatures=True)

    n_annotators = len(annotator_ids)
    for _ in range(args.iterations):
        for TrainIndices, TestIndices in cross_validation.KFold(
                n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
            maxent = LogisticRegression(penalty='l2')

            TrainX_i = feats[TrainIndices]
            Trainy_i = [all_labels[x][random.randrange(n_annotators)] for x in TrainIndices]
            TestX_i = feats[TestIndices]
            Testy_i = [all_labels[x][random.randrange(n_annotators)] for x in TestIndices]

            maxent.fit(TrainX_i, Trainy_i)
            ypred_i = maxent.predict(TestX_i)

            # scikit-learn metrics take (y_true, y_pred); passing them the
            # other way round swaps precision and recall.
            acc = accuracy_score(Testy_i, ypred_i)
            pre = precision_score(Testy_i, ypred_i)
            rec = recall_score(Testy_i, ypred_i)
            # shared task uses f1 of *accuracy* and recall!
            # Guard the degenerate fold where both are zero.
            f1 = 2 * acc * rec / (acc + rec) if (acc + rec) else 0.0

            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)

    print("--")
    for key in sorted(scores):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    sys.exit(0)
def my_module(rt, params, inputs, outputs):
    """Fit a logistic regression on pickled data and pickle the fitted model.

    Reads ``inputs.X`` and ``inputs.Y`` (paths to pickle files), writes the
    fitted model to ``outputs.MODEL`` and prints "Done". ``rt`` and
    ``params`` are not used.
    """
    # NOTE: pickle.load can execute arbitrary code; only feed this trusted files.
    # Pickle data is binary, so the files are opened in binary mode and
    # closed deterministically.
    with open(inputs.X, 'rb') as x_file:
        X = pickle.load(x_file)
    with open(inputs.Y, 'rb') as y_file:
        Y = pickle.load(y_file)

    model = LogisticRegression()
    model.fit(X, Y)

    with open(outputs.MODEL, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Done")
def prepare(imageFolder):
    """Fit a logistic regression on the images found in ``imageFolder``.

    Only directory entries whose name is exactly 8 characters long are used.
    ``imageFolder`` is concatenated directly with the entry name, so it must
    end with a path separator. Returns the fitted model.
    """
    print("preparing images...")
    inputData = []
    outputData = []
    for entry in os.listdir(imageFolder):
        if len(entry) != 8:
            continue
        grayscale = Image.open(imageFolder + entry).convert("L")
        inputData += getInputFromPic(grayscale, entry)
        outputData += getOutputFromFileName(entry)

    print("training model...")
    model = LogisticRegression()
    model.fit(inputData, outputData)
    return model
def test_liblinear_decision_function_zero():
    # Liblinear itself predicts the positive class when the decision function
    # is exactly zero; the estimator must predict the negative class instead.
    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
    clf = LogisticRegression(fit_intercept=False).fit(X, y)

    # Without an intercept, all-zero inputs give a zero decision function.
    zero_inputs = np.zeros((5, 5))
    assert_array_equal(clf.predict(zero_inputs), np.zeros(5))
def fit_model_2(self, lol=.07, toWrite=False):
    """Fit an L1 logistic regression on every CV split and print its log loss.

    ``lol`` is passed to the model as ``C``. Each entry of ``self.cv_data``
    is unpacked as ``(X_train, X_test, Y_train, Y_test)`` and the training
    part is rebalanced with ``self.balance_data`` before fitting. When
    ``toWrite`` is true, the model as left by the last split is pickled to
    ``model2/model.pkl``.
    """
    model = LogisticRegression(C=lol, penalty='l1', tol=1e-6)

    for X_train, X_test, Y_train, Y_test in self.cv_data:
        X_train, Y_train = self.balance_data(X_train, Y_train)
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 2 Score: %f" % (logloss(Y_test, pred),))

    if toWrite:
        # Pickle output is binary; the context manager closes the file
        # even if dumping fails.
        with open('model2/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def test_logistic_regression_class_weights():
    # class_weight="balanced" must match an explicitly computed weight dict.
    # Both cases slice iris so that the classes are unbalanced:
    #   multinomial: remove 90% of class 0
    #   binary:      remove 90% of class 0 and 100% of class 2
    cases = (
        (slice(45, None), "multinomial", ("lbfgs", "newton-cg"), 4),
        (slice(45, 100), "ovr", ("lbfgs", "newton-cg", "liblinear"), 6),
    )
    for rows, multi_class, solvers, decimal in cases:
        X = iris.data[rows, :]
        y = iris.target[rows]
        class_weight_dict = _compute_class_weight_dictionary(y)

        for solver in solvers:
            clf1 = LogisticRegression(solver=solver, multi_class=multi_class,
                                      class_weight="balanced")
            clf2 = LogisticRegression(solver=solver, multi_class=multi_class,
                                      class_weight=class_weight_dict)
            clf1.fit(X, y)
            clf2.fit(X, y)
            assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=decimal)
def test_max_iter():
    # With a tolerance too tight to satisfy, every solver must stop exactly
    # at max_iter iterations.
    X = iris.data
    y_bin = iris.target.copy()
    y_bin[y_bin == 2] = 0

    solvers = ["newton-cg", "liblinear", "sag"]
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append("lbfgs")

    runs = [(max_iter, solver)
            for max_iter in range(1, 5)
            for solver in solvers]
    for max_iter, solver in runs:
        lr = LogisticRegression(max_iter=max_iter, tol=1e-15,
                                random_state=0, solver=solver)
        lr.fit(X, y_bin)
        assert_equal(lr.n_iter_[0], max_iter)
def test_multinomial_binary_probabilities():
    # On a binary problem, multinomial LR probabilities must equal a
    # two-class softmax over (-decision, +decision).
    X, y = make_classification()
    clf = LogisticRegression(multi_class='multinomial', solver='saga')
    clf.fit(X, y)

    decision = clf.decision_function(X)
    proba = clf.predict_proba(X)

    positive = np.exp(decision)
    negative = np.exp(-decision)
    expected_proba_class_1 = positive / (positive + negative)
    expected_proba = np.column_stack((1 - expected_proba_class_1,
                                      expected_proba_class_1))

    assert_almost_equal(proba, expected_proba)
def clazzify(train_mat, test_mat, true_train_labels):
    """Fit an L1 logistic regression and predict labels for ``test_mat``.

    Returns a ``(predicted_test_labels, model)`` tuple. Progress is reported
    through ``logging.info``.
    """
    model = LogisticRegression(random_state=17, penalty='l1')

    # learn
    logging.info('learning...')
    model.fit(train_mat, true_train_labels)
    logging.info('finished learning.')

    # test
    logging.info('testing')
    predicted_test_labels = model.predict(test_mat)
    logging.info('finished testing')

    return predicted_test_labels, model
def test_scikit_learn_exploded_data(self):
    # Compare the fitted betas with scikit-learn on the exploded matrix.
    betas = [0.001, 0.07, 0.4]
    new_matrix = explode_matrix(create_random_observations(200, 2, betas))

    # All but the last two columns are features; the last column is the
    # target.
    features = new_matrix[:, :-2]
    target = new_matrix[:, -1]

    lib = LogisticRegression(fit_intercept=True)
    lib.fit(features, target)
    self.logitfitL1.fit(new_matrix, self.lambda_grid)

    # Intercept first, then the coefficients.
    skbetas = np.append(lib.intercept_[0], lib.coef_)
    np.testing.assert_almost_equal(skbetas, self.logitfitL1.coef_, 2)
def generate_submission():
    """Train on all training data and write the Kaggle submission CSV.

    Rebinds the module-level names ``alg``, ``predictions`` and
    ``submission``; reads the module-level ``train``, ``test`` and
    ``predictors``.
    """
    global alg, predictions, submission
    output_path = "kaggle.csv"

    # Initialize the algorithm class
    alg = LogisticRegression(random_state=1)
    # Train the algorithm using all the training data
    alg.fit(train[predictors], train["Survived"])
    # Make predictions using the test set.
    predictions = alg.predict(test[predictors])

    # Create a new dataframe with only the columns Kaggle wants from the dataset.
    submission = pandas.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv(output_path, index=False)
    # Report the file actually written (the old message misspelled its name).
    print("%s is generated" % output_path)
def test_multinomial_binary():
    """Check multinomial LR on a two-class (setosa / not-setosa) problem."""
    class_names = np.array(["setosa", "not-setosa"])
    target = class_names[(iris.target > 0).astype(np.intp)]
    n_features = iris.data.shape[1]

    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(iris.data, target)

    assert_equal(clf.coef_.shape, (1, n_features))
    assert_equal(clf.intercept_.shape, (1,))
    assert_array_equal(clf.predict(iris.data), target)

    mlr = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                             fit_intercept=False)
    mlr.fit(iris.data, target)
    # NOTE(review): the prediction below is taken from ``clf``; ``mlr`` is
    # fitted but never checked - confirm whether ``mlr`` was intended.
    best_class = np.argmax(clf.predict_log_proba(iris.data), axis=1)
    pred = clf.classes_[best_class]
    assert_greater(np.mean(pred == target), .9)
def test_multinomial_logistic_regression_string_inputs():
    # String labels must behave like numeric ones for LogisticRegression(CV).
    n_samples, n_features, n_classes = 50, 5, 3
    names = ['bar', 'baz', 'foo']
    X_ref, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_classes=n_classes, n_informative=3)
    y_str = LabelEncoder().fit(names).inverse_transform(y)
    # For numerical labels, let y values be taken from set (-1, 0, 1)
    y = np.array(y) - 1

    settings = dict(solver='lbfgs', multi_class='multinomial')
    lr = LogisticRegression(**settings)
    lr_cv = LogisticRegressionCV(**settings)
    lr_str = LogisticRegression(**settings)
    lr_cv_str = LogisticRegressionCV(**settings)

    lr.fit(X_ref, y)
    lr_cv.fit(X_ref, y)
    lr_str.fit(X_ref, y_str)
    lr_cv_str.fit(X_ref, y_str)

    assert_array_almost_equal(lr.coef_, lr_str.coef_)
    assert_equal(sorted(lr_str.classes_), names)
    assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_)
    assert_equal(sorted(lr_str.classes_), names)
    assert_equal(sorted(lr_cv_str.classes_), names)

    # The predictions should be in original labels
    for fitted in (lr_str, lr_cv_str):
        assert_equal(sorted(np.unique(fitted.predict(X_ref))), names)

    # Class weights keyed by string label must be accepted; a zero weight
    # on 'foo' means it is never predicted.
    lr_cv_str = LogisticRegression(
        solver='lbfgs',
        class_weight={'bar': 1, 'baz': 2, 'foo': 0},
        multi_class='multinomial').fit(X_ref, y_str)
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
def test_logreg_predict_proba_multinomial():
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Probabilities from the multinomial (true-entropy) loss should give a
    # smaller log loss than those from the one-vs-rest model.
    clf_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    clf_multi.fit(X, y)
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))

    clf_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    clf_ovr.fit(X, y)
    clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X))
    assert_greater(clf_ovr_loss, clf_multi_loss)

    # For the multinomial model, soft-max probabilities should give a
    # smaller log loss than the per-class logistic function.
    clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X))
    assert_greater(clf_wrong_loss, clf_multi_loss)
def test_multinomial_binary():
    # Multinomial LR on a two-class (setosa / not-setosa) problem.
    class_names = np.array(["setosa", "not-setosa"])
    target = class_names[(iris.target > 0).astype(np.intp)]
    n_features = iris.data.shape[1]

    for solver in ('lbfgs', 'newton-cg'):
        clf = LogisticRegression(solver=solver, multi_class='multinomial')
        clf.fit(iris.data, target)

        assert_equal(clf.coef_.shape, (1, n_features))
        assert_equal(clf.intercept_.shape, (1,))
        assert_array_equal(clf.predict(iris.data), target)

        mlr = LogisticRegression(solver=solver, multi_class='multinomial',
                                 fit_intercept=False)
        mlr.fit(iris.data, target)
        # NOTE(review): the prediction below is taken from ``clf``; ``mlr``
        # is fitted but never checked - confirm whether ``mlr`` was intended.
        best_class = np.argmax(clf.predict_log_proba(iris.data), axis=1)
        pred = clf.classes_[best_class]
        assert_greater(np.mean(pred == target), .9)
def train():
    """Train a TF-IDF + logistic-regression model and pickle both parts.

    Reads ``train.csv`` (no header row), keeps a random 80% of the rows,
    fits the vectorizer on column 1 and the classifier on column 2, then
    writes the two pickle files and prints the elapsed time.
    """
    start = timeit.default_timer()
    df = pd.read_csv('train.csv', delimiter=',', header=None)
    print("start training ")

    # Only the training part of the split is used here.
    train_rows, _ = train_test_split(df, test_size=0.2)
    X_train_raw = train_rows[1]
    y_toxic = train_rows[2]

    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)

    classifier_toxic = LogisticRegression(random_state=0, multi_class='ovr')
    classifier_toxic.fit(X_train, y_toxic)

    # Context managers close the files even if pickling fails.
    with open("logistic_complain_vectorizer.pickle", 'wb') as f:
        pickle.dump(vectorizer, f)
    with open("logistic_complain_classfier.pickle", 'wb') as f:
        pickle.dump(classifier_toxic, f)

    stop = timeit.default_timer()
    print("training time took was : ")
    print(stop - start)
def logistic_regression(inp):
    """Report whether the parameters parsed from ``inp`` yield a valid fit.

    Returns ``False`` when the parser does not produce exactly 14 values, or
    when building the dataset or fitting the model raises ``ValueError``;
    returns ``True`` otherwise.
    """
    arr = xml_parser.xml_parser('logistic_regression_Params.xml', inp)
    if len(arr) != 14:
        return False

    # Positional layout of the parsed parameter list.
    (n_samples, n_features, n_informative, n_classes, n_jobs, solver,
     penalty, dual, tol, C, fit_intercept, intercept_scaling, max_iter,
     multi_class) = arr

    try:
        X, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_informative=n_informative,
                                   n_classes=n_classes)
    except ValueError:
        return False

    print(arr)

    try:
        clf = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                 fit_intercept=fit_intercept,
                                 intercept_scaling=intercept_scaling,
                                 solver=solver, n_jobs=n_jobs,
                                 max_iter=max_iter, multi_class=multi_class)
        clf.fit(X, y)
    except ValueError:
        return False
    return True
def test_max_iter():
    # With a tolerance too tight to satisfy, every solver must stop exactly
    # at max_iter iterations.
    X = iris.data
    y_bin = iris.target.copy()
    y_bin[y_bin == 2] = 0

    solvers = ['newton-cg', 'liblinear', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    for max_iter in range(1, 5):
        for solver in solvers:
            for multi_class in ('ovr', 'multinomial'):
                # This combination is skipped.
                unsupported = (solver == 'liblinear'
                               and multi_class == 'multinomial')
                if not unsupported:
                    lr = LogisticRegression(max_iter=max_iter, tol=1e-15,
                                            multi_class=multi_class,
                                            random_state=0, solver=solver)
                    lr.fit(X, y_bin)
                    assert_equal(lr.n_iter_[0], max_iter)
def main():
    """Build three feature sets for the image classes and fit a classifier.

    ``log_classify`` receives every feature set and returns the one used to
    fit the final logistic regression.
    """
    classes = ['chimp', 'corvette', 'tokyo', 'goldengatebridge']
    images, labels = get_labels(classes)

    # Second argument of get_visual_words (presumably the vocabulary size).
    k = 256
    feature_dict = {
        'Std': get_standard_features(images),
        'SURF': get_visual_words(images, k),
        'TAS': get_tas_features(images),
    }

    best_features = log_classify(feature_dict, labels)
    classifier = LogisticRegression()
    classifier.fit(best_features, labels)
def getBestThreshold(features, labels_pooled, labels_current):
    """Find a decision threshold per fold of a 10-fold stratified CV.

    Folds are stratified on ``labels_pooled``, which also supplies the
    training labels; thresholds are optimized against ``labels_current``.

    Returns ``(maxent, thresholds)``: the model as fitted on the last fold
    and an array holding one threshold per fold.
    """
    metric_names = ("F1", "Recall", "Accuracy", "Precision")
    print("length of pooled and current", len(labels_pooled), len(labels_current))
    maxent = LogisticRegression(penalty='l1')
    scores = {name: [] for name in metric_names}
    thresholds = []

    print('Finding best thresholds...')
    splitter = cross_validation.StratifiedKFold(
        labels_pooled, n_folds=10, shuffle=False, random_state=None)
    for fold, (train_idx, test_idx) in enumerate(splitter, 1):
        # Carriage return keeps the fold counter on a single console line.
        print('\r' + str(fold), end="")
        maxent.fit(features[train_idx], labels_pooled[train_idx])

        # get prediction
        thresh_i, _, score = optimize_threshold(
            maxent, features[test_idx], labels_current[test_idx])
        thresholds.append(thresh_i)
        # optimize_threshold's score is indexed in metric_names order.
        for position, name in enumerate(metric_names):
            scores[name].append(score[position])

    print("\n--")
    for key in sorted(scores):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)"
              % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    return maxent, np.array(thresholds)
def getBestThreshold(X, y_current_tr, y_current_te, regularization='l2'):
    """Find a decision threshold per fold of a 10-fold stratified CV.

    Folds are stratified on ``y_current_tr``, which also supplies the
    training labels; thresholds are optimized against ``y_current_te``.
    The three inputs must have the same length.

    Returns ``(maxent, thresholds)``: the model as fitted on the last fold
    and an array holding one threshold per fold.
    """
    sizes = (len(X), len(y_current_tr), len(y_current_te))
    assert sizes[0] == sizes[1] == sizes[2], 'Number of features ({}), annotator1 labels ({}) and annotator2 labels ({}) is not equal!'.format(*sizes)

    metric_names = ("F1", "Recall", "Accuracy", "Precision")
    maxent = LogisticRegression(penalty=regularization)
    scores = {name: [] for name in metric_names}
    thresholds = []

    print('Finding best thresholds...')
    splitter = cross_validation.StratifiedKFold(
        y_current_tr, n_folds=10, shuffle=False, random_state=None)
    for fold, (train_idx, test_idx) in enumerate(splitter, 1):
        # Carriage return keeps the fold counter on a single console line.
        print('\r' + str(fold), end="")
        maxent.fit(X[train_idx], y_current_tr[train_idx])

        # get prediction
        thresh_i, _, score = optimize_threshold(
            maxent, X[test_idx], y_current_te[test_idx])
        thresholds.append(thresh_i)
        # optimize_threshold's score is indexed in metric_names order.
        for position, name in enumerate(metric_names):
            scores[name].append(score[position])

    print("\n--")
    for key in sorted(scores):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)"
              % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    return maxent, np.array(thresholds)
def predictions():
    """Train a spam classifier on the SMS collection and print five predictions.

    Reads ``data/SMSSpamCollection`` (tab separated, no header; column 0 is
    the label and column 1 the message), fits TF-IDF + logistic regression
    on a random training split and prints the first five test messages with
    their predicted label.
    """
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model.logistic import LogisticRegression
    from sklearn.cross_validation import train_test_split

    df = pd.read_csv('data/SMSSpamCollection', delimiter='\t', header=None)
    # 75% goes to training set and 25% goes to test set
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])

    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)
    X_test = vectorizer.transform(X_test_raw)

    # the model
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    # Pair predictions with messages by position. Indexing X_test_raw[i]
    # would look rows up by their original (shuffled) index label and show
    # the wrong message or raise KeyError.
    for prediction, message in zip(predicted[:5], X_test_raw):
        print('Prediction: %s. Message: %s' % (prediction, message))
def regress_on_words(self, word_index, X):
    """Fit a logistic regression that predicts whether one column of X is 1.

    word_index: position of the column of interest inside each row of X
    X: sequence of indexable rows; used both to derive the labels and,
       unchanged, as the model input

    Returns the predicted probability of the last class for every row of X.

    NOTE: the target column is left in X (it is not zeroed out), so the model
    sees the value it is asked to predict.
    """
    # A row is a positive example exactly when its target column equals 1.
    labels = [1 if sentence[word_index] == 1 else 0 for sentence in X]

    log_reg = LogisticRegression()
    log_reg.fit(X, labels)
    return log_reg.predict_proba(X)[:, -1]
def validate_model(X, y, N, digit, classifier):
    """
    Validate the model with 5-fold cross validation and report the results
    through the fold_result / category_result collectors.

    :param X: nparray, one row is one sample
    :param y: nparray, labels; each label is used directly as a list index and
              as a predict_proba column index, so labels are presumably the
              integers 0..n_classes-1 (confirm against callers)
    :param N: number of CPU cores to use (passed as n_jobs)
    :param digit: digit of the captcha, forwarded to the result printers
    :param classifier: 'Logistic' selects LogisticRegression, anything else
                       selects RandomForestClassifier
    :return: None
    """
    fold_r = fold_result()
    labels = np.unique(y)

    # One per-label collector, stored at the position given by the label.
    category_rs = [None] * len(labels)
    for label in labels:
        category_rs[label] = category_result()

    splitter = KFold(n_splits=5, shuffle=True, random_state=1234567)
    for train, test in splitter.split(X):
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        if classifier == 'Logistic':
            clf = LogisticRegression(solver='sag', n_jobs=N)
        else:
            clf = RandomForestClassifier(n_estimators=200,
                                         random_state=1234567,
                                         n_jobs=N)
        clf.fit(X_train, y_train)
        probas = clf.predict_proba(X_test)

        fold_r.append(clf, X_train, y_train, X_test, y_test)
        for label in labels:
            category_rs[label].append(y_test, probas[:, label], label)

    fold_r.print_score(digit)

    # Per-label reporting happens once all folds have been accumulated.
    for label in labels:
        category_rs[label].print_result(label, digit)
def test_dtype_match():
    # Fitting on np.float32 data must produce np.float32 coefficients (no
    # upcast to np.float64), for dense and sparse input alike.
    dense_32 = np.array(X).astype(np.float32)
    target_32 = np.array(Y1).astype(np.float32)
    dense_64 = np.array(X).astype(np.float64)
    target_64 = np.array(Y1).astype(np.float64)
    sparse_32 = sp.csr_matrix(X, dtype=np.float32)

    for solver in ['newton-cg']:
        for multi_class in ['ovr', 'multinomial']:
            params = dict(solver=solver, multi_class=multi_class)

            # Dense float32 in -> float32 coefficients out.
            lr_32 = LogisticRegression(**params)
            lr_32.fit(dense_32, target_32)
            assert_equal(lr_32.coef_.dtype, dense_32.dtype)

            # Same guarantee for sparse float32 input.
            lr_32_sparse = LogisticRegression(**params)
            lr_32_sparse.fit(sparse_32, target_32)
            assert_equal(lr_32_sparse.coef_.dtype, sparse_32.dtype)

            # float64 stays float64, and both precisions agree numerically.
            lr_64 = LogisticRegression(**params)
            lr_64.fit(dense_64, target_64)
            assert_equal(lr_64.coef_.dtype, dense_64.dtype)
            assert_almost_equal(lr_32.coef_,
                                lr_64.coef_.astype(np.float32))
def weighted_params():
    """Fit a logistic regression on group scores and fill ccgroups/gtgroups.

    Loads the groups JSON, keeps only groups with more than one user, and
    trains on each kept group's ``scorepred`` with its last two entries
    dropped. In file order, the first ``int(0.72 * kept)`` kept groups are
    labelled 0 and the rest 1. For every kept group the module-level
    ``ccgroups`` receives the coefficient-weighted sum of its first 8
    ``scorepred`` entries divided by 8, and ``gtgroups`` receives its
    ``scoregt``.
    """
    with open('../../../../baselines/GSBC/YELPNYC/finalgrps.json', 'r') as fp:
        grps = json.load(fp)

    # Only groups with more than one user take part, in file order.
    multi_user = [grp for grp in grps.values() if len(grp['users']) > 1]
    cutoff = int(0.72 * len(multi_user))

    X = [grp['scorepred'][:-2] for grp in multi_user]
    Y = [0 if position < cutoff else 1
         for position in range(len(multi_user))]

    classifier = LogisticRegression()
    classifier.fit(X, Y)

    for grp in multi_user:
        gid = grp['id']
        if gid not in ccgroups:
            ccgroups[gid] = 0
            gtgroups[gid] = 0
        weights = classifier.coef_[0]
        ccgroups[gid] = sum(
            [grp['scorepred'][i] * weights[i] for i in range(8)]) / 8.0
        gtgroups[gid] = grp['scoregt']
def test_logreg_l1_sparse_data():
    # liblinear penalizes the intercept while saga does not, so the intercept
    # is not fitted; that makes the coefficients of the two models comparable
    # at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    noise_cols = rng.normal(scale=0.1, size=(n_samples, 3))
    constant_cols = np.zeros(shape=(n_samples, 2))
    X = np.concatenate((X, noise_cols, constant_cols), axis=1)
    X[X < 1] = 0
    X = sparse.csr_matrix(X)

    l1_params = dict(penalty="l1", C=1.0, fit_intercept=False, tol=1e-10)
    saga_params = dict(l1_params, solver='saga', max_iter=1000)

    lr_liblinear = LogisticRegression(solver='liblinear', **l1_params)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(**saga_params)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)

    # The l1 penalty should drive the five appended noise/constant features
    # to zero.
    no_signal = np.zeros(5)
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], no_signal)
    assert_array_almost_equal(lr_saga.coef_[0, -5:], no_signal)

    # Sparse and dense representations of the same data must agree.
    lr_saga_dense = LogisticRegression(**saga_params)
    lr_saga_dense.fit(X.toarray(), y)
    assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_)
def gbdt_lr_train(libsvmFileName):
    """Print test-set AUC for GBDT, plain LR, LR on GBDT leaves, and LR on both.

    Loads a libsvm-format file, holds out 30% of it as a test set, and prints
    four AUC values: the GBDT itself, LR on the original features, LR on the
    one-hot encoded GBDT leaf indices, and LR on the leaf encoding stacked
    with the original features. The one-hot encoder is fitted on the train and
    test leaves together.
    """
    # Load the samples and split them into train / test parts.
    X_all, y_all = load_svmlight_file(libsvmFileName)
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # GBDT baseline: train, predict, score.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3,
                                      verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    print('gbdt auc: %.5f' % roc_auc_score(y_test, y_pred_gbdt))

    # LR baseline on the original features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    print('基于原有特征的LR AUC: %.5f' % roc_auc_score(y_test, y_pred_test))

    # Encode each sample by the leaf it reaches in every tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    train_rows = X_train_leaves.shape[0]

    # One-hot encode the leaf indices of train and test rows together, then
    # split the encoded matrix back at the train/test boundary.
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    leaves_train = X_trans[:train_rows, :]
    leaves_test = X_trans[train_rows:, :]

    # LR on the GBDT leaf encoding only.
    lr = LogisticRegression()
    lr.fit(leaves_train, y_train)
    y_pred_gbdtlr1 = lr.predict_proba(leaves_test)[:, 1]
    print('基于GBDT特征编码后的LR AUC: %.5f' %
          roc_auc_score(y_test, y_pred_gbdtlr1))

    # LR on the leaf encoding combined with the original features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([leaves_train, X_train])
    X_test_ext = hstack([leaves_test, X_test])
    print(X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    print('基于组合特征的LR AUC: %.5f' % roc_auc_score(y_test, y_pred_gbdtlr2))
def _randomized_logistic(X, y, weights, mask, C=1., verbose=False,
                         fit_intercept=True, tol=1e-3):
    """Return, per value of C, which features an l1 logistic model keeps.

    The rows selected by ``mask`` are taken from X and y, every feature column
    is rescaled by ``1 - weights``, and one l1-penalized LogisticRegression is
    fitted for each value in ``C``.

    Parameters
    ----------
    X : dense array or scipy sparse matrix of samples by features.
    y : array of targets; indexed with ``mask``.
    weights : array with one entry per feature.
    mask : row selector applied to both X and y.
    C : scalar or 1-d sequence of inverse regularization strengths.
    verbose : unused in this function.
    fit_intercept, tol : forwarded to LogisticRegression.

    Returns
    -------
    scores : boolean array of shape (n_features, len(C)); entry (j, k) is
        True when feature j has a coefficient larger in magnitude than
        10 machine epsilons for the k-th value of C.
    """
    X = X[safe_mask(X, mask)]
    y = y[mask]
    if issparse(X):
        # Right-multiplying by a diagonal matrix rescales the columns.
        size = len(weights)
        weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size))
        X = X * weight_dia
    else:
        X *= (1 - weights)

    C = np.atleast_1d(np.asarray(C, dtype=np.float64))
    # Builtin bool/float replace the np.bool/np.float aliases, which were
    # removed from NumPy and raise AttributeError there.
    scores = np.zeros((X.shape[1], len(C)), dtype=bool)
    zero_tol = 10 * np.finfo(float).eps

    for this_C, this_scores in zip(C, scores.T):
        # XXX : would be great to do it with a warm_start ...
        clf = LogisticRegression(C=this_C, tol=tol, penalty='l1', dual=False,
                                 fit_intercept=fit_intercept)
        clf.fit(X, y)
        # this_scores is a view on one column of scores, so writing through
        # it fills the result array in place.
        this_scores[:] = np.any(np.abs(clf.coef_) > zero_tol, axis=0)
    return scores
def main():
    """Fit a logistic regression on df_train and print hold-out metrics.

    Drops the 'cust_id', 'y' and 'cust_group' columns to build the features,
    uses 'y' as the target, holds out 20% of the rows, prints the mean squared
    error, log loss and ROC AUC of the predicted probability in column 1, and
    finally hands the fitted model to ``predict``.
    """
    features = df_train.drop(['cust_id', 'y', 'cust_group'],
                             axis=1, inplace=False)
    target = df_train['y']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42)
    print(X_train.shape, X_test.shape)

    clf = LogisticRegression(C=1.0, max_iter=100, random_state=10)
    print("====" * 20)
    clf.fit(X_train, y_train)

    prob = clf.predict_proba(X_test)
    pred = np.argmax(prob, axis=1)  # hard labels; not used below
    second_class_prob = prob[:, 1]

    print("mean_squared_error:",
          mean_squared_error(y_test, second_class_prob))
    print("log_loss:", log_loss(y_test.astype(int), second_class_prob))
    print("roc_auc_score:", roc_auc_score(y_test, second_class_prob))

    predict(clf)
def test_consistency_path():
    """Test that the path algorithm is consistent"""
    rng = np.random.RandomState(0)
    shifted_cloud = rng.randn(100, 2) + [1, 1]
    centered_cloud = rng.randn(100, 2)
    X = np.concatenate((shifted_cloud, centered_cloud))
    y = [1] * 100 + [-1] * 100
    Cs = np.logspace(0, 4, 10)

    quiet_path = ignore_warnings(logistic_regression_path)
    methods = ('lbfgs', 'newton-cg', 'liblinear')

    # can't test with fit_intercept=True since LIBLINEAR
    # penalizes the intercept
    for method in methods:
        coefs, Cs = quiet_path(X, y, Cs=Cs, fit_intercept=False, tol=1e-16,
                               solver=method)
        for i, C in enumerate(Cs):
            lr = LogisticRegression(C=C, fit_intercept=False, tol=1e-16)
            lr.fit(X, y)
            assert_array_almost_equal(lr.coef_.ravel(), coefs[i], decimal=4)

    # test for fit_intercept=True
    for method in methods:
        Cs = [1e3]
        coefs, Cs = quiet_path(X, y, Cs=Cs, fit_intercept=True, tol=1e-4,
                               solver=method)
        lr = LogisticRegression(C=Cs[0], fit_intercept=True, tol=1e-4,
                                intercept_scaling=10000)
        lr.fit(X, y)
        # The path returns the intercept appended after the coefficients.
        lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_])
        assert_array_almost_equal(lr_coef, coefs[0], decimal=4)
def logistic_model_by_sk_learn():
    """
    Implementation using sklearn.

    Reads './data/03_data_model/train_data_woe.csv', uses the
    'SeriousDlqin2yrs' column as the target and the remaining columns (minus
    the ones dropped below) as features, fits a LogisticRegression, prints the
    first row of its coefficients and returns the fitted model.
    """
    data = pd.read_csv('./data/03_data_model/train_data_woe.csv')
    # Dependent variable
    Y = data['SeriousDlqin2yrs']
    # Independent variables, dropping those with no obvious effect on the
    # dependent variable
    X = data.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ], axis=1)
    logistic = LogisticRegression()
    # NOTE(review): the second positional parameter of np.array is `dtype`,
    # so the 'NumberOfTimes90DaysLate' Series is passed as a dtype here rather
    # than being combined into the weights. This looks unintended and is
    # likely to raise; confirm what the per-sample weights should be.
    logistic.fit(X, Y, sample_weight=(np.array(
        X['RevolvingUtilizationOfUnsecuredLines'],
        X['NumberOfTimes90DaysLate'])))
    print(logistic.coef_[0])
    return logistic
def _randomized_logistic(X, y, weights, mask, C=1., verbose=False,
                         fit_intercept=True, tol=1e-3):
    """Return, per value of C, which features an l1 logistic model keeps.

    The rows selected by ``mask`` are taken from X and y, every feature column
    is rescaled by ``1 - weights``, and one l1-penalized liblinear
    LogisticRegression is fitted for each value in ``C``.

    Parameters
    ----------
    X : dense array or scipy sparse matrix of samples by features.
    y : array of targets; indexed with ``mask``.
    weights : array with one entry per feature.
    mask : row selector applied to both X and y.
    C : scalar or 1-d sequence of inverse regularization strengths.
    verbose : unused in this function.
    fit_intercept, tol : forwarded to LogisticRegression.

    Returns
    -------
    scores : boolean array of shape (n_features, len(C)); entry (j, k) is
        True when feature j has a coefficient larger in magnitude than
        10 machine epsilons for the k-th value of C.

    Raises
    ------
    ValueError
        If ``C`` has more than one dimension.
    """
    X = X[safe_mask(X, mask)]
    y = y[mask]
    if issparse(X):
        # Right-multiplying by a diagonal matrix rescales the columns.
        size = len(weights)
        weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size))
        X = X * weight_dia
    else:
        X *= (1 - weights)

    C = np.atleast_1d(np.asarray(C, dtype=np.float64))
    if C.ndim > 1:
        raise ValueError(
            "C should be 1-dimensional array-like, "
            "but got a {}-dimensional array-like instead: {}.".format(
                C.ndim, C))

    # Builtin bool/float replace the np.bool/np.float aliases, which were
    # removed from NumPy and raise AttributeError there.
    scores = np.zeros((X.shape[1], len(C)), dtype=bool)
    zero_tol = 10 * np.finfo(float).eps

    for this_C, this_scores in zip(C, scores.T):
        # XXX : would be great to do it with a warm_start ...
        clf = LogisticRegression(C=this_C, tol=tol, penalty='l1', dual=False,
                                 fit_intercept=fit_intercept,
                                 solver='liblinear', multi_class='ovr')
        clf.fit(X, y)
        # this_scores is a view on one column of scores, so writing through
        # it fills the result array in place.
        this_scores[:] = np.any(np.abs(clf.coef_) > zero_tol, axis=0)
    return scores
def test_warm_start():
    # Refitting for a single iteration on the same data should barely move
    # the coefficients with warm starting, and move them a lot without it.
    # liblinear is left out because warm starting does not work with it.
    X, y = iris.data, iris.target
    solvers = ['newton-cg', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    msg_template = ("Warm starting issue with %s solver in %s mode "
                    "with fit_intercept=%s and warm_start=%s")

    for warm_start in [True, False]:
        for fit_intercept in [True, False]:
            for solver in solvers:
                for multi_class in ['ovr', 'multinomial']:
                    if solver == 'sag' and multi_class == 'multinomial':
                        break
                    clf = LogisticRegression(tol=1e-4,
                                             multi_class=multi_class,
                                             warm_start=warm_start,
                                             solver=solver,
                                             random_state=42,
                                             max_iter=100,
                                             fit_intercept=fit_intercept)
                    clf.fit(X, y)
                    converged_coef = clf.coef_

                    # Second fit is capped at one iteration; the resulting
                    # convergence warning is expected.
                    clf.max_iter = 1
                    with ignore_warnings():
                        clf.fit(X, y)
                    cum_diff = np.sum(np.abs(converged_coef - clf.coef_))

                    msg = msg_template % (solver, multi_class,
                                          str(fit_intercept),
                                          str(warm_start))
                    # Warm start: drift must stay below 2.0; cold start: it
                    # must exceed 2.0.
                    if warm_start:
                        larger, smaller = 2.0, cum_diff
                    else:
                        larger, smaller = cum_diff, 2.0
                    assert_greater(larger, smaller, msg)
def test_saga_vs_liblinear():
    # saga and liblinear must reach the same coefficients on a dense binary
    # iris subset and on a sparse synthetic problem, for l1 and l2 penalties.
    iris = load_iris()
    X, y = iris.data, iris.target
    X = np.concatenate([X] * 10)
    y = np.concatenate([y] * 10)

    # Keep classes 0 and 1 only and recode them as -1 / +1.
    binary_rows = y <= 1
    X_bin = X[binary_rows]
    y_bin = y[binary_rows] * 2 - 1

    X_sparse, y_sparse = make_classification(n_samples=50, n_features=20,
                                             random_state=0)
    X_sparse = sparse.csr_matrix(X_sparse)

    datasets = ((X_bin, y_bin), (X_sparse, y_sparse))
    for (X, y) in datasets:
        for penalty in ['l1', 'l2']:
            n_samples = X.shape[0]
            # alpha=1e-3 is time consuming
            for alpha in np.logspace(-1, 1, 3):
                shared = dict(C=1. / (n_samples * alpha),
                              multi_class='ovr',
                              max_iter=200,
                              fit_intercept=False,
                              penalty=penalty,
                              random_state=0,
                              tol=1e-24)
                saga = LogisticRegression(solver='saga', **shared)
                liblinear = LogisticRegression(solver='liblinear', **shared)

                saga.fit(X, y)
                liblinear.fit(X, y)
                # Convergence for alpha=1e-3 is very slow
                assert_array_almost_equal(saga.coef_, liblinear.coef_, 3)
class DetectMalicious():
    """Logistic-regression detector for malicious JavaScript.

    On construction the model is trained on the benign and evil CSV files
    (last column is the label, all other columns are features) and the list of
    JavaScript API names is loaded from ``jsapi.txt``.
    """

    def __init__(self):
        benign_data = np.loadtxt('javascript-collection/benignjs.csv',
                                 delimiter=',', dtype=np.int32)
        evil_data = np.loadtxt('javascript-collection/eviljs.csv',
                               delimiter=',', dtype=np.int32)
        train_data = np.concatenate((benign_data, evil_data), axis=0)
        self.score_template = 'TPR %(TPR)f\tFPR %(FPR)f\tAccuracy %(Accuracy)f\tAUC %(AUC)f'
        self.D = LogisticRegression()
        self.D.fit(train_data[:, :-1], train_data[:, -1])
        # Read the API names inside a context manager so the file is closed
        # deterministically instead of being left to the garbage collector.
        with open('javascript-collection/jsapi.txt') as api_file:
            self.jsapi = [line.strip('\n') for line in api_file]

    def predict(self, X):
        """Print and return the model's prediction for the script text ``X``.

        ``X`` is turned into one 0/1 flag per known API name (1 when the name
        occurs anywhere in ``X``). The classifier is run once and that same
        result is both printed and returned.
        """
        flag = [1 if api in X else 0 for api in self.jsapi]
        result = self.D.predict([flag])
        # Single-argument call form prints identically on Python 2 and 3.
        print(result)
        return result
def getLRAcc(my_data, train_ratio):
    """Return the hold-out accuracy (in percent) of a logistic regression.

    ``my_data`` is a 2-d array that is shuffled IN PLACE along its first axis.
    The first ``int(rows * train_ratio)`` shuffled rows are used for training
    and the remaining rows for testing. The last column is the label; the
    first column is excluded from the features.

    Args:
        my_data: 2-d numpy array; modified by the shuffle.
        train_ratio: fraction of rows used for training.

    Returns:
        Percentage of test rows whose label was predicted correctly.
    """
    total_rows = numpy.size(my_data, 0)
    total_cols = numpy.size(my_data, 1)
    train_rows = int(total_rows * train_ratio)

    numpy.random.shuffle(my_data)
    training, test = my_data[:train_rows, :], my_data[train_rows:, :]

    label_col = total_cols - 1
    XTrain = training[:, 1:label_col]
    YTrain = training[:, label_col]
    XTest = test[:, 1:label_col]
    YTest = test[:, label_col]

    lrMulti = LogisticRegression()
    lrMulti.fit(XTrain, YTrain)
    pred_Y = lrMulti.predict(XTest)

    error = numpy.sum(YTest != pred_Y)
    # Errors are counted on the test rows only, so the denominator must be
    # the test-set size. Dividing by total_rows counted every training row as
    # a correct prediction and inflated the accuracy.
    test_rows = total_rows - train_rows
    nb_accuracy = (float(test_rows - error) / test_rows) * 100
    return nb_accuracy
def test_database_reconstruction_logistic_regression(get_iris_dataset):
    # A model trained on the known training set plus one extra "private" test
    # sample should allow the reconstruction attack to recover that sample.
    train_split, test_split = get_iris_dataset
    x_train, y_train_encoded = train_split
    x_test, y_test_encoded = test_split

    # Collapse each label vector to the index of its largest entry.
    y_train = np.array([np.argmax(row) for row in y_train_encoded])
    y_test = np.array([np.argmax(row) for row in y_test_encoded])

    # The first test sample plays the role of the private record.
    x_private = x_test[0, :].reshape(1, -1)
    y_private = y_test[0]

    x_input = np.vstack((x_train, x_private))
    y_input = np.hstack((y_train, y_private))

    private_model = LogisticRegression()
    private_model.fit(x_input, y_input)
    estimator_private = ScikitlearnLogisticRegression(model=private_model)

    attack = DatabaseReconstruction(estimator=estimator_private)
    x_recon, y_recon = attack.reconstruct(x_train, y_train)

    assert x_recon is not None
    assert x_recon.shape == (1, 4)
    assert y_recon.shape == (1, 3)
    assert np.isclose(x_recon, x_private, rtol=0.05).all()
    assert np.argmax(y_recon, axis=1) == y_private
def test_logistic_regression_multinomial():
    # Checks for multi_class='multinomial': coefficient shapes, agreement
    # between the lbfgs and newton-cg solvers, and agreement with the CV path.
    n_samples, n_features, n_classes = 50, 20, 3
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=10,
                               n_classes=n_classes,
                               random_state=0)
    expected_shape = (n_classes, n_features)

    def fit_and_check(**params):
        # Fit one multinomial model and verify the shape of its coefficients.
        model = LogisticRegression(multi_class='multinomial', **params)
        model.fit(X, y)
        assert_array_equal(model.coef_.shape, expected_shape)
        return model

    # lbfgs, with and without intercept
    clf_int = fit_and_check(solver='lbfgs')
    clf_wint = fit_and_check(solver='lbfgs', fit_intercept=False)

    # newton-cg, with and without intercept
    clf_ncg_int = fit_and_check(solver='newton-cg')
    clf_ncg_wint = fit_and_check(solver='newton-cg', fit_intercept=False)

    # Both solvers should land on the same solution.
    assert_almost_equal(clf_int.coef_, clf_ncg_int.coef_, decimal=3)
    assert_almost_equal(clf_wint.coef_, clf_ncg_wint.coef_, decimal=3)
    assert_almost_equal(clf_int.intercept_, clf_ncg_int.intercept_, decimal=3)

    # The path should give almost the same results. Because the coefficients
    # are averaged across folds after fitting, they need not match exactly.
    for solver in ['lbfgs', 'newton-cg']:
        clf_path = LogisticRegressionCV(solver=solver,
                                        multi_class='multinomial', Cs=[1.])
        clf_path.fit(X, y)
        assert_array_almost_equal(clf_path.coef_, clf_int.coef_, decimal=3)
        assert_almost_equal(clf_path.intercept_, clf_int.intercept_,
                            decimal=3)
def cvWithThreshold(X, y_current_tr, y_current_te, threshold,
                    regularization='l2'):
    """Score a fixed decision threshold with 10-fold stratified CV.

    For every fold a logistic regression is fitted on the training rows using
    the ``y_current_tr`` labels, and ``pred_for_threshold`` is run on the
    held-out rows against the ``y_current_te`` labels with ``threshold``.

    Args:
        X: feature rows; must support indexing with an index array.
        y_current_tr: labels used for fitting and for stratifying the folds.
        y_current_te: labels used for scoring the held-out rows.
        threshold: forwarded unchanged to ``pred_for_threshold``.
        regularization: forwarded as ``penalty`` to LogisticRegression.

    Returns:
        Dict mapping each metric name ("F1", "Recall", "Accuracy",
        "Precision") to a ``(mean, std)`` tuple over the folds.
    """
    # Position i of the score tuple is stored under metric_names[i].
    metric_names = ("F1", "Recall", "Accuracy", "Precision")
    scores = defaultdict(list)
    maxent = LogisticRegression(penalty=regularization)

    splitter = cross_validation.StratifiedKFold(
        y_current_tr, n_folds=10, shuffle=False, random_state=None)
    for fold, (train_idx, test_idx) in enumerate(splitter, 1):
        # Carriage return keeps the fold counter on a single console line.
        print('\r' + str(fold), end="")
        maxent.fit(X[train_idx], y_current_tr[train_idx])
        _ypred_i, score = pred_for_threshold(
            maxent, X[test_idx], y_current_te[test_idx], threshold)
        for position, name in enumerate(metric_names):
            scores[name].append(score[position])

    out_dict = {}
    print("\n--")
    for key in sorted(scores):
        currentmetric = np.array(scores[key])
        mean, std = currentmetric.mean(), currentmetric.std()
        out_dict[key] = (mean, std)
        print("%s : %0.2f (+/- %0.2f)" % (key, mean, std))
    print("--")
    return out_dict
def predictWithThreshold(datadir, threshold, penalty_type='l2'):
    """Score a fixed threshold on every sub-directory of ``datadir``.

    Sub-directories are visited in reverse sorted order. For each one,
    features are collected from ``train.conll`` and ``all.conll``; the first
    ``len(train features)`` rows of ``all.conll`` are used for fitting and the
    remaining rows for evaluation with ``pred_for_threshold``. The mean and
    standard deviation of every metric over the sub-directories are printed.

    Args:
        datadir: directory path; it is concatenated directly with the
            sub-directory name, so it presumably ends with a path separator.
        threshold: forwarded unchanged to ``pred_for_threshold``.
        penalty_type: forwarded as ``penalty`` to LogisticRegression.
    """
    # Position i of the score tuple is stored under metric_names[i].
    metric_names = ("F1", "Recall", "Accuracy", "Precision")
    maxent = LogisticRegression(penalty=penalty_type)
    scores = defaultdict(list)

    for subdir in sorted(os.listdir(datadir), reverse=True):
        base = datadir + subdir
        # train.conll is only needed for its row count; the actual training
        # rows are taken from the head of all.conll.
        trainfeatures, _trainlabels, _vec = \
            feats_and_classify.collect_features(base + '/train.conll')
        features, labels, _vec = \
            feats_and_classify.collect_features(base + '/all.conll')

        n_train = len(trainfeatures)
        train_idx = np.array(range(n_train))
        test_idx = np.array(range(n_train, len(features)))

        maxent.fit(features[train_idx], labels[train_idx])
        _ypred_i, score = pred_for_threshold(
            maxent, features[test_idx], labels[test_idx], threshold)
        for position, name in enumerate(metric_names):
            scores[name].append(score[position])

    print("\n--")
    for key in sorted(scores):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    """Train GBDT + LR on combined features, pickle the LR model, print scores.

    Loads ``sample_libsvm_data.txt``, holds out 10% as a test set, fits a
    GBDT, one-hot encodes the leaf indices of the train and test rows
    together, and trains a logistic regression on the leaf encoding stacked
    with the original features. The model is written to
    ``finalized_model.sav``, read back, and the reloaded model's predicted
    probabilities (column 1) for the test rows are printed.

    NOTE(review): ``Train_tab`` and ``Train_libsvm`` are not used; the input
    file name is hard-coded. Confirm whether ``Train_libsvm`` was meant to be
    the path that gets loaded.
    """
    # Load the sample data.
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")

    # Train / test split.
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.1,
                                                        random_state=42)

    # Define and train the GBDT model.
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Encode the original features as GBDT leaf indices.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices of train and test rows together; the
    # first train_rows rows of the result belong to the training set.
    train_rows = X_train_leaves.shape[0]
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # Define the LR model.
    lr = LogisticRegression(n_jobs=-1)

    # Combine the leaf encoding with the original features.
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    # Train LR on the combined features.
    lr.fit(X_train_ext, y_train)

    # Persist the model. The context managers guarantee the file is flushed
    # and closed before it is read back, and that neither handle leaks.
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(lr, model_file)

    # Load the model back from disk and predict with it.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)