def cvWithThreshold(X, y_current_tr, y_current_te, threshold, regularization='l2'):
    """Run 10-fold stratified cross-validation at a fixed decision threshold.

    One LogisticRegression (with the requested penalty) is refitted on every
    fold. Folds are stratified on ``y_current_tr``; the model is trained on
    ``y_current_tr`` labels and scored against ``y_current_te`` labels through
    ``pred_for_threshold``.

    Returns a dict mapping each metric name ("F1", "Recall", "Accuracy",
    "Precision") to a ``(mean, std)`` tuple over the folds. The same summary
    is printed.
    """
    # Position of each metric inside the score sequence returned by
    # pred_for_threshold, as indexed by this function.
    metric_positions = (("F1", 0), ("Recall", 1), ("Accuracy", 2), ("Precision", 3))

    per_fold = defaultdict(list)
    classifier = LogisticRegression(penalty=regularization)
    folds = cross_validation.StratifiedKFold(
        y_current_tr, n_folds=10, shuffle=False, random_state=None)

    for fold_number, (train_idx, test_idx) in enumerate(folds, 1):
        # Progress indicator, rewritten in place on the same console line.
        print('\r' + str(fold_number), end="")
        fold_train_X = X[train_idx]
        fold_train_y = y_current_tr[train_idx]
        fold_test_X = X[test_idx]
        fold_test_y = y_current_te[test_idx]
        classifier.fit(fold_train_X, fold_train_y)
        _, fold_score = pred_for_threshold(
            classifier, fold_test_X, fold_test_y, threshold)
        for metric, position in metric_positions:
            per_fold[metric].append(fold_score[position])

    print("\n--")
    summary = {}
    for metric in sorted(per_fold):
        values = np.array(per_fold[metric])
        summary[metric] = (values.mean(), values.std())
        print("%s : %0.2f (+/- %0.2f)" % (metric, values.mean(), values.std()))
    print("--")
    return summary
def main():
    """Train on one annotator's data and write thresholded test predictions.

    Command-line options:
      --threshold  probability at or above which an instance is labelled "1"
      --annotator  two-character annotator id selecting the training file
      --penalty    regularisation passed to LogisticRegression ("l1" or "l2")

    Training and test feature dicts are vectorised together so both share one
    feature space. Predictions are written, one label per line, to
    ``<annotator>.pred`` in the current directory, after which the process
    exits with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--annotator', type=str, default="03")
    parser.add_argument('--penalty', type=str, choices=["l1", "l2"], default="l1")
    args = parser.parse_args()

    current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_" + args.annotator + ".lbl.conll"
    testfile = scriptdir + "/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X__dict_train, y_train, v_train = feats_and_classify.collect_features(current_single_ann, vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(testfile, vectorize=False)

    # Fit the vectoriser on train + test so both halves get identical columns.
    featdicts = list(X__dict_train + X_dict_test)
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train = X[:len(y_train)]
    X_test = X[len(y_train):]

    maxent = LogisticRegression(penalty=args.penalty)
    maxent.fit(X_train, y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    # Column 1 of predict_proba is compared against the threshold.
    ypred_i = ["1" if pair[1] >= args.threshold else "0" for pair in y_pred_proba]

    # Context manager guarantees the file is closed even if writing fails.
    with open(args.annotator + ".pred", mode="w") as fout:
        print("\n".join(ypred_i), file=fout)
    sys.exit(0)
def test_regularization_path(self):
    # Compare the coefficients found by LogisticRegressionL1 along a lambda
    # grid with those of scikit-learn's L1 logistic regression.
    num_samples = 10
    num_feat = 5
    X, y = make_classification(n_samples=num_samples, n_features=num_feat,
                               n_informative=3, n_classes=2,
                               random_state=0, weights=[0.5, 0.5])

    # Input layout used for the fitter: feature columns, a column of ones,
    # then the label column.
    design = np.zeros((num_samples, num_feat + 2))
    design[:, :-2] = X
    design[:, -2] = np.ones(num_samples)
    design[:, -1] = y

    # Model under test
    l1_fitter = LogisticRegressionL1()
    lambdas = np.exp(-1 * np.linspace(1, 17, 200))
    l1_fitter.fit(design, lambdas)

    # scikit-learn regularization path; the estimator left after the loop
    # is the one fitted with the largest C in the grid.
    c_grid = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
    sk_model = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    sk_path = []
    for c_value in c_grid:
        sk_model.set_params(C=c_value)
        sk_model.fit(X, y)
        sk_path.append(sk_model.coef_.ravel().copy())

    expected = np.append(sk_model.intercept_[0], sk_model.coef_)
    np.testing.assert_almost_equal(expected, l1_fitter.coef_, 1)
def mlogistic():
    """Fit a logistic regression on three toy sentences and predict two more.

    The first three sentences are TF-IDF vectorised and used for training
    with labels ``[1, 1, 0]``; the last two are transformed with the same
    fitted vocabulary and classified. The training matrix, the test matrix
    and the predictions are printed.
    """
    X = []
    # The first three sentences are the training samples
    X.append("f**k you")
    X.append("f**k you all")
    X.append("hello everyone")
    # The last two sentences are the test samples
    X.append("f**k me")
    X.append("hello boy")
    # y holds the labels of the training samples
    y = [1, 1, 0]

    vectorizer = TfidfVectorizer()
    # Fit the TF-IDF transform on the first three sentences
    X_train = vectorizer.fit_transform(X[:-2])
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(X_train)
    # Transform the last two sentences with the vocabulary fitted above
    X_test = vectorizer.transform(X[-2:])
    print(X_test)

    # Train the logistic regression model
    classifier = LogisticRegression()
    classifier.fit(X_train, y)
    # Predict the test samples
    predictions = classifier.predict(X_test)
    print(predictions)
class LogReg:
    """Balanced logistic regression trained from CSV files under ./data.

    Constructing an instance loads the data, fits the classifier and computes
    class probabilities for the test set in one go.
    """

    def __init__(self):
        self.load_data()
        self.clf = LogisticRegression(class_weight='balanced')
        self.train()
        self.predict()

    def load_data(self):
        """Read ./data/train.csv and ./data/test.csv (first row is a header).

        In both files column 0 is split off: it becomes ``train_Y`` for the
        training file and ``test_ID`` for the test file. The remaining
        columns become ``train_X`` / ``test_X``.
        """
        train_values = pd.read_csv('./data/train.csv', header=0).values
        test_values = pd.read_csv('./data/test.csv', header=0).values
        self.train_X = train_values[:, 1:]
        self.train_Y = train_values[:, 0]
        self.test_X = test_values[:, 1:]
        self.test_ID = test_values[:, 0]

    def train(self):
        """Fit the classifier on the loaded training data."""
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        """Store the predicted class probabilities of the test set in ``test_Y``."""
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        """Return the classifier's score on its own training data."""
        accuracy = self.clf.score(self.train_X, self.train_Y)
        return accuracy

    def store_result(self):
        """Write ids and column 1 of ``test_Y`` to ./data/results/c1_result.csv."""
        result = pd.DataFrame()
        result['Id'] = self.test_ID
        result['Action'] = self.test_Y[:, 1]
        result.to_csv('./data/results/c1_result.csv', index=False)
def predictWithThreshold(datadir, threshold, penalty_type='l2'):
    """Evaluate a thresholded logistic regression on pre-split data folders.

    Every entry of ``datadir`` is treated as a folder holding ``train.conll``
    and ``all.conll``. The first ``len(train features)`` rows of the
    ``all.conll`` features are used for training and the remaining rows for
    testing. Folders are visited in reverse-sorted name order and the mean
    and standard deviation of each metric are printed at the end.

    :param datadir: directory containing one sub-directory per split; a
        trailing path separator is optional
    :param threshold: decision threshold forwarded to ``pred_for_threshold``
    :param penalty_type: ``penalty`` argument of LogisticRegression
    """
    maxent = LogisticRegression(penalty=penalty_type)
    scores = defaultdict(list)
    for split_name in sorted(os.listdir(datadir), reverse=True):
        # os.path.join works whether or not datadir ends with a separator;
        # bare string concatenation silently produced a wrong path otherwise.
        split_dir = os.path.join(datadir, split_name)
        trainfeatures, _, _ = feats_and_classify.collect_features(
            os.path.join(split_dir, 'train.conll'))
        features, labels, _ = feats_and_classify.collect_features(
            os.path.join(split_dir, 'all.conll'))

        # train.conll is only used to learn how many leading rows of
        # all.conll belong to the training portion.
        n_train = len(trainfeatures)
        TrainIndices = np.arange(n_train)
        TestIndices = np.arange(n_train, len(features))

        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]
        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        ypred_i, score = pred_for_threshold(maxent, TestX_i, Testy_i, threshold)
        # The score sequence is indexed as (F1, Recall, Accuracy, Precision).
        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    print("\n--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
def main():
    """Extract three feature families for four image classes and fit a classifier.

    Standard, SURF visual-word (k=256) and TAS features are computed for all
    images, ``log_classify`` is asked for the features to use, and a default
    LogisticRegression is fitted on them.
    """
    class_names = ['chimp', 'corvette', 'tokyo', 'goldengatebridge']
    images, labels = get_labels(class_names)

    surf_k = 256
    feature_dict = {}
    feature_dict['Std'] = get_standard_features(images)
    feature_dict['SURF'] = get_visual_words(images, surf_k)
    feature_dict['TAS'] = get_tas_features(images)

    chosen_features = log_classify(feature_dict, labels)
    model = LogisticRegression()
    model.fit(chosen_features, labels)
def test_logreg_cv_penalty():
    # With a single candidate C, the cross-validated estimator and the plain
    # one should end up with the same number of non-zero l1 coefficients,
    # i.e. the penalty must reach the final fit.
    X, y = make_classification(n_samples=50, n_features=20, random_state=0)
    shared = dict(penalty="l1", solver='liblinear')

    cv_model = LogisticRegressionCV(Cs=[1.0], **shared)
    cv_model.fit(X, y)
    plain_model = LogisticRegression(C=1.0, **shared)
    plain_model.fit(X, y)

    nonzero_cv = np.count_nonzero(cv_model.coef_)
    nonzero_plain = np.count_nonzero(plain_model.coef_)
    assert_equal(nonzero_cv, nonzero_plain)
def classify_logistic(train_features, train_labels, test_features):
    """Fit a default LogisticRegression and return its test-set predictions.

    When the module-level flag ``TEST`` is falsy and ``SAVE`` is truthy, the
    fitted model is handed to ``save_pickle`` under the name "logistic".
    """
    global SAVE
    model = LogisticRegression()
    model.fit(train_features, train_labels)
    if not TEST:
        if SAVE:
            save_pickle("logistic", model)
    predictions = model.predict(test_features)
    return predictions
def main():
    """Cross-validate a classifier on labels sampled from 20 annotators.

    The labels of annotators 01-20 are collected per instance. Features come
    from annotator 01's file. For ``--iterations`` rounds of shuffled 10-fold
    cross-validation, each instance's training and test label is drawn at
    random from its annotator labels, an L2 logistic regression is fitted,
    and accuracy, precision, recall and the shared-task F1 are recorded.
    Mean and standard deviation of each metric are printed, then the process
    exits with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--iterations', type=int, default=5)
    args = parser.parse_args()

    all_labels = defaultdict(list)
    scores = defaultdict(list)

    annotator_ids = ["%02d" % number for number in range(1, 21)]
    n_annotators = len(annotator_ids)
    for idx in annotator_ids:
        current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_" + idx + ".lbl.conll"
        _, labels_current, _ = feats_and_classify.collect_features(
            current_single_ann, vectorize=False, generateFeatures=False)
        # all_labels[i] accumulates one label per annotator for instance i.
        for instance_index, l in enumerate(labels_current):
            all_labels[instance_index].append(l)

    current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_01.lbl.conll"
    feats, labels_current, v_current = feats_and_classify.collect_features(
        current_single_ann, vectorize=True, generateFeatures=True)

    for it in range(args.iterations):
        for TrainIndices, TestIndices in cross_validation.KFold(n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
            maxent = LogisticRegression(penalty='l2')
            TrainX_i = feats[TrainIndices]
            Trainy_i = [all_labels[x][random.randrange(0, n_annotators)] for x in TrainIndices]
            TestX_i = feats[TestIndices]
            Testy_i = [all_labels[x][random.randrange(0, n_annotators)] for x in TestIndices]

            maxent.fit(TrainX_i, Trainy_i)
            ypred_i = maxent.predict(TestX_i)

            # scikit-learn metrics take (y_true, y_pred). Passing them the
            # other way round swaps precision and recall.
            acc = accuracy_score(Testy_i, ypred_i)
            pre = precision_score(Testy_i, ypred_i)
            rec = recall_score(Testy_i, ypred_i)
            # shared task uses f1 of *accuracy* and recall!
            denominator = acc + rec
            f1 = 2 * acc * rec / denominator if denominator else 0.0

            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)

    print("--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    sys.exit(0)
def my_module(rt, params, inputs, outputs):
    """Fit a LogisticRegression on pickled data and pickle the fitted model.

    :param rt: unused here
    :param params: unused here
    :param inputs: object whose ``X`` and ``Y`` attributes are paths to
        pickled training features and labels
    :param outputs: object whose ``MODEL`` attribute is the path the fitted
        model is pickled to
    """
    # NOTE(security): pickle.load can execute arbitrary code; only feed this
    # function files from a trusted source.
    # Pickle streams are binary, so the files are opened in binary mode and
    # closed deterministically by the context managers.
    with open(inputs.X, 'rb') as x_file:
        X = pickle.load(x_file)
    with open(inputs.Y, 'rb') as y_file:
        Y = pickle.load(y_file)

    model = LogisticRegression()
    model.fit(X, Y)

    with open(outputs.MODEL, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Done")
def test_logreg_l1_sparse_data():
    # liblinear penalizes the intercept and saga does not, so no intercept is
    # fitted anywhere in this test; that makes the coefficients of the two
    # solvers comparable at convergence.
    n_samples = 50
    rng = np.random.RandomState(42)
    informative, y = make_classification(n_samples=n_samples, n_features=20,
                                         random_state=0)
    noise = rng.normal(scale=0.1, size=(n_samples, 3))
    constant = np.zeros(shape=(n_samples, 2))
    dense = np.concatenate((informative, noise, constant), axis=1)
    dense[dense < 1] = 0
    X = sparse.csr_matrix(dense)

    l1_settings = dict(penalty="l1", C=1.0, fit_intercept=False, tol=1e-10)

    liblinear_model = LogisticRegression(solver='liblinear', **l1_settings)
    liblinear_model.fit(X, y)
    saga_model = LogisticRegression(solver='saga', max_iter=1000, **l1_settings)
    saga_model.fit(X, y)
    assert_array_almost_equal(saga_model.coef_, liblinear_model.coef_)

    # The five trailing columns (noise + constant) should be driven to zero
    # by the l1 penalty.
    no_weight = np.zeros(5)
    assert_array_almost_equal(liblinear_model.coef_[0, -5:], no_weight)
    assert_array_almost_equal(saga_model.coef_[0, -5:], np.zeros(5))

    # Sparse and dense input must give the same solution.
    saga_dense_model = LogisticRegression(solver='saga', max_iter=1000,
                                          **l1_settings)
    saga_dense_model.fit(X.toarray(), y)
    assert_array_almost_equal(saga_model.coef_, saga_dense_model.coef_)
def test_dtype_match():
    # Disabled to unblock the 0.19.2 release. See:
    # https://github.com/scikit-learn/scikit-learn/issues/11438
    raise SkipTest()

    # Everything below is unreachable while the skip above is in place.
    # It checks that np.float32 input data is not cast to np.float64 when
    # possible.
    dense_32 = np.array(X).astype(np.float32)
    target_32 = np.array(Y1).astype(np.float32)
    dense_64 = np.array(X).astype(np.float64)
    target_64 = np.array(Y1).astype(np.float64)
    sparse_32 = sp.csr_matrix(X, dtype=np.float32)

    for solver in ['newton-cg']:
        for multi_class in ['ovr', 'multinomial']:
            settings = dict(solver=solver, multi_class=multi_class)

            # Type consistency on dense float32 input
            model_32 = LogisticRegression(**settings)
            model_32.fit(dense_32, target_32)
            assert_equal(model_32.coef_.dtype, dense_32.dtype)

            # Type consistency on sparse float32 input
            model_32_sparse = LogisticRegression(**settings)
            model_32_sparse.fit(sparse_32, target_32)
            assert_equal(model_32_sparse.coef_.dtype, sparse_32.dtype)

            # float64 reference, and agreement of the two precisions
            model_64 = LogisticRegression(**settings)
            model_64.fit(dense_64, target_64)
            assert_equal(model_64.coef_.dtype, dense_64.dtype)
            assert_almost_equal(model_32.coef_,
                                model_64.coef_.astype(np.float32))
def test_dtype_match():
    # np.float32 input data should not be cast to np.float64 when possible:
    # the fitted coefficients must keep the dtype of the training data.
    features_by_dtype = {}
    targets_by_dtype = {}
    for dtype in (np.float32, np.float64):
        features_by_dtype[dtype] = np.array(X).astype(dtype)
        targets_by_dtype[dtype] = np.array(Y1).astype(dtype)
    X_sparse_32 = sp.csr_matrix(X, dtype=np.float32)

    X_32, y_32 = features_by_dtype[np.float32], targets_by_dtype[np.float32]
    X_64, y_64 = features_by_dtype[np.float64], targets_by_dtype[np.float64]

    for solver in ['newton-cg']:
        for multi_class in ['ovr', 'multinomial']:
            # Dense float32 keeps its dtype
            dense_32_model = LogisticRegression(solver=solver,
                                                multi_class=multi_class)
            dense_32_model.fit(X_32, y_32)
            assert_equal(dense_32_model.coef_.dtype, X_32.dtype)

            # Sparse float32 keeps its dtype as well
            sparse_32_model = LogisticRegression(solver=solver,
                                                 multi_class=multi_class)
            sparse_32_model.fit(X_sparse_32, y_32)
            assert_equal(sparse_32_model.coef_.dtype, X_sparse_32.dtype)

            # float64 keeps its dtype and agrees with the float32 solution
            dense_64_model = LogisticRegression(solver=solver,
                                                multi_class=multi_class)
            dense_64_model.fit(X_64, y_64)
            assert_equal(dense_64_model.coef_.dtype, X_64.dtype)
            downcast_coef = dense_64_model.coef_.astype(np.float32)
            assert_almost_equal(dense_32_model.coef_, downcast_coef)
def test_nnet(n_samples=200, n_features=7, distance=0.8, complete=False):
    """
    Fit the nnet estimators on a generated sample and check their ROC AUC.

    :param n_samples: forwarded to generate_sample
    :param n_features: forwarded to generate_sample
    :param distance: forwarded to generate_sample
    :param complete: if True, every loss / network type / trainer combination
        is fitted and its quality printed, then the test fails on purpose so
        that the printed results can be inspected
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MLPClassifier,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if not complete:
        # Walk through losses, network types and trainers in lock-step (with
        # random starting offsets) so that most of them are used once.
        attempts = max(len(nnet.losses), len(nnet.trainers), len(nn_types))
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        max_retries = 3  # each combination gets three tries before failing

        for attempt in range(attempts):
            for retry in range(max_retries):
                loss_names = list(nnet.losses.keys())
                trainer_names = list(nnet.trainers.keys())
                loss = loss_names[(attempt + losses_shift) % len(nnet.losses)]
                trainer = trainer_names[(attempt + trainers_shift) % len(nnet.trainers)]
                network_class = nn_types[attempt % len(nn_types)]

                nn = network_class(layers=[5], loss=loss, trainer=trainer,
                                   random_state=42 + retry, epochs=200)
                print(nn)
                nn.fit(X, y)
                quality = roc_auc_score(y, nn.predict_proba(X)[:, 1])
                # Called only to make sure the loss can be computed.
                nn.compute_loss(X, y)

                if quality > 0.8:
                    break
                print('attempt {} : {}'.format(retry, quality))
                if retry == max_retries - 1:
                    raise RuntimeError('quality of model is too low: {} {}'.format(quality, nn))
        return

    # complete mode: exhaustive sweep over all combinations
    for loss in nnet.losses:
        for NNType in nn_types:
            for trainer in nnet.trainers:
                nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42, epochs=100)
                nn.fit(X, y)
                print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

    # Logistic regression printed as a baseline for comparison.
    lr = LogisticRegression().fit(X, y)
    print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

    assert 0 == 1, "Let's see and compare results"
def prepare(imageFolder):
    """Train a LogisticRegression on the images found in ``imageFolder``.

    Only directory entries whose name is exactly 8 characters long are used.
    Each such image is opened, converted to mode "L", and turned into
    training rows by ``getInputFromPic``; the matching targets come from
    ``getOutputFromFileName``.

    :param imageFolder: directory holding the images; a trailing path
        separator is optional
    :return: the fitted LogisticRegression model
    """
    print("preparing images...")
    inputData = []
    outputData = []
    for filename in os.listdir(imageFolder):
        if len(filename) == 8:
            # os.path.join also works when imageFolder has no trailing
            # separator, where plain concatenation built a wrong path.
            picture = Image.open(os.path.join(imageFolder, filename)).convert("L")
            inputData += getInputFromPic(picture, filename)
            outputData += getOutputFromFileName(filename)

    print("training model...")
    model = LogisticRegression()
    model.fit(inputData, outputData)
    return model
def test_liblinear_decision_function_zero():
    # Liblinear itself predicts the positive class when the decision function
    # is exactly zero. This test verifies that the estimator predicts the
    # negative class instead.
    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
    train_X, train_y = make_classification(n_samples=5, n_features=5,
                                           random_state=0)
    model = LogisticRegression(fit_intercept=False)
    model.fit(train_X, train_y)

    # Without an intercept, all-zero inputs give a zero decision function.
    zero_inputs = np.zeros((5, 5))
    predicted = model.predict(zero_inputs)
    assert_array_equal(predicted, np.zeros(5))
def fit_model_2(self, lol = .07, toWrite = False):
    """Fit an L1 logistic regression on each CV split and print its log loss.

    :param lol: value passed as ``C`` to LogisticRegression
    :param toWrite: when true, the model as left after the last split is
        pickled to ``model2/model.pkl``
    """
    model = LogisticRegression(C = lol, penalty = 'l1', tol = 1e-6)

    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        # Only the training part of the split is passed through balance_data.
        X_train, Y_train = self.balance_data(X_train, Y_train)
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 2 Score: %f" % (logloss(Y_test, pred),))

    if toWrite:
        # Pickle output is bytes: open in binary mode, and let the context
        # manager close the file even if dumping fails.
        with open('model2/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def test_logistic_regression_solvers():
    # newton-cg, lbfgs and the default solver must agree on the coefficients
    # of a binary problem (to 3 decimals) when no intercept is fitted.
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    solver_options = [
        ("newton-cg", dict(solver='newton-cg')),
        ("lbfgs", dict(solver='lbfgs')),
        ("default", dict()),
    ]
    coefficients = {}
    for label, options in solver_options:
        model = LogisticRegression(fit_intercept=False, **options)
        model.fit(X, y)
        coefficients[label] = model.coef_

    comparisons = [("newton-cg", "default"),
                   ("default", "lbfgs"),
                   ("newton-cg", "lbfgs")]
    for first, second in comparisons:
        assert_array_almost_equal(coefficients[first], coefficients[second],
                                  decimal=3)
def test_logistic_regression_solvers_multiclass():
    # Same agreement check as the binary solver test, on a 3-class problem
    # and to 4 decimals.
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)

    def fitted_coef(**solver_kwargs):
        estimator = LogisticRegression(fit_intercept=False, **solver_kwargs)
        estimator.fit(X, y)
        return estimator.coef_

    coef_newton = fitted_coef(solver="newton-cg")
    coef_lbfgs = fitted_coef(solver="lbfgs")
    coef_default = fitted_coef()

    assert_array_almost_equal(coef_newton, coef_default, decimal=4)
    assert_array_almost_equal(coef_default, coef_lbfgs, decimal=4)
    assert_array_almost_equal(coef_newton, coef_lbfgs, decimal=4)
def test_max_iter():
    # With a tolerance that can never be met, every solver has to stop
    # exactly at max_iter iterations.
    X = iris.data
    y_bin = iris.target.copy()
    y_bin[y_bin == 2] = 0  # merge class 2 into class 0 -> binary problem

    solvers = ["newton-cg", "liblinear", "sag"]
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append("lbfgs")

    for iteration_cap in range(1, 5):
        for solver_name in solvers:
            model = LogisticRegression(max_iter=iteration_cap, tol=1e-15,
                                       random_state=0, solver=solver_name)
            model.fit(X, y_bin)
            assert_equal(model.n_iter_[0], iteration_cap)
def test_inconsistent_input():
    # fit and predict must raise ValueError when the shapes do not line up.
    rng = np.random.RandomState(0)
    X_ = rng.random_sample((5, 10))
    y_ = np.ones(X_.shape[0])
    y_[0] = 0

    clf = LogisticRegression(random_state=0)

    # Training: one label fewer than there are rows in X_.
    # NOTE(review): the features passed here are `X`, a name defined outside
    # this function, not the local `X_` -- confirm this is intended.
    too_few_labels = y_[:-1]
    assert_raises(ValueError, clf.fit, X, too_few_labels)

    # Prediction: 12 feature columns against a model fitted on 10.
    predict = clf.fit(X_, y_).predict
    wrong_width = rng.random_sample((3, 12))
    assert_raises(ValueError, predict, wrong_width)
def clazzify(train_mat, test_mat, true_train_labels):
    """Fit an L1 logistic regression and label the test matrix.

    Progress is reported through ``logging.info``.

    :param train_mat: training feature matrix
    :param test_mat: feature matrix to predict
    :param true_train_labels: labels for ``train_mat``
    :return: tuple ``(predicted_test_labels, fitted_model)``
    """
    # learn
    logging.info('learning...')
    classifier = LogisticRegression(random_state=17, penalty='l1')
    classifier.fit(train_mat, true_train_labels)
    logging.info('finished learning.')

    # test
    logging.info('testing')
    test_predictions = classifier.predict(test_mat)
    logging.info('finished testing')

    return test_predictions, classifier
def test_multinomial_binary_probabilities():
    # On a binary problem, multinomial LR must return probabilities that
    # follow from its decision function: p1 = e^d / (e^d + e^-d).
    X, y = make_classification()
    model = LogisticRegression(multi_class='multinomial', solver='saga')
    model.fit(X, y)

    decision = model.decision_function(X)
    proba = model.predict_proba(X)

    positive_term = np.exp(decision)
    negative_term = np.exp(-decision)
    expected_class_1 = positive_term / (positive_term + negative_term)
    expected = np.c_[1 - expected_class_1, expected_class_1]

    assert_almost_equal(proba, expected)
def generate_submission():
    """Train on the full training set and write the Kaggle submission file.

    Sets the module-level names ``alg``, ``predictions`` and ``submission``,
    and writes ``kaggle.csv`` (columns ``PassengerId`` and ``Survived``) to
    the current directory. Reads the module-level ``train``, ``test`` and
    ``predictors``.
    """
    global alg, predictions, submission

    # Initialize the algorithm class
    alg = LogisticRegression(random_state=1)
    # Train the algorithm using all the training data
    alg.fit(train[predictors], train["Survived"])
    # Make predictions using the test set.
    predictions = alg.predict(test[predictors])

    # Create a new dataframe with only the columns Kaggle wants from the dataset.
    submission = pandas.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv("kaggle.csv", index=False)
    # The message now names the file that was actually written.
    print("kaggle.csv is generated")
def test_scikit_learn_exploded_data(self):
    # Fit scikit-learn and the L1 path fitter on the same exploded
    # observations and compare intercept + coefficients to 2 decimals.
    true_betas = [0.001, 0.07, 0.4]
    observations = create_random_observations(200, 2, true_betas)
    exploded = explode_matrix(observations)

    # scikit-learn sees all but the last two columns as features and the
    # last column as the target.
    features = exploded[:, :-2]
    target = exploded[:, -1]
    sk_model = LogisticRegression(fit_intercept=True)
    sk_model.fit(features, target)

    self.logitfitL1.fit(exploded, self.lambda_grid)

    expected = np.append(sk_model.intercept_[0], sk_model.coef_)
    np.testing.assert_almost_equal(expected, self.logitfitL1.coef_, 2)
def test_predict_iris():
    """Check class labels, accuracy and probabilities on the iris dataset."""
    n_samples, n_features = iris.data.shape
    # Train on the string class names rather than the integer codes.
    names = iris.target_names[iris.target]

    model = LogisticRegression(C=len(iris.data)).fit(iris.data, names)
    assert_array_equal(np.unique(names), model.classes_)

    predicted_names = model.predict(iris.data)
    accuracy = np.mean(predicted_names == names)
    assert_greater(accuracy, .95)

    # Each row of probabilities must sum to one.
    probabilities = model.predict_proba(iris.data)
    row_totals = probabilities.sum(axis=1)
    assert_array_almost_equal(row_totals, np.ones(n_samples))

    # The most probable class must reach the same accuracy.
    most_likely = iris.target_names[probabilities.argmax(axis=1)]
    assert_greater(np.mean(most_likely == names), .95)
def test_max_iter():
    # With a tolerance that can never be met, every solver / multi_class
    # combination has to stop exactly at max_iter iterations.
    X = iris.data
    y_bin = iris.target.copy()
    y_bin[y_bin == 2] = 0  # merge class 2 into class 0 -> binary problem

    solvers = ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']
    # The liblinear + multinomial pairing is excluded from the check.
    combinations = [
        (solver, multi_class)
        for solver in solvers
        for multi_class in ['ovr', 'multinomial']
        if not (solver == 'liblinear' and multi_class == 'multinomial')
    ]

    for iteration_cap in range(1, 5):
        for solver, multi_class in combinations:
            model = LogisticRegression(max_iter=iteration_cap, tol=1e-15,
                                       multi_class=multi_class,
                                       random_state=0, solver=solver)
            model.fit(X, y_bin)
            assert_equal(model.n_iter_[0], iteration_cap)
def test_nnet(n_samples=200, n_features=5, distance=0.5, complete=False):
    """
    Fit the nnet estimators on two blobs and check their ROC AUC.

    :param n_samples: number of samples to generate
    :param n_features: dimensionality of the two blob centers
    :param distance: each center is +/- ``distance`` along every axis
    :param complete: if True, every loss / network type / trainer combination
        is fitted and its quality printed, then the test fails on purpose so
        that the printed results can be inspected
    """
    X, y = make_blobs(
        n_samples=n_samples,
        # Use the parameter rather than a hard-coded 5 so it matches the
        # dimensionality of the centers below.
        n_features=n_features,
        centers=[numpy.ones(n_features) * distance, -numpy.ones(n_features) * distance],
    )

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MultiLayerNetwork,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42)
                    nn.fit(X, y, epochs=100)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        # Logistic regression printed as a baseline for comparison.
        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types, trainers, most of them are used once during tests.
        # The number of attempts is fixed at 4 (the original computed a
        # maximum first and immediately overwrote it with this value).
        attempts = 4
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        # list(...) makes the keys indexable on Python 3, where dict.keys()
        # returns a view that does not support subscripting.
        loss_names = list(nnet.losses.keys())
        trainer_names = list(nnet.trainers.keys())
        for attempt in range(attempts):
            loss = loss_names[(attempt + losses_shift) % len(nnet.losses)]
            trainer = trainer_names[(attempt + trainers_shift) % len(nnet.trainers)]

            nn_type = nn_types[attempt % len(nn_types)]

            nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42)
            print(nn)
            nn.fit(X, y, epochs=200)
            assert roc_auc_score(y, nn.predict_proba(X)[:, 1]) > 0.8, "quality of model is too low: {}".format(nn)
class mentoryWEB:
    """TF-IDF + logistic regression text classifier trained from a TSV file."""

    def __init__(self, file):
        """Fit the vectoriser and the classifier.

        ``file`` is read as a header-less, tab-separated table; column 0
        holds the labels and column 1 the raw texts.
        """
        self.vect = TfidfVectorizer(
            max_df=0.25,
            stop_words=None,
            max_features=2500,
            ngram_range=(1, 2),
            use_idf=True,
            norm='l2',
        )
        table = pd.read_csv(file, delimiter='\t', header=None)
        texts = table[1]
        labels = table[0]
        train_matrix = self.vect.fit_transform(texts)

        self.clf = LogisticRegression(penalty='l2', C=10)
        self.clf.fit(train_matrix, labels)

    def test(self, string):
        """Return the predicted label for a single text."""
        vectorised = self.vect.transform([string])
        return self.clf.predict(vectorised)[0]
def test_logistic_regression_sample_weights():
    # Checks that sample_weight is honoured consistently: unit weights equal
    # no weights, all solvers agree on a weighted fit, and class_weight can
    # be emulated through per-sample weights.
    X, y = make_classification(n_samples=20, n_features=5, n_informative=3, n_classes=2, random_state=0)
    # Weight 1 for class 0 and weight 2 for class 1.
    sample_weight = y + 1

    for LR in [LogisticRegression, LogisticRegressionCV]:

        # Test that passing sample_weight as ones is the same as
        # not passing them at all (default None)
        for solver in ['lbfgs', 'liblinear']:
            clf_sw_none = LR(solver=solver, fit_intercept=False, random_state=42)
            clf_sw_none.fit(X, y)
            clf_sw_ones = LR(solver=solver, fit_intercept=False, random_state=42)
            clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
            assert_array_almost_equal(clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)

        # Test that sample weights work the same with the lbfgs,
        # newton-cg, and 'sag' solvers
        clf_sw_lbfgs = LR(solver='lbfgs', fit_intercept=False, random_state=42)
        clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight)
        clf_sw_n = LR(solver='newton-cg', fit_intercept=False, random_state=42)
        clf_sw_n.fit(X, y, sample_weight=sample_weight)
        clf_sw_sag = LR(solver='sag', fit_intercept=False, tol=1e-10, random_state=42)
        # ignore convergence warning due to small dataset
        with ignore_warnings():
            clf_sw_sag.fit(X, y, sample_weight=sample_weight)
        clf_sw_liblinear = LR(solver='liblinear', fit_intercept=False, random_state=42)
        clf_sw_liblinear.fit(X, y, sample_weight=sample_weight)
        # lbfgs is the reference the other three solvers are compared with.
        assert_array_almost_equal(clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
        assert_array_almost_equal(clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
        assert_array_almost_equal(clf_sw_lbfgs.coef_, clf_sw_liblinear.coef_, decimal=4)

        # Test that passing class_weight as [1,2] is the same as
        # passing class weight = [1,1] but adjusting sample weights
        # to be 2 for all instances of class 2
        for solver in ['lbfgs', 'liblinear']:
            clf_cw_12 = LR(solver=solver, fit_intercept=False, class_weight={ 0: 1, 1: 2 }, random_state=42)
            clf_cw_12.fit(X, y)
            clf_sw_12 = LR(solver=solver, fit_intercept=False, random_state=42)
            clf_sw_12.fit(X, y, sample_weight=sample_weight)
            assert_array_almost_equal(clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)

    # Test the above for l1 penalty and l2 penalty with dual=True.
    # since the patched liblinear code is different.
    clf_cw = LogisticRegression(solver="liblinear", fit_intercept=False, class_weight={ 0: 1, 1: 2 }, penalty="l1", tol=1e-5, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(solver="liblinear", fit_intercept=False, penalty="l1", tol=1e-5, random_state=42)
    # sample_weight is passed positionally here (third argument of fit).
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)

    clf_cw = LogisticRegression(solver="liblinear", fit_intercept=False, class_weight={ 0: 1, 1: 2 }, penalty="l2", dual=True, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(solver="liblinear", fit_intercept=False, penalty="l2", dual=True, random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)
def test_logistic_regression_class_weights():
    # class_weight="balanced" must give the same coefficients as passing the
    # explicitly computed class-weight dictionary.
    scenarios = [
        # Multinomial case: remove 90% of class 0
        (slice(45, None), ("lbfgs", "newton-cg"), "multinomial", 4),
        # Binary case: remove 90% of class 0 and 100% of class 2
        (slice(45, 100), ("lbfgs", "newton-cg", "liblinear"), "ovr", 6),
    ]

    for rows, solvers, multi_class, decimal in scenarios:
        X = iris.data[rows, :]
        y = iris.target[rows]
        class_weight_dict = _compute_class_weight_dictionary(y)

        for solver in solvers:
            balanced_model = LogisticRegression(solver=solver,
                                                multi_class=multi_class,
                                                class_weight="balanced")
            explicit_model = LogisticRegression(solver=solver,
                                                multi_class=multi_class,
                                                class_weight=class_weight_dict)
            balanced_model.fit(X, y)
            explicit_model.fit(X, y)
            assert_array_almost_equal(balanced_model.coef_,
                                      explicit_model.coef_, decimal=decimal)
def test_logistic_regression_multinomial():
    # Multinomial logistic regression: coefficient shapes, agreement between
    # solvers, and agreement with LogisticRegressionCV on a single C.
    n_samples, n_features, n_classes = 50, 20, 3
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_informative=10, n_classes=n_classes,
                               random_state=0)
    coef_shape = (n_classes, n_features)

    # 'lbfgs' provides the reference solution, with and without intercept.
    solver = 'lbfgs'
    ref_i = LogisticRegression(solver=solver, multi_class='multinomial')
    ref_w = LogisticRegression(solver=solver, multi_class='multinomial',
                               fit_intercept=False)
    ref_i.fit(X, y)
    ref_w.fit(X, y)
    assert_array_equal(ref_i.coef_.shape, coef_shape)
    assert_array_equal(ref_w.coef_.shape, coef_shape)

    tight = dict(multi_class='multinomial', random_state=42, max_iter=1000,
                 tol=1e-6)
    for solver in ['sag', 'newton-cg']:
        with_intercept = LogisticRegression(solver=solver, **tight)
        without_intercept = LogisticRegression(solver=solver,
                                               fit_intercept=False, **tight)
        with_intercept.fit(X, y)
        without_intercept.fit(X, y)
        assert_array_equal(with_intercept.coef_.shape, coef_shape)
        assert_array_equal(without_intercept.coef_.shape, coef_shape)

        # Compare solutions between lbfgs and the other solvers
        assert_almost_equal(ref_i.coef_, with_intercept.coef_, decimal=3)
        assert_almost_equal(ref_w.coef_, without_intercept.coef_, decimal=3)
        assert_almost_equal(ref_i.intercept_, with_intercept.intercept_,
                            decimal=3)

    # The CV path should give almost the same result. It averages the
    # coefficients fitted across the folds, so it need not match exactly.
    for solver in ['lbfgs', 'newton-cg', 'sag']:
        cv_model = LogisticRegressionCV(solver=solver, max_iter=2000,
                                        tol=1e-6, multi_class='multinomial',
                                        Cs=[1.])
        cv_model.fit(X, y)
        assert_array_almost_equal(cv_model.coef_, ref_i.coef_, decimal=3)
        assert_almost_equal(cv_model.intercept_, ref_i.intercept_, decimal=3)
# Human-readable name of each train/test pair; label[i] is printed for the
# i-th entry of `training`.
label=[
    "en clinton test",
    "fr macron test",
    "fr lepen test",
    "it referendum test",
    "ca indipendencia test",
    "es indipendencia test",
]
# Classifiers to run, keyed by a short display name.
clfs = {
    "NB" : GaussianNB(),
    "SVM": SVC(kernel="linear"),
    "LR" : LogisticRegression()
}
# For every dataset and every classifier: print which combination is being
# processed and extract the stance targets of the train and test tweets.
# NOTE(review): the loop body visible here only prepares the stance arrays;
# fitting/prediction is presumably done elsewhere -- confirm.
for i in range(0,len(training)):
    for key, clf in clfs.items():
        print(key,label[i])
        tweets_training=training[i]
        tweets_test=test[i]
        stance_training=numpy.array(feature_manager.get_stance(tweets_training))
        stance_test=numpy.array(feature_manager.get_stance(tweets_test))
def test_multinomial_validation():
    # A negative C must be rejected with ValueError at fit time by every
    # solver that supports the multinomial loss.
    features = [[0, 1], [1, 0]]
    targets = [0, 1]
    for solver_name in ['lbfgs', 'newton-cg', 'sag']:
        invalid_model = LogisticRegression(C=-1, solver=solver_name,
                                           multi_class='multinomial')
        assert_raises(ValueError, invalid_model.fit, features, targets)
def test_liblinear_dual_random_state():
    # random_state is relevant for liblinear solver only if dual=True
    X, y = make_classification(n_samples=20, random_state=0)

    # Three models stopped after one iteration: two share a seed, the third
    # uses a different one.
    fitted = []
    for seed in (0, 0, 8):
        model = LogisticRegression(random_state=seed, dual=True, max_iter=1,
                                   tol=1e-15)
        model.fit(X, y)
        fitted.append(model)
    same_seed_a, same_seed_b, other_seed = fitted

    # same result for same random state
    assert_array_almost_equal(same_seed_a.coef_, same_seed_b.coef_)

    # different results for different random states: the equality assertion
    # itself is expected to fail with this message.
    msg = "Arrays are not almost equal to 6 decimals"
    assert_raise_message(AssertionError, msg, assert_array_almost_equal,
                         same_seed_a.coef_, other_seed.coef_)
def gbdt_lr_train(libsvmFileName):
    """Compare GBDT, LR, LR on GBDT leaf features and LR on combined features.

    Loads a libsvm-format file, holds out 30% of it for testing, and prints
    the test AUC of four models: a gradient-boosting classifier, a logistic
    regression on the raw features, a logistic regression on one-hot encoded
    GBDT leaf indices, and a logistic regression on leaf encodings stacked
    with the raw features.
    """
    # Load the samples and split them into train / test parts
    features, targets = load_svmlight_file(libsvmFileName)
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.3, random_state=42)

    # GBDT model: define, train, evaluate
    booster = GradientBoostingClassifier(n_estimators=40, max_depth=3,
                                         verbose=0, max_features=0.5)
    booster.fit(X_train, y_train)
    booster_scores = booster.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, booster_scores)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR trained on the original features
    raw_lr = LogisticRegression()
    raw_lr.fit(X_train, y_train)
    raw_scores = raw_lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, raw_scores)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode the original features as GBDT leaf indices
    train_leaves = booster.apply(X_train)[:, :, 0]
    test_leaves = booster.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices of train and test together
    train_rows = train_leaves.shape[0]
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(
        np.concatenate((train_leaves, test_leaves), axis=0))
    encoded_train = encoded[:train_rows, :]
    encoded_test = encoded[train_rows:, :]

    # LR trained on the GBDT-encoded features
    leaf_lr = LogisticRegression()
    leaf_lr.fit(encoded_train, y_train)
    leaf_scores = leaf_lr.predict_proba(encoded_test)[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, leaf_scores)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR trained on the combination of encoded and original features
    combined_lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([encoded[:train_rows, :], X_train])
    X_test_ext = hstack([encoded[train_rows:, :], X_test])
    print(X_train_ext.shape)
    combined_lr.fit(X_train_ext, y_train)
    combined_scores = combined_lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, combined_scores)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA

h = .02  # step size in the mesh

from pyearth.earth import Earth
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

np.random.seed(1)

# Combine Earth with LogisticRegression in a pipeline to do classification
earth_classifier = Pipeline([
    ('earth', Earth(max_degree=3, penalty=1.5)),
    ('logistic', LogisticRegression()),
])

# Each display name is paired with its estimator so the two parallel lists
# below cannot drift out of step.
_named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10,
                                             max_features=1)),
    ("Naive Bayes", GaussianNB()),
    ("LDA", LDA()),
    ("QDA", QDA()),
    ("Earth", earth_classifier),
]
names, classifiers = (list(column) for column in zip(*_named_classifiers))
im = mh.imread(fname, as_grey=True) haralicks.append(texture(im)) sobels.append(edginess_sobel(im)) # Files are named like building00.jpg, scene23.jpg... labels.append(fname[:-len('xx.jpg')]) print('Finished computing features.') haralicks = np.array(haralicks) sobels = np.array(sobels) labels = np.array(labels) # We use logistic regression because it is very fast. # Feel free to experiment with other classifiers scores = cross_validation.cross_val_score(LogisticRegression(), haralicks, labels, cv=5) print('Accuracy (5 fold x-val) with Logistic Regrssion [std features]: {}%'. format(0.1 * round(1000 * scores.mean()))) haralick_plus_sobel = np.hstack([np.atleast_2d(sobels).T, haralicks]) scores = cross_validation.cross_val_score(LogisticRegression(), haralick_plus_sobel, labels, cv=5).mean() print( 'Accuracy (5 fold x-val) with Logistic Regrssion [std features + sobel]: {}%' .format(0.1 * round(1000 * scores.mean())))
y_test = []
accuracy_meta_train = []

# Translate the first 2708 entries of `labels` through the LABEL mapping.
for temp in range(2708):
    y.append(LABEL[labels[temp]])
y = np.array(y)

class_label = [0, 1, 2, 3, 4, 5, 6]
# Every unordered pair of classes is used once as the test pair.
combination = list(combinations(class_label, 2))

for i, held_out_pair in enumerate(combination):
    print('Cross_Validation: ', i + 1)
    test_label = list(held_out_pair)
    train_label = [n for n in class_label if n not in test_label]
    print('Cross_Validation {} Train_Label_List {}: '.format(
        i + 1, train_label))
    print('Cross_Validation {} Test_Label_List {}: '.format(i + 1, test_label))
    classifier = LogisticRegression()

    for j in range(50):
        labels_local = labels.copy()
        # Draw two distinct training classes for this iteration.
        select_class = random.sample(train_label, 2)
        print('Cross_Validation {} ITERATION {} Train_Label: {}'.format(
            i + 1, j + 1, select_class))

        class1_idx = []
        class2_idx = []
        for k in range(2708):
            current = labels_local[k]
            if current == LABEL2[select_class[0]]:
                class1_idx.append(k)
                labels_local[k] = LABEL2[select_class[0]]
            elif current == LABEL2[select_class[1]]:
                class2_idx.append(k)
                labels_local[k] = LABEL2[select_class[1]]
# Public import paths: `sklearn.datasets.samples_generator` and
# `sklearn.linear_model.logistic` are private modules that were deprecated
# and later removed; the names below are exported by every release.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

if __name__ == '__main__':
    # X holds the sample features, y the class labels
    X, y = make_classification(n_samples=80000)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    # Split the training half again: one part fits the GBDT and the encoder,
    # the other part fits the logistic regression.
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.5)

    grd = GradientBoostingClassifier(n_estimators=10)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()

    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])

    # Build features with the trained GBDT, one-hot encode them, and feed
    # them to the LR model as its training input.
    ''' 使用训练好的GBDT模型构建特征,然后将特征经过one-hot编码作为新的特征输入到LR模型训练。 '''
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

    # Predict on X_test with the trained LR model
    y_pred_grd_lm = grd_lm.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]

    # Compute the ROC curve from the predictions
    fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(y_test, y_pred_grd_lm)
    print(grd.apply(X_train).shape)
def _fit_and_score(model, train_data, train_label):
    """Fit *model* and return ``(accuracy, macro_f1)`` on the test split.

    The test split is read from the module-level ``test_data`` and
    ``test_label`` names, exactly as ``classify`` does.
    """
    model.fit(train_data, train_label)
    predicted = model.predict(test_data)
    return (accuracy_score(test_label, predicted),
            f1_score(test_label, predicted, average='macro'))


def classify(train_data, train_label):
    """Train a set of baseline classifiers and score each on the test split.

    Models are built and fitted in a fixed order (LightGBM, AdaBoost, random
    forest, SVM, decision tree, MLP, KNN, logistic regression) after the
    nested-LSTM baseline has been evaluated by ``Nestlstm_models``.

    Args:
        train_data: training feature matrix.
        train_label: training labels; flattened with ``ravel`` before use.

    Returns:
        A flat tuple of nine accuracies followed by nine macro-F1 scores, in
        the order NLSTM, lgbm, AdaBoost, RF, SVM, DT, MLP, KNN, LG.
    """
    train_label = train_label.ravel()  # flatten the labels to 1-D

    # The plain LSTM and GRU baselines are currently disabled.
    NLSTM_AC, NLSTM_f1 = Nestlstm_models()

    lgbm_AC, lgbm_f1 = _fit_and_score(
        LGBMClassifier(max_depth=5, num_leaves=25, learning_rate=0.007,
                       n_estimators=1000, min_child_samples=80,
                       subsample=0.8, colsample_bytree=1, reg_alpha=0,
                       reg_lambda=0, random_state=np.random.randint(10e6)),
        train_data, train_label)

    AdaBoost_AC, AdaBoost_f1 = _fit_and_score(
        AdaBoostClassifier(base_estimator=None, n_estimators=50,
                           learning_rate=1, algorithm='SAMME.R',
                           random_state=None),
        train_data, train_label)

    # Random forest
    RF_AC, RF_f1 = _fit_and_score(
        RandomForestClassifier(n_estimators=40, max_depth=None,
                               min_samples_split=2, random_state=2),
        train_data, train_label)

    # SVM; the former extra predict() on the training data was discarded
    # and has been dropped.
    SVM_AC, SVM_f1 = _fit_and_score(
        SVC(kernel='rbf', C=9, gamma=0.1, probability=True),
        train_data, train_label)

    # decision tree
    DT_AC, DT_f1 = _fit_and_score(DecisionTreeClassifier(),
                                  train_data, train_label)

    MLP_AC, MLP_f1 = _fit_and_score(
        MLPClassifier(solver='lbfgs', alpha=1e-4,
                      hidden_layer_sizes=(100, 3), random_state=1),
        train_data, train_label)

    # KNN
    KNN_AC, KNN_f1 = _fit_and_score(KNeighborsClassifier(n_neighbors=3),
                                    train_data, train_label)

    # LogisticRegression
    LG_AC, LG_f1 = _fit_and_score(LogisticRegression(),
                                  train_data, train_label)

    return (NLSTM_AC, lgbm_AC, AdaBoost_AC, RF_AC, SVM_AC, DT_AC, MLP_AC,
            KNN_AC, LG_AC, NLSTM_f1, lgbm_f1, AdaBoost_f1, RF_f1, SVM_f1,
            DT_f1, MLP_f1, KNN_f1, LG_f1)
def initialize_with_logistic_regression(self, zs, xs, initialize=False):
    """Initialize ``self.W`` and ``self.logpi`` from a logistic regression.

    A multinomial logistic regression is fitted to predict each next state
    from the one-hot encoded previous state and the previous covariates.
    Its coefficients are then copied into ``self.W`` (covariate weights) and
    ``self.logpi`` (previous-state weights plus intercept).

    Args:
        zs: a 1-D int32 state array, or an iterable of such arrays.
        xs: a 2-D covariate array, or an iterable of such arrays, aligned
            with ``zs``.
        initialize: accepted for interface compatibility; not used here.

    The fitted estimator is cached on ``self._lr`` and reused (warm start)
    on later calls.
    """
    # Public import path; `sklearn.linear_model.logistic` is a private
    # module that newer scikit-learn releases no longer provide.
    from sklearn.linear_model import LogisticRegression
    if not hasattr(self, '_lr'):
        self._lr = LogisticRegression(verbose=False,
                                      multi_class="multinomial",
                                      solver="lbfgs",
                                      warm_start=True,
                                      max_iter=10)
    lr = self._lr

    # Make the covariates
    K, D = self.num_states, self.covariate_dim

    # Split zs into prevs and nexts
    if isinstance(zs, np.ndarray):
        zps, zns = zs[:-1], zs[1:]
    else:
        zps = np.concatenate([z[:-1] for z in zs], axis=0)
        zns = np.concatenate([z[1:] for z in zs], axis=0)
    if isinstance(xs, np.ndarray):
        xps = xs[:-1]
    else:
        xps = np.concatenate([x[:-1] for x in xs], axis=0)

    assert zps.shape[0] == xps.shape[0]
    assert zps.ndim == 1 and zps.dtype == np.int32 \
        and zps.min() >= 0 and zps.max() < K
    assert zns.ndim == 1 and zns.dtype == np.int32 \
        and zns.min() >= 0 and zns.max() < K
    assert xps.ndim == 2 and xps.shape[1] == D

    # Which states ever occur as a "next" state
    used = np.bincount(zns, minlength=K) > 0
    K_used = np.sum(used)

    lr_X = np.column_stack((one_hot(zps, K), xps))
    lr_y = zns

    # The logistic regression solver fails if we only have one class
    # represented. In this case, set the regression weights to zero and set
    # logpi to have high probability of the visited class.
    if K_used == 1:
        self.W = np.zeros((D, K))
        # Written to `logpi`, the attribute every other branch sets; this
        # branch previously wrote a differently named `log_pi` attribute.
        self.logpi = np.zeros((K, K))
        self.logpi[:, used] = 3.0
    else:
        lr.fit(lr_X, lr_y)

    # Now convert the logistic regression into weights
    if K_used > 2:
        self.W = np.zeros((D, K))
        self.W[:, used] = lr.coef_[:, K:].T
        self.logpi = np.zeros((K, K))
        self.logpi[:, used] = lr.coef_[:, :K].T
        self.logpi[:, used] += lr.intercept_[None, :]
        self.logpi[:, ~used] += -100.
    elif K_used == 2:
        # LogisticRegression object only represents one
        # set of weights for binary problems
        # NOTE(review): the weights are written to column 1 regardless of
        # which two states are used — confirm this is intended when the
        # used states are not {0, 1}.
        self.W = np.zeros((D, K))
        self.W[:, 1] = lr.coef_[0, K:]
        self.logpi = np.zeros((K, K))
        self.logpi[:, 1] = lr.coef_[0, :K].T
        self.logpi[:, 1] += lr.intercept_
def test_logistic_regression_solvers_multiclass():
    """Check that all solvers agree on ``coef_`` for a 3-class problem."""
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-6
    shared = dict(fit_intercept=False, tol=tol)

    ncg = LogisticRegression(solver='newton-cg', **shared)
    lbf = LogisticRegression(solver='lbfgs', **shared)
    lib = LogisticRegression(**shared)  # default solver
    sag = LogisticRegression(solver='sag', max_iter=1000, random_state=42,
                             **shared)

    for model in (ncg, lbf, sag, lib):
        model.fit(X, y)

    # Every pair of solvers must match to 4 decimals.
    solver_pairs = ((ncg, lib), (lib, lbf), (ncg, lbf),
                    (sag, lib), (sag, ncg), (sag, lbf))
    for left, right in solver_pairs:
        assert_array_almost_equal(left.coef_, right.coef_, decimal=4)
def test_logistic_regression_solvers():
    """Check that all solvers agree on ``coef_`` for the default problem."""
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    lib = LogisticRegression(fit_intercept=False)  # default solver
    sag = LogisticRegression(solver='sag', fit_intercept=False,
                             random_state=42)

    for model in (ncg, lbf, sag, lib):
        model.fit(X, y)

    # Every pair of solvers must match to 3 decimals.
    solver_pairs = ((ncg, lib), (lib, lbf), (ncg, lbf),
                    (sag, lib), (sag, ncg), (sag, lbf))
    for left, right in solver_pairs:
        assert_array_almost_equal(left.coef_, right.coef_, decimal=3)
# Take the training set and the targets from the dataset.
trainset = get_trainset(dataset)
targets = get_target_dataset(dataset)

# Drop the statistics of the users' contents, then standardize what is left.
trainset_without_stats = StandardScale_dataset(drop_stats(trainset))

# Train without the statistics of the users' contents.
x_train, x_test, y_train, y_test = train_test_split(
    trainset_without_stats, targets, test_size=0.2, random_state=12345)
lr = LogisticRegression(solver='lbfgs')
lr.fit(x_train, y_train)
predictions = lr.predict(x_test)

print("\nPERFORMANCE WITHOUT THE STATISTICS OF THE USERS CONTENTS: ")
print("\nCONFUSION MATRIX:")
print(confusion_matrix(y_test, predictions))
print("\nCLASSIFICATION REPORT:")
print(classification_report(y_test, predictions))

# Classification of only public profiles.
dataset_publics = drop_NaN_entries(dataset)
trainset_publics = get_trainset(dataset_publics)
targets_publics = get_target_dataset(dataset_publics)
for i in range(50): #set range to contain number of csv files new = pd.read_csv(path + file + str(i + 1) + ".csv") raw = pd.concat([raw, new]) raw = raw[['content', 'troll']] #these can be played with, currently set to ignore words in more than half or less than 100 vectorizer = TfidfVectorizer(min_df=100, max_df=0.5) c = vectorizer.fit_transform(raw['content']) dictionary = vectorizer.get_feature_names() return c, raw['troll'], dictionary X, y, dictionary = load_data("tweet_data_batch") X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True) model1 = LogisticRegression( max_iter=400, n_jobs=-1 ) #increased arbitariy to 400 since default iterations reach limit ''' scores_clf_svc_cv1 = cross_val_score(model1,X,y,cv=5) print("LogReg Accuracy: %0.2f (+/- %0.2f)" % (scores_clf_svc_cv1.mean(), scores_clf_svc_cv1.std() * 2)) # print accuracy ''' model1.fit(X_train, y_train) print("LogReg Accuracy:\n", model1.score(X_test, y_test)) predic1 = model1.predict(X_test) print("LogReg matrix:", metrics.confusion_matrix(y_test, predic1)) model2 = Perceptron() ''' scores_clf_svc_cv2 = cross_val_score(model2,X,y,cv=5) print("Perceptron Accuracy: %0.2f (+/- %0.2f)" % (scores_clf_svc_cv2.mean(), scores_clf_svc_cv2.std() * 2)) # print accuracy '''
def tokenize_porter(text):
    """Split *text* on whitespace and stem each token with ``porter``."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems


stop = stopwords.words("english")

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 1))
X = vectorizer.fit_transform(movie["review"])
# NOTE(review): this prints the bound method object, not the feature names;
# the call form is left commented out below — confirm which is intended.
print(vectorizer.get_feature_names)
#print(vectorizer.get_feature_names())

train_x, test_x, train_y, test_y = train_test_split(
    X, movie["sentiment"], test_size=0.2, random_state=42)
print(train_x.shape, train_y.shape)

clf = LogisticRegression()
clf.fit(train_x, train_y)

# Predicted labels for the held-out split
print(clf.predict(test_x))

# 5-fold cross-validation score, computed on the held-out split
scores = cross_val_score(clf, test_x, test_y, cv=5)
acc = scores.mean()
print("Accuracy: %0.2f percent" % (acc * 100))
def test_nan():
    """Test proper NaN handling.

    Regression test for Issue #252: fit used to go into an infinite loop.
    """
    data_with_nan = np.array(X, dtype=np.float64)
    data_with_nan[0, 1] = np.nan
    model = LogisticRegression(random_state=0)
    model.fit(data_with_nan, Y1)
#对测试集进行预测 from sklearn.model_selection import train_test_split x_data=data[:,:-1] y_data=data[:,-1] prediction_list=[] for i in range(5): x_train, x_val, y_train, y_val = train_test_split(data[:,:-1], data[:,-1], test_size=0.2) x_train_new,x_test_new =PCA_Reduction(x_train,x_test) print(x_test_new.shape) clf = SVC(kernel='linear',verbose=1) clf.fit(x_train_new, y_train) y_predition_test=clf.predict(x_test_new) prediction_list.append(y_predition_test) classifier=LogisticRegression() classifier.fit(x_train_new,y_train) y_predict=classifier.predict(x_test_new) prediction_list.append(y_predict) dtree=RandomForestClassifier(criterion='gini',max_depth=120,min_impurity_decrease=0) dtree.fit(x_train_new,y_train) pred=dtree.predict(x_test_new) prediction_list.append(pred) print(prediction_list) prediction_result=np.array(prediction_list).T print(prediction_result.shape) test_result=[] for x in prediction_result:
def test_logreg_intercept_scaling_zero():
    """Test that intercept_scaling is ignored when fit_intercept is False."""
    model = LogisticRegression(fit_intercept=False)
    model.fit(X, Y1)
    assert_equal(model.intercept_, 0.)
def test_logreg_predict_proba_multinomial():
    """Compare log-losses of multinomial, OvR and logistic probabilities."""
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    clf_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    clf_multi.fit(X, y)
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))

    clf_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    clf_ovr.fit(X, y)
    clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X))

    assert_greater(clf_ovr_loss, clf_multi_loss)

    # Predicted probabilities using the soft-max function should give a
    # smaller loss than those using the logistic function.
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X))
    assert_greater(clf_wrong_loss, clf_multi_loss)
predict_me = np.array(X[i].astype(float)) predict_me = predict_me.reshape(-1, len(predict_me)) prediction = clf.predict(predict_me) if prediction == y[i]: correct += 1 print (float(correct)/float(len(X))) ''' from sklearn.linear_model.logistic import LogisticRegression from sklearn.model_selection import GridSearchCV from sklearn import decomposition from sklearn.pipeline import Pipeline logistic = LogisticRegression() pca = decomposition.PCA() pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)]) X = np.array(df.drop('survived', 1)) X = preprocessing.scale(X) print X.shape y = np.array(df['survived']) print y.shape clf = pca.fit_transform(X, y) plt.figure(1, figsize=(5, 5)) plt.clf() plt.axes([.2, .2, .7, .7]) plt.plot(pca.explained_variance_, linewidth=2) plt.axis('tight') plt.xlabel('n_components')
def crossval(features, labels, vec):
    """Cross-validate an L1 logistic regression and print a report.

    Runs an unshuffled 10-fold split and prints, in order: the features that
    rank among the 20 highest (respectively 20 lowest) coefficients in every
    fold, the mean and standard deviation of each metric, and the 20 highest
    and 20 lowest coefficients of a model fitted on all the data.

    Args:
        features: feature matrix, indexable by arrays of row indices.
        labels: label array aligned with the rows of ``features``.
        vec: fitted vectorizer exposing ``feature_names_``, in the same
            order as the columns of ``features``.

    Returns:
        None; the results are printed.
    """
    maxent = LogisticRegression(penalty='l1')
    #maxent = SGDClassifier(penalty='l1')
    #maxent = Perceptron(penalty='l1')

    feature_names = vec.feature_names_
    # Start from all features and keep only those that are extreme in
    # every fold.
    posfeats = set(feature_names)
    negfeats = set(feature_names)
    scores = defaultdict(list)

    folds = cross_validation.KFold(n=features.shape[0], n_folds=10,
                                   shuffle=False, random_state=None)
    for TrainIndices, TestIndices in folds:
        maxent.fit(features[TrainIndices], labels[TrainIndices])
        Testy_i = labels[TestIndices]
        ypred_i = maxent.predict(features[TestIndices])

        # Rank this fold's coefficients. The previous code ranked a counter
        # of feature-name occurrences, which never held any coefficient.
        coeffcounter_i = Counter(dict(zip(feature_names, maxent.coef_[0])))
        ranked_i = coeffcounter_i.most_common()
        posfeats &= set(name for name, _ in ranked_i[:20])
        negfeats &= set(name for name, _ in ranked_i[-20:])

        # scikit-learn metrics take (y_true, y_pred); passing them the other
        # way round swaps precision and recall.
        acc = accuracy_score(Testy_i, ypred_i)
        pre = precision_score(Testy_i, ypred_i)
        rec = recall_score(Testy_i, ypred_i)
        # shared task uses f1 of *accuracy* and recall!
        f1 = 2 * acc * rec / (acc + rec) if (acc + rec) else 0.0

        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        scores["Precision"].append(pre)
        scores["Recall"].append(rec)

    print("Pervasive positive: ", posfeats)
    print("Pervasive negative: ", negfeats)

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(),
                                          currentmetric.std()))
    print("--")

    maxent.fit(features, labels)  # fit on everything
    TotalCoeffCounter = Counter(dict(zip(feature_names, maxent.coef_[0])))
    ranked_total = TotalCoeffCounter.most_common()

    for (key, value) in ranked_total[:20]:
        print(key, value)
    print("---")
    for (key, value) in ranked_total[-20:]:
        print(key, value)
    # Report the extremes of the fitted coefficients.
    print("lowest coeff:", ranked_total[-1])
    print("highest coeff", ranked_total[0])
# Support vector classification
clf = SVC()
clf.fit(X_train, y_train)
print("使用支持向量分类算法分类结果:")
print(clf.score(X_test, y_test))

# Nu support vector classification
clf = NuSVC()
clf.fit(X_train, y_train)
print("使用支持向量分类算法分类结果:")
print(clf.score(X_test, y_test))

# Naive Bayes classification
clf = GaussianNB()
clf.fit(X_train, y_train)
print("使用朴素贝叶斯分类算法分类结果:")
print(clf.score(X_test, y_test))

# Logistic regression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print("使用逻辑回归算法分类结果:")
print(classifier.score(X_test, y_test))

# Decision tree
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
print("使用决策树算法分类结果:")
print(classifier.score(X_test, y_test))

# Gradient boosted decision trees
classifier = GradientBoostingClassifier(n_estimators=200)
classifier.fit(X_train, y_train)
print("使用GBDT算法分类结果:")
print(classifier.score(X_test, y_test))
# +
# calculate train/test data number (first 80% of the rows for training)
N = len(digits)
N_train = int(N*0.8)
N_test = N - N_train

# split train/test data
x_train, x_test = digits[:N_train, :], digits[N_train:, :]
y_train, y_test = dig_label[:N_train], dig_label[N_train:]

# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy_test = %f" % (acc_train, acc_test))

# +
# do PCA with 'n_components=40'
pca = decomposition.PCA(n_components=40)
pca.fit(x_train)
def test_predict_3_classes():
    """Run ``check_predictions`` against ``Y2`` for both ``X`` and ``X_sp``."""
    for data in (X, X_sp):
        check_predictions(LogisticRegression(C=10), data, Y2)
for k in range(5):
    # Hold out a window of 200 samples starting at `i`; train on the rest.
    # NOTE(review): the window is indexed by `i`, not by the loop variable
    # `k` — presumably `i` is advanced elsewhere; confirm.
    train_texts = np.concatenate((texts[:i], texts[i + 200:]), axis=0)
    train_labels = np.concatenate((labels[:i], labels[i + 200:]), axis=0)
    test_texts = texts[i:i + 200]
    test_labels = labels[i:i + 200]

    # Naive Bayes
    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', MultinomialNB())])
    text_clf = text_clf.fit(train_texts, train_labels)
    predicted = text_clf.predict(test_texts)
    fold_accuracy = np.mean(predicted == test_labels)
    t1 += fold_accuracy
    print("MultinomialNB准确率为:", fold_accuracy)

    # LogisticRegression
    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', LogisticRegression())])
    text_clf = text_clf.fit(train_texts, train_labels)
    predicted = text_clf.predict(test_texts)
    fold_accuracy = np.mean(predicted == test_labels)
    t2 += fold_accuracy
    print("LogisticRegression准确率为:", fold_accuracy)

    # SVM
    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', NuSVC())])
    text_clf = text_clf.fit(train_texts, train_labels)
    predicted = text_clf.predict(test_texts)
    fold_accuracy = np.mean(predicted == test_labels)
    t3 += fold_accuracy
    print("SVC准确率为:", fold_accuracy)

    # Linear SVM pipeline (built here; it is not fitted within this span)
    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', LinearSVC())])
def test_n_iter():
    """Test that ``n_iter_`` has the correct shape for every solver.

    For each solver the one-vs-rest case is checked on ``LogisticRegression``
    and ``LogisticRegressionCV`` (multiclass and binary targets). The
    multinomial case is then checked for the solvers not excluded below.
    """
    X, y = iris.data, iris.target
    # Binary version of the target: class 2 is folded into class 0.
    y_bin = y.copy()
    y_bin[y_bin == 2] = 0

    n_Cs = 4
    n_cv_fold = 2

    for solver in ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']:
        # OvR case
        n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0]
        clf = LogisticRegression(tol=1e-2, multi_class='ovr',
                                 solver=solver, C=1.,
                                 random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes,))

        n_classes = np.unique(y).shape[0]
        clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr',
                                   solver=solver, Cs=n_Cs, cv=n_cv_fold,
                                   random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs))
        clf.fit(X, y_bin)
        assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs))

        # multinomial case
        n_classes = 1
        if solver in ('liblinear', 'sag', 'saga'):
            # Skip only this solver's multinomial checks. A `break` here
            # ended the whole loop at 'liblinear', so the solvers listed
            # after it were never tested at all.
            continue

        clf = LogisticRegression(tol=1e-2, multi_class='multinomial',
                                 solver=solver, C=1.,
                                 random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes,))

        clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial',
                                   solver=solver, Cs=n_Cs, cv=n_cv_fold,
                                   random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs))
        clf.fit(X, y_bin)
        assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs))
def getPipeline():
    """Return a TF-IDF vectorizer followed by a logistic regression."""
    vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)
    classifier = LogisticRegression()
    steps = [('vect', vectorizer), ('clf', classifier)]
    return Pipeline(steps)