def cvWithThreshold(X, y_current_tr, y_current_te, threshold, regularization='l2'):
    out_dict = {}
    scores = defaultdict(list)
    fold=1
    maxent = LogisticRegression(penalty=regularization)
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(y_current_tr, n_folds=10, shuffle=False, random_state=None):
        print('\r'+str(fold), end="")
        fold+=1
        TrainX_i = X[TrainIndices]
        Trainy_i = y_current_tr[TrainIndices]
        TestX_i = X[TestIndices]
        Testy_i =  y_current_te[TestIndices]
     
        maxent.fit(TrainX_i,Trainy_i)
        ypred_i, score=pred_for_threshold(maxent,TestX_i,Testy_i, threshold)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    
    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        out_dict[key] = (currentmetric.mean(),currentmetric.std())
        print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
    print("--")
    return out_dict
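# pred_for_threshold() is called above (and again in predictWithThreshold below)
# but is not defined in this snippet. A minimal sketch, assuming it thresholds the
# positive-class probability and returns the predictions together with an
# (F1, Recall, Accuracy, Precision) tuple, which is the order the callers unpack:
def pred_for_threshold(model, X_test, y_test, threshold):
    from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
    proba = model.predict_proba(X_test)[:, 1]
    y_pred = (proba >= threshold).astype(int)
    score = (f1_score(y_test, y_pred), recall_score(y_test, y_pred),
             accuracy_score(y_test, y_pred), precision_score(y_test, y_pred))
    return y_pred, score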
def main():
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshold")
    parser.add_argument('--threshold',type=float,default=0.5)
    parser.add_argument('--annotator',type=str,default="03")
    parser.add_argument('--penalty',type=str,choices=["l1","l2"],default="l1")


    args = parser.parse_args()
    current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+args.annotator+".lbl.conll"
    testfile = scriptdir+"/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X_dict_train, y_train, v_train = feats_and_classify.collect_features(current_single_ann,vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(testfile,vectorize=False)
    featdicts = X_dict_train + X_dict_test
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train=X[:len(y_train)]
    X_test=X[len(y_train):]

    maxent = LogisticRegression(penalty=args.penalty)
    maxent.fit(X_train,y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    ypred_i=["1" if pair[1]>=args.threshold else "0" for pair in y_pred_proba]
    fout = open(args.annotator+".pred",mode="w")
    print("\n".join(ypred_i),file=fout)
    fout.close()
    sys.exit(0)
    def test_regularization_path(self):
        # Check results using logistic path
        num_samples = 10
        num_feat = 5

        X, y = make_classification(n_samples=num_samples, n_features=num_feat, n_informative=3,
                                       n_classes=2, random_state=0, weights=[0.5, 0.5])
        matrix = np.zeros((num_samples, num_feat + 2))
        matrix[:,:-2] = X
        matrix[:, -2] = np.ones(num_samples)
        matrix[:, -1] = y

        # Betas to test
        logitfitL1 = LogisticRegressionL1()
        lambda_grid = np.exp(-1 * np.linspace(1, 17, 200))
        path = logitfitL1.fit(matrix, lambda_grid)

        # Sklearn
        cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

        # Computing regularization path using sklearn
        clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
        coefs_ = []
        for c in cs:
            clf.set_params(C=c)
            clf.fit(X, y)
            coefs_.append(clf.coef_.ravel().copy())

        skbetas = np.append(clf.intercept_[0], clf.coef_)
        np.testing.assert_almost_equal(skbetas, logitfitL1.coef_, 1)
Example #4
def mlogistic():
    X = []

    # The first three sentences are the training samples
    X.append("f**k you")
    X.append("f**k you all")
    X.append("hello everyone")

    # The last two sentences are the test samples
    X.append("f**k me")
    X.append("hello boy")

    # y holds the labels of the training samples
    y = [1, 1, 0]

    vectorizer = TfidfVectorizer()

    # Fit a TF-IDF model on the first three sentences of X and transform them
    X_train = vectorizer.fit_transform(X[:-2])
    print(X_train)
    # Transform the last two sentences of X with the TF-IDF model fitted above
    X_test = vectorizer.transform(X[-2:])
    print(X_test)

    # Train a logistic regression model
    classifier = LogisticRegression()
    classifier.fit(X_train, y)

    # Predict on the test samples
    predictions = classifier.predict(X_test)
    print(predictions)
class LogReg:
    def __init__(self):
        self.load_data()
        self.clf = LogisticRegression(class_weight = 'balanced')
        self.train()
        self.predict()

    def load_data(self):
        train_csv = './data/train.csv'
        test_csv = './data/test.csv'
        df_train = pd.read_csv(train_csv, header=0)
        df_test = pd.read_csv(test_csv, header=0)
        arr_train = df_train.values
        arr_test = df_test.values
        self.train_X = arr_train[0::,1::]
        self.train_Y = arr_train[0::, 0]
        self.test_X = arr_test[0::, 1::]
        self.test_ID = arr_test[0::,0]

    def train(self):
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        return (self.clf.score(self.train_X, self.train_Y))

    def store_result(self):
        df_out = pd.DataFrame()
        df_out['Id'] = self.test_ID
        df_out['Action'] = self.test_Y[0::,1]
        df_out.to_csv('./data/results/c1_result.csv',index=False)
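# Hypothetical usage of the LogReg class above (not part of the original
# snippet), assuming ./data/train.csv and ./data/test.csv exist with the
# target in the first column:
if __name__ == '__main__':
    model = LogReg()
    print(model.get_training_accuracy())
    model.store_result()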
Example #6
def predictWithThreshold(datadir, threshold, penalty_type='l2'):
	maxent = LogisticRegression(penalty=penalty_type)
	scores = defaultdict(list)
	for dir in sorted(os.listdir(datadir), reverse=True):
		trainfeatures, trainlabels, vec = feats_and_classify.collect_features(datadir+dir+'/train.conll')
		TrainIndices=np.array(range(len(trainfeatures)))
		features, labels,  vec = feats_and_classify.collect_features(datadir+dir+'/all.conll')
		TestIndices=np.array(range(len(trainfeatures),len(features)))
#		print('\r'+dir, end="")
#		print(dir)
		TrainX_i = features[TrainIndices]
		Trainy_i = labels[TrainIndices]

		TestX_i = features[TestIndices]
		Testy_i =  labels[TestIndices]

		maxent.fit(TrainX_i,Trainy_i)
#		print('Finished fitting')
		ypred_i, score=pred_for_threshold(maxent,TestX_i,Testy_i, threshold)
#		print('Predicting')

		scores["F1"].append(score[0])
		scores["Recall"].append(score[1])
		scores["Accuracy"].append(score[2])
		scores["Precision"].append(score[3])

	
	#scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
	print("\n--")

	for key in sorted(scores.keys()):
		currentmetric = np.array(scores[key])
		print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
	print("--")
Example #7
def main():
    classes = [
        'chimp',
        'corvette',
        'tokyo',
        'goldengatebridge'
        ]
    
    images, labels = get_labels(classes)
    std_features = get_standard_features(images)
    
    k = 256
    surf_features = get_visual_words(images, k)
    tas_features = get_tas_features(images)
    
    feature_dict = {
        'Std': std_features,
        'SURF': surf_features,
        'TAS': tas_features
        #'Zernike': zernike_features
        }
        
    best_features = log_classify(feature_dict, labels)
    classifier = LogisticRegression() 
    classifier.fit(best_features, labels)
Example #8
def test_logreg_cv_penalty():
    # Test that the correct penalty is passed to the final fit.
    X, y = make_classification(n_samples=50, n_features=20, random_state=0)
    lr_cv = LogisticRegressionCV(penalty="l1", Cs=[1.0], solver='liblinear')
    lr_cv.fit(X, y)
    lr = LogisticRegression(penalty="l1", C=1.0, solver='liblinear')
    lr.fit(X, y)
    assert_equal(np.count_nonzero(lr_cv.coef_), np.count_nonzero(lr.coef_))
Example #9
def classify_logistic(train_features, train_labels, test_features):
    global SAVE
    clf = LogisticRegression()
    clf.fit(train_features, train_labels)

    if not TEST and SAVE:
        save_pickle("logistic", clf)

    return clf.predict(test_features)
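# save_pickle(), TEST and SAVE are referenced above but not defined in this
# snippet; a minimal sketch under that assumption:
import pickle

TEST = False
SAVE = True

def save_pickle(name, clf):
    # Persist the fitted classifier under a simple name-based filename.
    with open(name + ".pkl", "wb") as f:
        pickle.dump(clf, f)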
def main():
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    default_pool = scriptdir+"/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshold")
    parser.add_argument('--iterations',type=int,default=5)

    args = parser.parse_args()


    all_feats = []
    all_labels = defaultdict(list)
    scores = defaultdict(list)




    for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
#    for idx in "01".split(" "):
        current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+idx+".lbl.conll"
        f_current, labels_current, v_current = feats_and_classify.collect_features(current_single_ann,vectorize=False,generateFeatures=False)
        for instance_index,l in enumerate(labels_current):
            all_labels[instance_index].append(l)
    current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_01.lbl.conll"
    feats, labels_current, v_current = feats_and_classify.collect_features(current_single_ann,vectorize=True,generateFeatures=True)

    for it in range(args.iterations):
        for TrainIndices, TestIndices in cross_validation.KFold(n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
            maxent = LogisticRegression(penalty='l2')

            TrainX_i = feats[TrainIndices]
            Trainy_i = [all_labels[x][random.randrange(0,20)] for x in TrainIndices]

            TestX_i = feats[TestIndices]
            Testy_i =  [all_labels[x][random.randrange(0,20)] for x in TestIndices]

            maxent.fit(TrainX_i,Trainy_i)
            ypred_i = maxent.predict(TestX_i)

            acc = accuracy_score(ypred_i, Testy_i)
            pre = precision_score(ypred_i, Testy_i)
            rec = recall_score(ypred_i, Testy_i)
            # shared task uses f1 of *accuracy* and recall!
            f1 = 2 * acc * rec / (acc + rec)

            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)
        #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
        print("--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
    print("--")

    sys.exit(0)
Example #11
def my_module(rt, params, inputs, outputs):
    # TODO : Fill your code here
    X = pickle.load(open(inputs.X, 'rb'))
    Y = pickle.load(open(inputs.Y, 'rb'))

    model = LogisticRegression()
    model.fit(X, Y)
    pickle.dump(model, open(outputs.MODEL, 'wb'))

    print("Done")
Example #12
def test_logreg_l1_sparse_data():
    # Because liblinear penalizes the intercept and saga does not, we do not
    # fit the intercept to make it possible to compare the coefficients of
    # the two models at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    X_noise = rng.normal(scale=0.1, size=(n_samples, 3))
    X_constant = np.zeros(shape=(n_samples, 2))
    X = np.concatenate((X, X_noise, X_constant), axis=1)
    X[X < 1] = 0
    X = sparse.csr_matrix(X)

    lr_liblinear = LogisticRegression(penalty="l1", C=1.0, solver='liblinear',
                                      fit_intercept=False,
                                      tol=1e-10)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                 fit_intercept=False,
                                 max_iter=1000, tol=1e-10)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)
    # Noise and constant features should be regularized to zero by the l1
    # penalty
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))

    # Check that solving on the sparse and dense data yield the same results
    lr_saga_dense = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                       fit_intercept=False,
                                       max_iter=1000, tol=1e-10)
    lr_saga_dense.fit(X.toarray(), y)
    assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_)
Example #13
def test_dtype_match():
    # Disabled to unblock the 0.19.2 release. See:
    # https://github.com/scikit-learn/scikit-learn/issues/11438
    # Test that np.float32 input data is not cast to np.float64 when possible
    raise SkipTest()
    X_32 = np.array(X).astype(np.float32)
    y_32 = np.array(Y1).astype(np.float32)
    X_64 = np.array(X).astype(np.float64)
    y_64 = np.array(Y1).astype(np.float64)
    X_sparse_32 = sp.csr_matrix(X, dtype=np.float32)

    for solver in ['newton-cg']:
        for multi_class in ['ovr', 'multinomial']:

            # Check type consistency
            lr_32 = LogisticRegression(solver=solver, multi_class=multi_class)
            lr_32.fit(X_32, y_32)
            assert_equal(lr_32.coef_.dtype, X_32.dtype)

            # check consistency with sparsity
            lr_32_sparse = LogisticRegression(solver=solver,
                                              multi_class=multi_class)
            lr_32_sparse.fit(X_sparse_32, y_32)
            assert_equal(lr_32_sparse.coef_.dtype, X_sparse_32.dtype)

            # Check accuracy consistency
            lr_64 = LogisticRegression(solver=solver, multi_class=multi_class)
            lr_64.fit(X_64, y_64)
            assert_equal(lr_64.coef_.dtype, X_64.dtype)
            assert_almost_equal(lr_32.coef_, lr_64.coef_.astype(np.float32))
Example #14
def test_dtype_match():
    # Test that np.float32 input data is not cast to np.float64 when possible

    X_32 = np.array(X).astype(np.float32)
    y_32 = np.array(Y1).astype(np.float32)
    X_64 = np.array(X).astype(np.float64)
    y_64 = np.array(Y1).astype(np.float64)
    X_sparse_32 = sp.csr_matrix(X, dtype=np.float32)

    for solver in ['newton-cg']:
        for multi_class in ['ovr', 'multinomial']:

            # Check type consistency
            lr_32 = LogisticRegression(solver=solver, multi_class=multi_class)
            lr_32.fit(X_32, y_32)
            assert_equal(lr_32.coef_.dtype, X_32.dtype)

            # check consistency with sparsity
            lr_32_sparse = LogisticRegression(solver=solver,
                                              multi_class=multi_class)
            lr_32_sparse.fit(X_sparse_32, y_32)
            assert_equal(lr_32_sparse.coef_.dtype, X_sparse_32.dtype)

            # Check accuracy consistency
            lr_64 = LogisticRegression(solver=solver, multi_class=multi_class)
            lr_64.fit(X_64, y_64)
            assert_equal(lr_64.coef_.dtype, X_64.dtype)
            assert_almost_equal(lr_32.coef_, lr_64.coef_.astype(np.float32))
Example #15
def test_nnet(n_samples=200, n_features=7, distance=0.8, complete=False):
    """
    :param complete: if True, all possible combinations will be checked, and quality is printed
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MLPClassifier,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42, epochs=100)
                    nn.fit(X, y )
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types, trainers, most of them are used once during tests.
        attempts = max(len(nnet.losses), len(nnet.trainers), len(nn_types))
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        for attempt in range(attempts):
            # each combination is tried 3 times. before raising exception
            retry_attempts = 3
            for retry_attempt in range(retry_attempts):
                loss = list(nnet.losses.keys())[(attempt + losses_shift) % len(nnet.losses)]
                trainer = list(nnet.trainers.keys())[(attempt + trainers_shift) % len(nnet.trainers)]

                nn_type = nn_types[attempt % len(nn_types)]

                nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42 + retry_attempt, epochs=200)
                print(nn)
                nn.fit(X, y)
                quality = roc_auc_score(y, nn.predict_proba(X)[:, 1])
                computed_loss = nn.compute_loss(X, y)
                if quality > 0.8:
                    break
                else:
                    print('attempt {} : {}'.format(retry_attempt, quality))
                    if retry_attempt == retry_attempts - 1:
                        raise RuntimeError('quality of model is too low: {} {}'.format(quality, nn))
Example #16
def prepare(imageFolder):
    print("preparing images...")
    inputData = []
    outputData = []
    for file in os.listdir(imageFolder):
        if (len(file) == 8):
            inputData += getInputFromPic(Image.open(imageFolder + file).convert("L"), file)
            outputData += getOutputFromFileName(file)
    print("training model...")
    model = LogisticRegression()
    model.fit(inputData, outputData) 
    return model
Example #17
def test_liblinear_decision_function_zero():
    # Test negative prediction when decision_function values are zero.
    # Liblinear predicts the positive class when decision_function values
    # are zero. This is a test to verify that we do not do the same.
    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, y)

    # Dummy data such that the decision function becomes zero.
    X = np.zeros((5, 5))
    assert_array_equal(clf.predict(X), np.zeros(5))
Example #18
    def fit_model_2(self, lol = .07, toWrite = False):
        model = LogisticRegression(C = lol, penalty = 'l1', tol = 1e-6)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            X_train,Y_train = self.balance_data(X_train,Y_train)
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 2 Score: %f" % (logloss(Y_test,pred),))

        if toWrite:
            f2 = open('model2/model.pkl', 'wb')
            pickle.dump(model,f2)
            f2.close()
def test_logistic_regression_solvers():
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)
    clf_n = LogisticRegression(solver='newton-cg', fit_intercept=False)
    clf_n.fit(X, y)
    clf_lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegression(fit_intercept=False)
    clf_lib.fit(X, y)
    assert_array_almost_equal(clf_n.coef_, clf_lib.coef_, decimal=3)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=3)
    assert_array_almost_equal(clf_n.coef_, clf_lbf.coef_, decimal=3)
Example #20
def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0)
    clf_n = LogisticRegression(solver="newton-cg", fit_intercept=False)
    clf_n.fit(X, y)
    clf_lbf = LogisticRegression(solver="lbfgs", fit_intercept=False)
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegression(fit_intercept=False)
    clf_lib.fit(X, y)
    assert_array_almost_equal(clf_n.coef_, clf_lib.coef_, decimal=4)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
    assert_array_almost_equal(clf_n.coef_, clf_lbf.coef_, decimal=4)
Example #21
def test_max_iter():
    # Test that the maximum number of iteration is reached
    X, y_bin = iris.data, iris.target.copy()
    y_bin[y_bin == 2] = 0

    solvers = ["newton-cg", "liblinear", "sag"]
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append("lbfgs")

    for max_iter in range(1, 5):
        for solver in solvers:
            lr = LogisticRegression(max_iter=max_iter, tol=1e-15, random_state=0, solver=solver)
            lr.fit(X, y_bin)
            assert_equal(lr.n_iter_[0], max_iter)
Example #22
def test_inconsistent_input():
    # Test that an exception is raised on inconsistent input
    rng = np.random.RandomState(0)
    X_ = rng.random_sample((5, 10))
    y_ = np.ones(X_.shape[0])
    y_[0] = 0

    clf = LogisticRegression(random_state=0)

    # Wrong dimensions for training data
    y_wrong = y_[:-1]
    assert_raises(ValueError, clf.fit, X, y_wrong)

    # Wrong dimensions for test data
    assert_raises(ValueError, clf.fit(X_, y_).predict, rng.random_sample((3, 12)))
Example #23
def clazzify(train_mat, test_mat, true_train_labels):
    """
    """
    # learn
    logging.info('learning...')
    model = LogisticRegression(random_state=17, penalty='l1')
    model.fit(train_mat, true_train_labels)
    logging.info('finished learning.')

    # test
    logging.info('testing')
    predicted_test_labels = model.predict(test_mat)
    logging.info('finished testing')

    return predicted_test_labels, model
Example #24
def test_multinomial_binary_probabilities():
    # Test multinomial LR gives expected probabilities based on the
    # decision function, for a binary problem.
    X, y = make_classification()
    clf = LogisticRegression(multi_class='multinomial', solver='saga')
    clf.fit(X, y)

    decision = clf.decision_function(X)
    proba = clf.predict_proba(X)

    expected_proba_class_1 = (np.exp(decision) /
                              (np.exp(decision) + np.exp(-decision)))
    expected_proba = np.c_[1-expected_proba_class_1, expected_proba_class_1]

    assert_almost_equal(proba, expected_proba)
def generate_submission():
    global alg, predictions, submission
    # The columns we'll use to predict the target
    # Initialize the algorithm class
    alg = LogisticRegression(random_state=1)
    # Train the algorithm using all the training data
    alg.fit(train[predictors], train["Survived"])
    # Make predictions using the test set.
    predictions = alg.predict(test[predictors])
    # Create a new dataframe with only the columns Kaggle wants from the dataset.
    submission = pandas.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv("kaggle.csv", index=False)
    print("kaggele.csv is generated")
    def test_scikit_learn_exploded_data(self):
        # Check results with scikit learn

        betas = [0.001, 0.07, 0.4]
        matrix = create_random_observations(200, 2, betas)
        new_matrix = explode_matrix(matrix)
        X = new_matrix[:,:-2]
        y = new_matrix[:, -1]

        lib = LogisticRegression(fit_intercept=True)
        lib.fit(X, y)

        path = self.logitfitL1.fit(new_matrix, self.lambda_grid)

        skbetas = np.append(lib.intercept_[0], lib.coef_)
        np.testing.assert_almost_equal(skbetas, self.logitfitL1.coef_, 2)
Example #27
def test_predict_iris():
    """Test logistic regression with the iris dataset"""
    n_samples, n_features = iris.data.shape

    target = iris.target_names[iris.target]
    clf = LogisticRegression(C=len(iris.data)).fit(iris.data, target)
    assert_array_equal(np.unique(target), clf.classes_)

    pred = clf.predict(iris.data)
    assert_greater(np.mean(pred == target), .95)

    probabilities = clf.predict_proba(iris.data)
    assert_array_almost_equal(probabilities.sum(axis=1), np.ones(n_samples))

    pred = iris.target_names[probabilities.argmax(axis=1)]
    assert_greater(np.mean(pred == target), .95)
Example #28
def test_max_iter():
    # Test that the maximum number of iteration is reached
    X, y_bin = iris.data, iris.target.copy()
    y_bin[y_bin == 2] = 0

    solvers = ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']

    for max_iter in range(1, 5):
        for solver in solvers:
            for multi_class in ['ovr', 'multinomial']:
                if solver == 'liblinear' and multi_class == 'multinomial':
                    continue
                lr = LogisticRegression(max_iter=max_iter, tol=1e-15,
                                        multi_class=multi_class,
                                        random_state=0, solver=solver)
                lr.fit(X, y_bin)
                assert_equal(lr.n_iter_[0], max_iter)
Example #29
def test_nnet(n_samples=200, n_features=5, distance=0.5, complete=False):
    X, y = make_blobs(
        n_samples=n_samples,
        n_features=5,
        centers=[numpy.ones(n_features) * distance, -numpy.ones(n_features) * distance],
    )

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MultiLayerNetwork,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42)
                    nn.fit(X, y, epochs=100)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types, trainers, most of them are used once during tests.
        attempts = max(len(nnet.losses), len(nnet.trainers), len(nn_types))
        attempts = 4
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        for attempt in range(attempts):
            loss = list(nnet.losses.keys())[(attempt + losses_shift) % len(nnet.losses)]
            trainer = list(nnet.trainers.keys())[(attempt + trainers_shift) % len(nnet.trainers)]

            nn_type = nn_types[attempt % len(nn_types)]

            nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42)
            print(nn)
            nn.fit(X, y, epochs=200)
            assert roc_auc_score(y, nn.predict_proba(X)[:, 1]) > 0.8, "quality of model is too low: {}".format(nn)
Example #30
class mentoryWEB:

    def __init__(self, file):
        self.vect = TfidfVectorizer(max_df=0.25, stop_words=None, max_features=2500, ngram_range=(1,2), use_idf=True, norm='l2')
        df = pd.read_csv(file, delimiter='\t', header=None)
        X_train_raw, y_train = df[1], df[0]

        X_train = self.vect.fit_transform(X_train_raw)

        self.clf = LogisticRegression(penalty='l2', C=10)
        self.clf.fit(X_train, y_train)


    def test(self, string):
        X_test = self.vect.transform([string])
        prediction = self.clf.predict(X_test)

        return prediction[0]
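# Hypothetical usage of the mentoryWEB class above (not part of the original
# snippet), assuming a tab-separated file with the label in column 0 and the
# text in column 1:
if __name__ == '__main__':
    clf = mentoryWEB('sms_training.tsv')
    print(clf.test('free entry to win a prize'))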
Example #31
def test_logistic_regression_sample_weights():
    X, y = make_classification(n_samples=20,
                               n_features=5,
                               n_informative=3,
                               n_classes=2,
                               random_state=0)
    sample_weight = y + 1

    for LR in [LogisticRegression, LogisticRegressionCV]:

        # Test that passing sample_weight as ones is the same as
        # not passing them at all (default None)
        for solver in ['lbfgs', 'liblinear']:
            clf_sw_none = LR(solver=solver,
                             fit_intercept=False,
                             random_state=42)
            clf_sw_none.fit(X, y)
            clf_sw_ones = LR(solver=solver,
                             fit_intercept=False,
                             random_state=42)
            clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
            assert_array_almost_equal(clf_sw_none.coef_,
                                      clf_sw_ones.coef_,
                                      decimal=4)

        # Test that sample weights work the same with the lbfgs,
        # newton-cg, and 'sag' solvers
        clf_sw_lbfgs = LR(solver='lbfgs', fit_intercept=False, random_state=42)
        clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight)
        clf_sw_n = LR(solver='newton-cg', fit_intercept=False, random_state=42)
        clf_sw_n.fit(X, y, sample_weight=sample_weight)
        clf_sw_sag = LR(solver='sag',
                        fit_intercept=False,
                        tol=1e-10,
                        random_state=42)
        # ignore convergence warning due to small dataset
        with ignore_warnings():
            clf_sw_sag.fit(X, y, sample_weight=sample_weight)
        clf_sw_liblinear = LR(solver='liblinear',
                              fit_intercept=False,
                              random_state=42)
        clf_sw_liblinear.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(clf_sw_lbfgs.coef_,
                                  clf_sw_n.coef_,
                                  decimal=4)
        assert_array_almost_equal(clf_sw_lbfgs.coef_,
                                  clf_sw_sag.coef_,
                                  decimal=4)
        assert_array_almost_equal(clf_sw_lbfgs.coef_,
                                  clf_sw_liblinear.coef_,
                                  decimal=4)

        # Test that passing class_weight as [1,2] is the same as
        # passing class weight = [1,1] but adjusting sample weights
        # to be 2 for all instances of class 2
        for solver in ['lbfgs', 'liblinear']:
            clf_cw_12 = LR(solver=solver,
                           fit_intercept=False,
                           class_weight={
                               0: 1,
                               1: 2
                           },
                           random_state=42)
            clf_cw_12.fit(X, y)
            clf_sw_12 = LR(solver=solver, fit_intercept=False, random_state=42)
            clf_sw_12.fit(X, y, sample_weight=sample_weight)
            assert_array_almost_equal(clf_cw_12.coef_,
                                      clf_sw_12.coef_,
                                      decimal=4)

    # Test the above for l1 penalty and l2 penalty with dual=True.
    # since the patched liblinear code is different.
    clf_cw = LogisticRegression(solver="liblinear",
                                fit_intercept=False,
                                class_weight={
                                    0: 1,
                                    1: 2
                                },
                                penalty="l1",
                                tol=1e-5,
                                random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(solver="liblinear",
                                fit_intercept=False,
                                penalty="l1",
                                tol=1e-5,
                                random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)

    clf_cw = LogisticRegression(solver="liblinear",
                                fit_intercept=False,
                                class_weight={
                                    0: 1,
                                    1: 2
                                },
                                penalty="l2",
                                dual=True,
                                random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(solver="liblinear",
                                fit_intercept=False,
                                penalty="l2",
                                dual=True,
                                random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)
Example #32
def test_logistic_regression_class_weights():
    # Multinomial case: remove 90% of class 0
    X = iris.data[45:, :]
    y = iris.target[45:]
    solvers = ("lbfgs", "newton-cg")
    class_weight_dict = _compute_class_weight_dictionary(y)

    for solver in solvers:
        clf1 = LogisticRegression(solver=solver,
                                  multi_class="multinomial",
                                  class_weight="balanced")
        clf2 = LogisticRegression(solver=solver,
                                  multi_class="multinomial",
                                  class_weight=class_weight_dict)
        clf1.fit(X, y)
        clf2.fit(X, y)
        assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=4)

    # Binary case: remove 90% of class 0 and 100% of class 2
    X = iris.data[45:100, :]
    y = iris.target[45:100]
    solvers = ("lbfgs", "newton-cg", "liblinear")
    class_weight_dict = _compute_class_weight_dictionary(y)

    for solver in solvers:
        clf1 = LogisticRegression(solver=solver,
                                  multi_class="ovr",
                                  class_weight="balanced")
        clf2 = LogisticRegression(solver=solver,
                                  multi_class="ovr",
                                  class_weight=class_weight_dict)
        clf1.fit(X, y)
        clf2.fit(X, y)
        assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=6)
Example #33
def test_logistic_regression_multinomial():
    # Tests for the multinomial option in logistic regression

    # Some basic attributes of Logistic Regression
    n_samples, n_features, n_classes = 50, 20, 3
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=10,
                               n_classes=n_classes,
                               random_state=0)

    # 'lbfgs' is used as a referenced
    solver = 'lbfgs'
    ref_i = LogisticRegression(solver=solver, multi_class='multinomial')
    ref_w = LogisticRegression(solver=solver,
                               multi_class='multinomial',
                               fit_intercept=False)
    ref_i.fit(X, y)
    ref_w.fit(X, y)
    assert_array_equal(ref_i.coef_.shape, (n_classes, n_features))
    assert_array_equal(ref_w.coef_.shape, (n_classes, n_features))
    for solver in ['sag', 'newton-cg']:
        clf_i = LogisticRegression(solver=solver,
                                   multi_class='multinomial',
                                   random_state=42,
                                   max_iter=1000,
                                   tol=1e-6)
        clf_w = LogisticRegression(solver=solver,
                                   multi_class='multinomial',
                                   random_state=42,
                                   max_iter=1000,
                                   tol=1e-6,
                                   fit_intercept=False)
        clf_i.fit(X, y)
        clf_w.fit(X, y)
        assert_array_equal(clf_i.coef_.shape, (n_classes, n_features))
        assert_array_equal(clf_w.coef_.shape, (n_classes, n_features))

        # Compare solutions between lbfgs and the other solvers
        assert_almost_equal(ref_i.coef_, clf_i.coef_, decimal=3)
        assert_almost_equal(ref_w.coef_, clf_w.coef_, decimal=3)
        assert_almost_equal(ref_i.intercept_, clf_i.intercept_, decimal=3)

    # Test that the path give almost the same results. However since in this
    # case we take the average of the coefs after fitting across all the
    # folds, it need not be exactly the same.
    for solver in ['lbfgs', 'newton-cg', 'sag']:
        clf_path = LogisticRegressionCV(solver=solver,
                                        max_iter=2000,
                                        tol=1e-6,
                                        multi_class='multinomial',
                                        Cs=[1.])
        clf_path.fit(X, y)
        assert_array_almost_equal(clf_path.coef_, ref_i.coef_, decimal=3)
        assert_almost_equal(clf_path.intercept_, ref_i.intercept_, decimal=3)
label=[
    "en clinton test",
    "en trump test",
    "fr macron test",
    "fr lepen test",
    "it referendum test",
    "ca indipendencia test",
    "es indipendencia test", 
]


clfs = {
        "NB" : GaussianNB(),
        "SVM": SVC(kernel="linear"),
        "LR" : LogisticRegression()
        }



for i in range(0,len(training)):

    for key, clf in clfs.items():
        print(key,label[i])

        tweets_training=training[i]
        tweets_test=test[i]
        stance_training=numpy.array(feature_manager.get_stance(tweets_training))
        stance_test=numpy.array(feature_manager.get_stance(tweets_test))

Example #35
def test_multinomial_validation():
    for solver in ['lbfgs', 'newton-cg', 'sag']:
        lr = LogisticRegression(C=-1, solver=solver, multi_class='multinomial')
        assert_raises(ValueError, lr.fit, [[0, 1], [1, 0]], [0, 1])
Example #36
def test_liblinear_dual_random_state():
    # random_state is relevant for liblinear solver only if dual=True
    X, y = make_classification(n_samples=20, random_state=0)
    lr1 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
    lr1.fit(X, y)
    lr2 = LogisticRegression(random_state=0, dual=True, max_iter=1, tol=1e-15)
    lr2.fit(X, y)
    lr3 = LogisticRegression(random_state=8, dual=True, max_iter=1, tol=1e-15)
    lr3.fit(X, y)

    # same result for same random state
    assert_array_almost_equal(lr1.coef_, lr2.coef_)
    # different results for different random states
    msg = "Arrays are not almost equal to 6 decimals"
    assert_raise_message(AssertionError, msg, assert_array_almost_equal,
                         lr1.coef_, lr3.coef_)
Example #37
def gbdt_lr_train(libsvmFileName):

    # Load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)

    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)

    # Train the GBDT model
    gbdt.fit(X_train, y_train)

    # Predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)  # predict and evaluate AUC below
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on the raw features: %.5f' % lr_test_auc)

    # Encode the raw features with the GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode all the leaf-index features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # Predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on the GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # Combine the raw and encoded features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on the combined features: %.5f' % gbdt_lr_auc2)
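# Hypothetical invocation of gbdt_lr_train() above, assuming a LibSVM-format
# sample file is available:
# gbdt_lr_train('sample_libsvm_data.txt')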
Example #38
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA

h = .02  # step size in the mesh

from pyearth.earth import Earth
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

np.random.seed(1)

# Combine Earth with LogisticRegression in a pipeline to do classification
earth_classifier = Pipeline([('earth', Earth(max_degree=3, penalty=1.5)),
                             ('logistic', LogisticRegression())])

names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
    "Random Forest", "Naive Bayes", "LDA", "QDA", "Earth"
]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    GaussianNB(),
    LDA(),
    QDA(), earth_classifier
]
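# The original example goes on to score each classifier; a minimal sketch under
# the assumption of a small synthetic dataset (the `names` and `classifiers`
# lists are the ones defined above):
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=1)
for name, clf in zip(names, classifiers):
    clf.fit(X_tr, y_tr)
    print(name, accuracy_score(y_te, clf.predict(X_te)))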
Example #39
    im = mh.imread(fname, as_grey=True)
    haralicks.append(texture(im))
    sobels.append(edginess_sobel(im))

    # Files are named like building00.jpg, scene23.jpg...
    labels.append(fname[:-len('xx.jpg')])

print('Finished computing features.')

haralicks = np.array(haralicks)
sobels = np.array(sobels)
labels = np.array(labels)

# We use logistic regression because it is very fast.
# Feel free to experiment with other classifiers
scores = cross_validation.cross_val_score(LogisticRegression(),
                                          haralicks,
                                          labels,
                                          cv=5)
print('Accuracy (5 fold x-val) with Logistic Regression [std features]: {}%'.
      format(0.1 * round(1000 * scores.mean())))

haralick_plus_sobel = np.hstack([np.atleast_2d(sobels).T, haralicks])
scores = cross_validation.cross_val_score(LogisticRegression(),
                                          haralick_plus_sobel,
                                          labels,
                                          cv=5)
print(
    'Accuracy (5 fold x-val) with Logistic Regression [std features + sobel]: {}%'
    .format(0.1 * round(1000 * scores.mean())))
Example #40
y = []
y_test = []
accuracy_meta_train = []
for temp in range(2708):
    y.append(LABEL[labels[temp]])
y = np.array(y)

class_label = [0, 1, 2, 3, 4, 5, 6]
combination = list(combinations(class_label, 2))
for i in range(len(combination)):
    print('Cross_Validation: ', i + 1)
    test_label = list(combination[i])
    train_label = [n for n in class_label if n not in test_label]
    print('Cross_Validation {} Train_Label_List {}: '.format(
        i + 1, train_label))
    print('Cross_Validation {} Test_Label_List {}: '.format(i + 1, test_label))
    classifier = LogisticRegression()

    for j in range(50):
        labels_local = labels.copy()
        select_class = random.sample(train_label, 2)
        print('Cross_Validation {} ITERATION {} Train_Label: {}'.format(
            i + 1, j + 1, select_class))
        class1_idx = []
        class2_idx = []
        for k in range(2708):
            if (labels_local[k] == LABEL2[select_class[0]]):
                class1_idx.append(k)
                labels_local[k] = LABEL2[select_class[0]]
            elif (labels_local[k] == LABEL2[select_class[1]]):
                class2_idx.append(k)
                labels_local[k] = LABEL2[select_class[1]]
Example #41
from sklearn.datasets.samples_generator import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model.logistic import LogisticRegression
from sklearn import metrics

if __name__ == '__main__':

    # X holds the sample features, y the class labels
    X, y = make_classification(n_samples=80000)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.5)
    grd = GradientBoostingClassifier(n_estimators=10)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    '''
    Build features with the trained GBDT model, then one-hot encode them and feed
    the encoded features to the LR model as new training input.
    '''
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
    # Predict on X_test with the trained LR model
    y_pred_grd_lm = grd_lm.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
    # Compute the ROC curve from the predictions
    fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(y_test, y_pred_grd_lm)

    print(grd.apply(X_train)[:, :, :].shape)
Example #42
def classify(train_data, train_label):
    train_label = train_label.ravel()  # flatten the labels to one dimension
    # LSTM_AC, LSTM_f1 = Lstm_models()
    # GRU_AC, GRU_f1 = GRU_models()
    NLSTM_AC, NLSTM_f1 = Nestlstm_models()

    lgbmModel = LGBMClassifier(max_depth=5,
                               num_leaves=25,
                               learning_rate=0.007,
                               n_estimators=1000,
                               min_child_samples=80,
                               subsample=0.8,
                               colsample_bytree=1,
                               reg_alpha=0,
                               reg_lambda=0,
                               random_state=np.random.randint(10e6))
    lgbmModel.fit(train_data, train_label)
    lgbm_pre = lgbmModel.predict(test_data)

    lgbm_AC = accuracy_score(test_label, lgbm_pre)
    lgbm_f1 = f1_score(test_label, lgbm_pre, average='macro')

    AdaBoostModel = AdaBoostClassifier(base_estimator=None,
                                       n_estimators=50,
                                       learning_rate=1,
                                       algorithm='SAMME.R',
                                       random_state=None)
    AdaBoostModel.fit(train_data, train_label)
    AdaBoost_pre = AdaBoostModel.predict(test_data)
    AdaBoost_AC = accuracy_score(test_label, AdaBoost_pre)
    AdaBoost_f1 = f1_score(test_label, AdaBoost_pre, average='macro')

    rfc1 = RandomForestClassifier(n_estimators=40,
                                  max_depth=None,
                                  min_samples_split=2,
                                  random_state=2)  # random forest classifier
    rfc1.fit(train_data, train_label)
    RF_pre = rfc1.predict(test_data)
    RF_AC = accuracy_score(test_label, RF_pre)
    RF_f1 = f1_score(test_label, RF_pre, average='macro')

    clf = SVC(kernel='rbf', C=9, gamma=0.1)
    clf.set_params(kernel='rbf',
                   probability=True).fit(train_data,
                                         train_label)  # set_params: set the SVC parameters
    clf.predict(train_data)
    test_pre = clf.predict(test_data)
    SVM_AC = accuracy_score(test_label, test_pre)
    SVM_f1 = f1_score(test_label, test_pre, average='macro')

    # decision tree
    dtc = DecisionTreeClassifier()
    dtc.fit(train_data, train_label)
    dt_pre = dtc.predict(test_data)
    DT_AC = accuracy_score(test_label, dt_pre)
    DT_f1 = f1_score(test_label, dt_pre, average='macro')

    MLP = MLPClassifier(solver='lbfgs',
                        alpha=1e-4,
                        hidden_layer_sizes=(100, 3),
                        random_state=1)
    MLP.fit(train_data, train_label)
    MLP_predict = MLP.predict(test_data)
    MLP_AC = accuracy_score(test_label, MLP_predict)
    MLP_f1 = f1_score(test_label, MLP_predict, average='macro')

    # KNN
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train_data, train_label)
    knn_predict = knn.predict(test_data)
    KNN_AC = accuracy_score(test_label, knn_predict)
    KNN_f1 = f1_score(test_label, knn_predict, average='macro')

    # LogisticRegression
    classifier = LogisticRegression()
    classifier.fit(train_data, train_label)
    lg_predict = classifier.predict(test_data)
    LG_AC = accuracy_score(test_label, lg_predict)
    LG_f1 = f1_score(test_label, lg_predict, average='macro')
    #
    # print("===== Diagnosis original=======")
    # print('Original Accuracy:')
    # print(RF_AC, SVM_AC, DT_AC, NB_AC, MLP_AC, KNN_AC, LG_AC)
    # print('F1-score')
    # print(RF_f1, SVM_f1, DT_f1, NB_f1, MLP_f1, KNN_f1, LG_f1)
    # Main.py runs original.py, Ensemble.py and vae_od.py in order; the results are appended to the files below
    #     file_name1 = "./temp_result/Diagnosis_"+str(select_number)+"Level"+str(level_num)+"_Accuracy_result.txt"
    #     file_name2 = "./temp_result/Diagnosis_"+str(select_number)+"Level"+str(level_num)+"_f1_score_result.txt"
    #     with open(file_name1, "a") as f:
    #         f.writelines([str(RF_AC), ' ', str(SVM_AC), ' ', str(DT_AC), ' ', str(NB_AC), ' ', str(MLP_AC), ' ', str(KNN_AC), ' ', str(LG_AC), '\n'])
    #     with open(file_name2, "a") as f:
    #         f.writelines([str(RF_f1), ' ', str(SVM_f1), ' ', str(DT_f1), ' ', str(NB_f1), ' ', str(MLP_f1), ' ', str(KNN_f1), ' ', str(LG_f1), '\n'])
    #     return NLSTM_AC,LSTM_AC ,GRU_AC,lgbm_AC,AdaBoost_AC,RF_AC, SVM_AC, DT_AC, MLP_AC, KNN_AC, LG_AC,NLSTM_f1,LSTM_f1,GRU_f1,lgbm_f1,AdaBoost_f1,RF_f1,SVM_f1,DT_f1,MLP_f1,KNN_f1,LG_f1
    return NLSTM_AC, lgbm_AC, AdaBoost_AC, RF_AC, SVM_AC, DT_AC, MLP_AC, KNN_AC, LG_AC, NLSTM_f1, lgbm_f1, AdaBoost_f1, RF_f1, SVM_f1, DT_f1, MLP_f1, KNN_f1, LG_f1
    def initialize_with_logistic_regression(self, zs, xs, initialize=False):
        from sklearn.linear_model.logistic import LogisticRegression
        if not hasattr(self, '_lr'):
            self._lr = LogisticRegression(verbose=False,
                                          multi_class="multinomial",
                                          solver="lbfgs",
                                          warm_start=True,
                                          max_iter=10)
        lr = self._lr

        # Make the covariates
        K, D = self.num_states, self.covariate_dim

        # Split zs into prevs and nexts
        zps = zs[:-1] if isinstance(zs, np.ndarray) else np.concatenate(
            [z[:-1] for z in zs], axis=0)
        zns = zs[1:] if isinstance(zs, np.ndarray) else np.concatenate(
            [z[1:] for z in zs], axis=0)
        xps = xs[:-1] if isinstance(xs, np.ndarray) else np.concatenate(
            [x[:-1] for x in xs], axis=0)

        assert zps.shape[0] == xps.shape[0]
        assert zps.ndim == 1 and zps.dtype == np.int32 and zps.min(
        ) >= 0 and zps.max() < K
        assert zns.ndim == 1 and zns.dtype == np.int32 and zns.min(
        ) >= 0 and zns.max() < K
        assert xps.ndim == 2 and xps.shape[1] == D

        used = np.bincount(zns, minlength=K) > 0
        K_used = np.sum(used)

        lr_X = np.column_stack((one_hot(zps, K), xps))
        lr_y = zns

        # The logistic regression solver fails if we only have one class represented
        # In this case, set the regression weights to zero and set logpi to have
        # high probability of the visited class
        if K_used == 1:
            self.W = np.zeros((D, K))
            self.logpi = np.zeros((K, K))
            self.logpi[:, used] = 3.0
        else:
            lr.fit(lr_X, lr_y)

            # Now convert the logistic regression into weights
            if K_used > 2:
                self.W = np.zeros((D, K))
                self.W[:, used] = lr.coef_[:, K:].T
                self.logpi = np.zeros((K, K))
                self.logpi[:, used] = lr.coef_[:, :K].T
                self.logpi[:, used] += lr.intercept_[None, :]
                self.logpi[:, ~used] += -100.

            elif K_used == 2:
                # LogisticRegression object only represents one
                # set of weights for binary problems
                self.W = np.zeros((D, K))
                self.W[:, 1] = lr.coef_[0, K:]
                self.logpi = np.zeros((K, K))
                self.logpi[:, 1] = lr.coef_[0, :K].T
                self.logpi[:, 1] += lr.intercept_
Example #44
def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20,
                               n_features=20,
                               n_informative=10,
                               n_classes=3,
                               random_state=0)
    tol = 1e-6
    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False, tol=tol)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    lib = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag',
                             fit_intercept=False,
                             tol=tol,
                             max_iter=1000,
                             random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=4)
Example #45
def test_logistic_regression_solvers():
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    lib = LogisticRegression(fit_intercept=False)
    sag = LogisticRegression(solver='sag',
                             fit_intercept=False,
                             random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=3)
Example #46
# TAKE THE TRAINSET AND THE TARGET FROM DATASET
trainset=get_trainset(dataset)
targets=get_target_dataset(dataset)


# DELETING THE STATISTICS OF THE USERS CONTENTS
trainset_without_stats=drop_stats(trainset)

# STANDARDIZE DATASET
trainset_without_stats=StandardScale_dataset(trainset_without_stats)


# TRAINING WITHOUT STATISTICS OF THE USERS CONTENTS
x_train, x_test, y_train, y_test = train_test_split(trainset_without_stats, targets, test_size = 0.2, random_state = 12345)
lr = LogisticRegression(solver='lbfgs')
lr.fit(x_train, y_train)
predictions = lr.predict(x_test)


print("\nPERFORMANCE WITHOUT THE STATISTICS OF THE USERS CONTENTS: ")
print("\nCONFUSION MATRIX:")
print(confusion_matrix(y_test, predictions))

print("\nCLASSIFICATION REPORT:")
print(classification_report(y_test, predictions))

# CLASSIFICATION OF ONLY PUBLIC PROFILES
dataset_publics=drop_NaN_entries(dataset)
trainset_publics=get_trainset(dataset_publics)
targets_publics=get_target_dataset(dataset_publics)
Example #47
    for i in range(50):  #set range to contain number of csv files
        new = pd.read_csv(path + file + str(i + 1) + ".csv")
        raw = pd.concat([raw, new])
    raw = raw[['content', 'troll']]
    #these can be played with, currently set to ignore words in more than half or less than 100
    vectorizer = TfidfVectorizer(min_df=100, max_df=0.5)
    c = vectorizer.fit_transform(raw['content'])
    dictionary = vectorizer.get_feature_names()
    return c, raw['troll'], dictionary


X, y, dictionary = load_data("tweet_data_batch")
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

model1 = LogisticRegression(
    max_iter=400, n_jobs=-1
)  # increased arbitrarily to 400 since the default iteration limit was being reached
'''
scores_clf_svc_cv1 = cross_val_score(model1,X,y,cv=5)
print("LogReg Accuracy: %0.2f (+/- %0.2f)" % (scores_clf_svc_cv1.mean(), scores_clf_svc_cv1.std() * 2))  # print accuracy
'''
model1.fit(X_train, y_train)
print("LogReg Accuracy:\n", model1.score(X_test, y_test))
predic1 = model1.predict(X_test)
print("LogReg matrix:", metrics.confusion_matrix(y_test, predic1))

model2 = Perceptron()
'''
scores_clf_svc_cv2 = cross_val_score(model2,X,y,cv=5)
print("Perceptron Accuracy: %0.2f (+/- %0.2f)" % (scores_clf_svc_cv2.mean(), scores_clf_svc_cv2.std() * 2))  # print accuracy
'''
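# model2 (the Perceptron) is instantiated above but never trained in this
# fragment. A minimal sketch of fitting and scoring it on the same split,
# mirroring the model1 evaluation:
model2.fit(X_train, y_train)
print("Perceptron Accuracy:\n", model2.score(X_test, y_test))
predic2 = model2.predict(X_test)
print("Perceptron matrix:", metrics.confusion_matrix(y_test, predic2))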

def tokenize_porter(text):
    return [porter.stem(word) for word in text.split()]


stop = stopwords.words("english")

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 1))

X = vectorizer.fit_transform(movie["review"])
print(vectorizer.get_feature_names())  # the method must be called; without () this would just print the bound method
train_x, test_x, train_y, test_y = train_test_split(X,
                                                    movie["sentiment"],
                                                    test_size=0.2,
                                                    random_state=42)

print(train_x.shape, train_y.shape)
clf = LogisticRegression()
clf.fit(train_x, train_y)

#predict result
print(clf.predict(test_x))

#cross-validation score (note: computed on the held-out test split only)
scores = cross_val_score(clf, test_x, test_y, cv=5)

acc = scores.mean()
print("Accuracy: %0.2f percent" % (acc * 100))
Example #49
def test_nan():
    # Test proper NaN handling.
    # Regression test for Issue #252: fit used to go into an infinite loop.
    Xnan = np.array(X, dtype=np.float64)
    Xnan[0, 1] = np.nan
    LogisticRegression(random_state=0).fit(Xnan, Y1)
Example #50
# Predict on the test set
from sklearn.model_selection import train_test_split
x_data=data[:,:-1]
y_data=data[:,-1]

prediction_list=[]
for i in range(5):
    # note: the original fragment split into x_val/y_val but then used
    # x_test/y_test below, so the names are unified here
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)
    x_train_new, x_test_new = PCA_Reduction(x_train, x_test)
    print(x_test_new.shape)

    clf = SVC(kernel='linear', verbose=1)
    clf.fit(x_train_new, y_train)
    y_prediction_test = clf.predict(x_test_new)
    prediction_list.append(y_prediction_test)

    classifier = LogisticRegression()
    classifier.fit(x_train_new, y_train)
    y_predict = classifier.predict(x_test_new)
    prediction_list.append(y_predict)

    # a random forest classifier (the original variable name 'dtree' is kept)
    dtree = RandomForestClassifier(criterion='gini', max_depth=120, min_impurity_decrease=0)
    dtree.fit(x_train_new, y_train)
    pred = dtree.predict(x_test_new)
    prediction_list.append(pred)
   
   
print(prediction_list)
prediction_result=np.array(prediction_list).T 
print(prediction_result.shape)
test_result=[]
for x in prediction_result:
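# PCA_Reduction is not defined in this fragment. A minimal sketch, assuming it
# fits a PCA on the training split and applies the same projection to the test
# split (the n_components value of 0.95, i.e. 95% explained variance, is an
# assumption):
from sklearn.decomposition import PCA

def PCA_Reduction(x_train, x_test, n_components=0.95):
    pca = PCA(n_components=n_components)
    x_train_new = pca.fit_transform(x_train)
    x_test_new = pca.transform(x_test)
    return x_train_new, x_test_new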
Example #51
def test_logreg_intercept_scaling_zero():
    # Test that intercept_scaling is ignored when fit_intercept is False

    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, Y1)
    assert_equal(clf.intercept_, 0.)
def test_logreg_predict_proba_multinomial():
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    clf_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    clf_multi.fit(X, y)
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    clf_ovr.fit(X, y)
    clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X))
    assert_greater(clf_ovr_loss, clf_multi_loss)

    # Predicted probabilities using the soft-max function should give a
    # smaller loss than those using the logistic function.
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X))
    assert_greater(clf_wrong_loss, clf_multi_loss)
Example #53
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))

    prediction = clf.predict(predict_me)
    if prediction == y[i]:
        correct += 1

print (float(correct)/float(len(X)))
'''

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import decomposition
from sklearn.pipeline import Pipeline

logistic = LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

X = np.array(df.drop('survived', axis=1))
X = preprocessing.scale(X)
print(X.shape)
y = np.array(df['survived'])
print(y.shape)
X_pca = pca.fit_transform(X)  # PCA ignores y; the result is the projected data, not a classifier
plt.figure(1, figsize=(5, 5))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
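# GridSearchCV is imported above but never used, and the pca/logistic pipeline
# is built without being fitted. A minimal sketch of a grid search over the
# pipeline, using the step names defined above ('pca', 'logistic'); the grid
# values themselves are illustrative, not from the original script:
param_grid = {
    'pca__n_components': [2, 3, 4],
    'logistic__C': [0.1, 1.0, 10.0],
}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X, y)
print(search.best_params_)
print(search.best_score_)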
def crossval(features, labels, vec):
    maxent = LogisticRegression(penalty='l1')
    #maxent = SGDClassifier(penalty='l1')
    #maxent = Perceptron(penalty='l1')
    maxent.fit(
        features, labels
    )  # only needed for feature inspection; cross-validation calls fit() too
    coeffcounter = Counter(vec.feature_names_)
    negfeats = set(vec.feature_names_)
    posfeats = set(vec.feature_names_)

    scores = defaultdict(list)
    TotalCoeffCounter = Counter()

    for TrainIndices, TestIndices in cross_validation.KFold(
            n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        coeffs_i = list(maxent.coef_[0])
        coeffcounter_i = Counter(vec.feature_names_)
        for value, name in zip(coeffs_i, vec.feature_names_):
            coeffcounter_i[name] = value

        # note: sklearn metric functions expect (y_true, y_pred)
        acc = accuracy_score(Testy_i, ypred_i)
        pre = precision_score(Testy_i, ypred_i)
        rec = recall_score(Testy_i, ypred_i)
        # shared task uses f1 of *accuracy* and recall!
        f1 = 2 * acc * rec / (acc + rec)

        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        scores["Precision"].append(pre)
        scores["Recall"].append(rec)

        # use the per-fold coefficient counter computed above
        posfeats = posfeats.intersection(
            set([key for (key, value) in coeffcounter_i.most_common()[:20]]))
        negfeats = negfeats.intersection(
            set([key for (key, value) in coeffcounter_i.most_common()[-20:]]))

    print("Pervasive positive: ", posfeats)
    print("Pervasive negative: ", negfeats)

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")

    maxent.fit(features, labels)  # fit on everything

    coeffs_total = list(maxent.coef_[0])
    for value, name in zip(coeffs_total, vec.feature_names_):
        TotalCoeffCounter[name] = value

    for (key, value) in TotalCoeffCounter.most_common()[:20]:
        print(key, value)
    print("---")
    for (key, value) in TotalCoeffCounter.most_common()[-20:]:
        print(key, value)
    print("lowest coeff:", coeffcounter.most_common()[-1])
    print("highest coeff", coeffcounter.most_common()[0])
Example #55
clf = SVC()
clf.fit(X_train, y_train)
print("Classification result using SVC:")
print(clf.score(X_test, y_test))  # support vector classification
# NuSVC
clf = NuSVC()
clf.fit(X_train, y_train)
print("Classification result using NuSVC:")
print(clf.score(X_test, y_test))  # Nu (kernel) support vector classification

clf = GaussianNB()
clf.fit(X_train, y_train)
print("Classification result using Naive Bayes:")
print(clf.score(X_test, y_test))  # Gaussian Naive Bayes

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print("Classification result using logistic regression:")
print(classifier.score(X_test, y_test))

classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
print("Classification result using a decision tree:")
print(classifier.score(X_test, y_test))

classifier = GradientBoostingClassifier(n_estimators=200)
classifier.fit(X_train, y_train)
print("Classification result using GBDT:")
print(classifier.score(X_test, y_test))
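# X_train/X_test/y_train/y_test are not produced in this fragment. A minimal
# sketch of a split that would make the comparison above runnable; the iris
# dataset is only an illustrative stand-in for the original data:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data_X, data_y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y,
                                                    test_size=0.3,
                                                    random_state=0)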

# +

# calculate train/test data number
N = len(digits)
N_train = int(N*0.8)
N_test = N - N_train

# split train/test data
x_train = digits[:N_train, :]
y_train = dig_label[:N_train]
x_test  = digits[N_train:, :]
y_test  = dig_label[N_train:]

# do logistic regression
lr=LogisticRegression()
lr.fit(x_train,y_train)

pred_train = lr.predict(x_train)
pred_test  = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy_test = %f" % (acc_train, acc_test))


# +
# do PCA with 'n_components=40'
pca = decomposition.PCA(n_components=40)
pca.fit(x_train)
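# The PCA fitted above is not applied in this fragment. A minimal sketch of
# projecting both splits onto the 40 components and retraining the logistic
# regression on the reduced representation:
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

lr_pca = LogisticRegression()
lr_pca.fit(x_train_pca, y_train)
acc_train_pca = accuracy_score(y_train, lr_pca.predict(x_train_pca))
acc_test_pca = accuracy_score(y_test, lr_pca.predict(x_test_pca))
print("accuracy train (PCA) = %f, accuracy test (PCA) = %f" % (acc_train_pca, acc_test_pca))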
def test_predict_3_classes():
    check_predictions(LogisticRegression(C=10), X, Y2)
    check_predictions(LogisticRegression(C=10), X_sp, Y2)
Example #58
for k in range(5):
    # note: the fold offset i and the accumulators t1/t2/t3 are defined outside
    # this fragment; each fold holds out a 200-item slice starting at i
    train_texts = np.concatenate((texts[:i], texts[i + 200:]), axis=0)
    train_labels = np.concatenate((labels[:i], labels[i + 200:]), axis=0)
    test_texts = texts[i:i + 200]
    test_labels = labels[i:i + 200]
    # Naive Bayes
    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', MultinomialNB())])
    text_clf = text_clf.fit(train_texts, train_labels)
    predicted = text_clf.predict(test_texts)
    t1 += np.mean(predicted == test_labels)
    print("MultinomialNB accuracy:", np.mean(predicted == test_labels))

    # LogisticRegression
    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', LogisticRegression())])
    text_clf = text_clf.fit(train_texts, train_labels)
    predicted = text_clf.predict(test_texts)
    t2 += np.mean(predicted == test_labels)
    print("LogisticRegression准确率为:", np.mean(predicted == test_labels))

    # SVM
    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', NuSVC())])
    text_clf = text_clf.fit(train_texts, train_labels)
    predicted = text_clf.predict(test_texts)
    t3 += np.mean(predicted == test_labels)
    print("SVC准确率为:", np.mean(predicted == test_labels))

    text_clf = Pipeline([('tfidf', TfidfVectorizer(max_features=2000)),
                         ('clf', LinearSVC())])
def test_n_iter():
    # Test that self.n_iter_ has the correct format.
    X, y = iris.data, iris.target
    y_bin = y.copy()
    y_bin[y_bin == 2] = 0

    n_Cs = 4
    n_cv_fold = 2

    for solver in ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']:
        # OvR case
        n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0]
        clf = LogisticRegression(tol=1e-2, multi_class='ovr',
                                 solver=solver, C=1.,
                                 random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes,))

        n_classes = np.unique(y).shape[0]
        clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr',
                                   solver=solver, Cs=n_Cs, cv=n_cv_fold,
                                   random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs))
        clf.fit(X, y_bin)
        assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs))

        # multinomial case
        n_classes = 1
        if solver in ('liblinear', 'sag', 'saga'):
            break

        clf = LogisticRegression(tol=1e-2, multi_class='multinomial',
                                 solver=solver, C=1.,
                                 random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes,))

        clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial',
                                   solver=solver, Cs=n_Cs, cv=n_cv_fold,
                                   random_state=42, max_iter=100)
        clf.fit(X, y)
        assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs))
        clf.fit(X, y_bin)
        assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs))
Example #60
def getPipeline():

    return Pipeline([('vect',
                      TfidfVectorizer(stop_words='english',
                                      sublinear_tf=True)),
                     ('clf', LogisticRegression())])
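# A minimal usage sketch for getPipeline(); the documents and labels below are
# illustrative placeholders, not data from the original project:
docs = ["the plot was gripping", "a dull, lifeless film", "great acting throughout"]
labels = [1, 0, 1]

pipeline = getPipeline()
pipeline.fit(docs, labels)
print(pipeline.predict(["surprisingly good"]))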