class LogReg:
    def __init__(self):
        self.load_data()
        self.clf = LogisticRegression(class_weight = 'balanced')
        self.train()
        self.predict()

    def load_data(self):
        train_csv = './data/train.csv'
        test_csv = './data/test.csv'
        df_train = pd.read_csv(train_csv, header=0)
        df_test = pd.read_csv(test_csv, header=0)
        arr_train = df_train.values
        arr_test = df_test.values
        self.train_X = arr_train[0::,1::]
        self.train_Y = arr_train[0::, 0]
        self.test_X = arr_test[0::, 1::]
        self.test_ID = arr_test[0::,0]

    def train(self):
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        return (self.clf.score(self.train_X, self.train_Y))

    def store_result(self):
        df_out = pd.DataFrame()
        df_out['Id'] = self.test_ID
        df_out['Action'] = self.test_Y[0::,1]
        df_out.to_csv('./data/results/c1_result.csv',index=False)
Beispiel #2
0
def main():
    classes = [
        'chimp',
        'corvette',
        'tokyo',
        'goldengatebridge'
        ]
    
    images, labels = get_labels(classes)
    std_features = get_standard_features(images)
    
    k = 256
    surf_features = get_visual_words(images, k)
    tas_features = get_tas_features(images)
    
    feature_dict = {
        'Std': std_features,
        'SURF': surf_features,
        'TAS': tas_features
        #'Zernike': zernike_features
        }
        
    best_features = log_classify(feature_dict, labels)
    classifier = LogisticRegression() 
    classifier.fit(best_features, labels)
def test_warm_start():
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.
    # Warm starting does not work with liblinear solver.
    X, y = iris.data, iris.target

    solvers = ['newton-cg', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    for warm_start in [True, False]:
        for fit_intercept in [True, False]:
            for solver in solvers:
                for multi_class in ['ovr', 'multinomial']:
                    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                                             warm_start=warm_start,
                                             solver=solver,
                                             random_state=42, max_iter=100,
                                             fit_intercept=fit_intercept)
                    clf.fit(X, y)
                    coef_1 = clf.coef_

                    clf.max_iter = 1
                    with ignore_warnings():
                        clf.fit(X, y)
                    cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
                    msg = ("Warm starting issue with %s solver in %s mode "
                           "with fit_intercept=%s and warm_start=%s"
                           % (solver, multi_class, str(fit_intercept),
                              str(warm_start)))
                    if warm_start:
                        assert_greater(2.0, cum_diff, msg)
                    else:
                        assert_greater(cum_diff, 2.0, msg)
Beispiel #4
0
def test_write_parameters():
    # Test that we can write to coef_ and intercept_
    clf = LogisticRegression(random_state=0)
    clf.fit(X, Y1)
    clf.coef_[:] = 0
    clf.intercept_[:] = 0
    assert_array_almost_equal(clf.decision_function(X), 0)
Beispiel #5
0
def mlogistic():
	X = []

	# 前三行作为输入样本
	X.append("f**k you")
	X.append("f**k you all")
	X.append("hello everyone")

	# 后两句作为测试样本
	X.append("f**k me")
	X.append("hello boy")

	# y为样本标注
	y = [1,1,0]

	vectorizer = TfidfVectorizer()

	# 取X的前三句作为输入做tfidf转换
	X_train = vectorizer.fit_transform(X[:-2])
	print X_train
	# 取X的后两句用“上句生成”的tfidf做转换
	X_test = vectorizer.transform(X[-2:])
	print X_test

	# 用逻辑回归模型做训练
	classifier = LogisticRegression()
	classifier.fit(X_train, y)

	# 做测试样例的预测
	predictions = classifier.predict(X_test)
	print predictions
def test_consistency_path():
    """Test that the path algorithm is consistent"""
    rng = np.random.RandomState(0)
    X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2)))
    y = [1] * 100 + [-1] * 100
    Cs = np.logspace(0, 4, 10)

    f = ignore_warnings
    # can't test with fit_intercept=True since LIBLINEAR
    # penalizes the intercept
    for method in ('lbfgs', 'newton-cg', 'liblinear'):
        coefs, Cs = f(logistic_regression_path)(
            X, y, Cs=Cs, fit_intercept=False, tol=1e-16, solver=method)
        for i, C in enumerate(Cs):
            lr = LogisticRegression(C=C, fit_intercept=False, tol=1e-16)
            lr.fit(X, y)
            lr_coef = lr.coef_.ravel()
            assert_array_almost_equal(lr_coef, coefs[i], decimal=4)

    # test for fit_intercept=True
    for method in ('lbfgs', 'newton-cg', 'liblinear'):
        Cs = [1e3]
        coefs, Cs = f(logistic_regression_path)(
            X, y, Cs=Cs, fit_intercept=True, tol=1e-4, solver=method)
        lr = LogisticRegression(C=Cs[0], fit_intercept=True, tol=1e-4,
                                intercept_scaling=10000)
        lr.fit(X, y)
        lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_])
        assert_array_almost_equal(lr_coef, coefs[0], decimal=4)
    def test_regularization_path(self):
        # Check results using logistic path
        num_samples = 10
        num_feat = 5

        X, y = make_classification(n_samples=num_samples, n_features=num_feat, n_informative=3,
                                       n_classes=2, random_state=0, weights=[0.5, 0.5])
        matrix = np.zeros((num_samples, num_feat + 2))
        matrix[:,:-2] = X
        matrix[:, -2] = np.ones(num_samples)
        matrix[:, -1] = y

        # Betas to test
        logitfitL1 = LogisticRegressionL1()
        lambda_grid = np.exp(-1 * np.linspace(1, 17, 200))
        path = logitfitL1.fit(matrix, lambda_grid)

        # Sklearn
        cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

        # Computing regularization path using sklearn
        clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
        coefs_ = []
        for c in cs:
            clf.set_params(C=c)
            clf.fit(X, y)
            coefs_.append(clf.coef_.ravel().copy())

        skbetas = np.append(clf.intercept_[0], clf.coef_)
        np.testing.assert_almost_equal(skbetas, logitfitL1.coef_, 1)
Beispiel #8
0
def test_consistency_path():
    # Test that the path algorithm is consistent
    rng = np.random.RandomState(0)
    X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2)))
    y = [1] * 100 + [-1] * 100
    Cs = np.logspace(0, 4, 10)

    f = ignore_warnings
    # can't test with fit_intercept=True since LIBLINEAR
    # penalizes the intercept
    for solver in ("lbfgs", "newton-cg", "liblinear", "sag"):
        coefs, Cs, _ = f(logistic_regression_path)(
            X, y, Cs=Cs, fit_intercept=False, tol=1e-5, solver=solver, random_state=0
        )
        for i, C in enumerate(Cs):
            lr = LogisticRegression(C=C, fit_intercept=False, tol=1e-5, random_state=0)
            lr.fit(X, y)
            lr_coef = lr.coef_.ravel()
            assert_array_almost_equal(lr_coef, coefs[i], decimal=4, err_msg="with solver = %s" % solver)

    # test for fit_intercept=True
    for solver in ("lbfgs", "newton-cg", "liblinear", "sag"):
        Cs = [1e3]
        coefs, Cs, _ = f(logistic_regression_path)(
            X, y, Cs=Cs, fit_intercept=True, tol=1e-6, solver=solver, intercept_scaling=10000.0, random_state=0
        )
        lr = LogisticRegression(C=Cs[0], fit_intercept=True, tol=1e-4, intercept_scaling=10000.0, random_state=0)
        lr.fit(X, y)
        lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_])
        assert_array_almost_equal(lr_coef, coefs[0], decimal=4, err_msg="with solver = %s" % solver)
Beispiel #9
0
def predictWithThreshold(datadir, threshold, penalty_type='l2'):
	maxent = LogisticRegression(penalty=penalty_type)
	scores = defaultdict(list)
	for dir in sorted(os.listdir(datadir), reverse=True):
		trainfeatures, trainlabels, vec = feats_and_classify.collect_features(datadir+dir+'/train.conll')
		TrainIndices=np.array(range(len(trainfeatures)))
		features, labels,  vec = feats_and_classify.collect_features(datadir+dir+'/all.conll')
		TestIndices=np.array(range(len(trainfeatures),len(features)))
#		print('\r'+dir, end="")
#		print(dir)
		TrainX_i = features[TrainIndices]
		Trainy_i = labels[TrainIndices]

		TestX_i = features[TestIndices]
		Testy_i =  labels[TestIndices]

		maxent.fit(TrainX_i,Trainy_i)
#		print('Finished fitting')
		ypred_i, score=pred_for_threshold(maxent,TestX_i,Testy_i, threshold)
#		print('Predicting')

		scores["F1"].append(score[0])
		scores["Recall"].append(score[1])
		scores["Accuracy"].append(score[2])
		scores["Precision"].append(score[3])

	
	#scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
	print("\n--")

	for key in sorted(scores.keys()):
		currentmetric = np.array(scores[key])
		print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
	print("--")
def main():
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--threshold',type=float,default=0.5)
    parser.add_argument('--annotator',type=str,default="03")
    parser.add_argument('--penalty',type=str,choices=["l1","l2"],default="l1")


    args = parser.parse_args()
    current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+args.annotator+".lbl.conll"
    testfile = scriptdir+"/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X__dict_train, y_train, v_train = feats_and_classify.collect_features(current_single_ann,vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(testfile,vectorize=False)
    featdicts = list([x for x in X__dict_train + X_dict_test])
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train=X[:len(y_train)]
    X_test=X[len(y_train):]

    maxent = LogisticRegression(penalty=args.penalty)
    maxent.fit(X_train,y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    ypred_i=["1" if pair[1]>=args.threshold else "0" for pair in y_pred_proba]
    fout = open(args.annotator+".pred",mode="w")
    print("\n".join(ypred_i),file=fout)
    fout.close()
    sys.exit(0)
def cvWithThreshold(X, y_current_tr, y_current_te, threshold, regularization='l2'):
    out_dict = {}
    scores = defaultdict(list)
    fold=1
    maxent = LogisticRegression(penalty=regularization)
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(y_current_tr, n_folds=10, shuffle=False, random_state=None):
        print('\r'+str(fold), end="")
        fold+=1
        TrainX_i = X[TrainIndices]
        Trainy_i = y_current_tr[TrainIndices]
        TestX_i = X[TestIndices]
        Testy_i =  y_current_te[TestIndices]
     
        maxent.fit(TrainX_i,Trainy_i)
        ypred_i, score=pred_for_threshold(maxent,TestX_i,Testy_i, threshold)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    
    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        out_dict[key] = (currentmetric.mean(),currentmetric.std())
        print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
    print("--")
    return out_dict
def test_liblinear_random_state():
    X, y = make_classification(n_samples=20)
    lr1 = LogisticRegression(random_state=0)
    lr1.fit(X, y)
    lr2 = LogisticRegression(random_state=0)
    lr2.fit(X, y)
    assert_array_almost_equal(lr1.coef_, lr2.coef_)
Beispiel #13
0
def test_warm_start(solver, warm_start, fit_intercept, multi_class):
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.
    # Warm starting does not work with liblinear solver.
    X, y = iris.data, iris.target

    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                             warm_start=warm_start,
                             solver=solver,
                             random_state=42, max_iter=100,
                             fit_intercept=fit_intercept)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        coef_1 = clf.coef_

        clf.max_iter = 1
        clf.fit(X, y)
    cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
    msg = ("Warm starting issue with %s solver in %s mode "
           "with fit_intercept=%s and warm_start=%s"
           % (solver, multi_class, str(fit_intercept),
              str(warm_start)))
    if warm_start:
        assert_greater(2.0, cum_diff, msg)
    else:
        assert_greater(cum_diff, 2.0, msg)
Beispiel #14
0
def test_logistic_regression_solvers():
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    lib = LogisticRegression(fit_intercept=False)
    sag = LogisticRegression(solver='sag', fit_intercept=False,
                             random_state=42)
    saga = LogisticRegression(solver='saga', fit_intercept=False,
                              random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    saga.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, sag.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, lib.coef_, decimal=3)
Beispiel #15
0
def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-7
    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False, tol=tol)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    lib = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                             max_iter=1000, random_state=42)
    saga = LogisticRegression(solver='saga', fit_intercept=False, tol=tol,
                              max_iter=10000, random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    saga.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, sag.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, lib.coef_, decimal=4)
Beispiel #16
0
def test_logreg_l1():
    # Because liblinear penalizes the intercept and saga does not, we do not
    # fit the intercept to make it possible to compare the coefficients of
    # the two models at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    X_noise = rng.normal(size=(n_samples, 3))
    X_constant = np.ones(shape=(n_samples, 2))
    X = np.concatenate((X, X_noise, X_constant), axis=1)
    lr_liblinear = LogisticRegression(penalty="l1", C=1.0, solver='liblinear',
                                      fit_intercept=False,
                                      tol=1e-10)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                 fit_intercept=False,
                                 max_iter=1000, tol=1e-10)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)

    # Noise and constant features should be regularized to zero by the l1
    # penalty
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))
Beispiel #17
0
def test_logreg_cv_penalty():
    # Test that the correct penalty is passed to the final fit.
    X, y = make_classification(n_samples=50, n_features=20, random_state=0)
    lr_cv = LogisticRegressionCV(penalty="l1", Cs=[1.0], solver='liblinear')
    lr_cv.fit(X, y)
    lr = LogisticRegression(penalty="l1", C=1.0, solver='liblinear')
    lr.fit(X, y)
    assert_equal(np.count_nonzero(lr_cv.coef_), np.count_nonzero(lr.coef_))
Beispiel #18
0
def classify_logistic(train_features, train_labels, test_features):
    global SAVE
    clf = LogisticRegression()
    clf.fit(train_features, train_labels)

    if not TEST and SAVE:
        save_pickle("logistic", clf)

    return clf.predict(test_features)
def main():
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    default_pool = scriptdir+"/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--iterations',type=int,default=5)

    args = parser.parse_args()


    all_feats = []
    all_labels = defaultdict(list)
    scores = defaultdict(list)




    for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
#    for idx in "01".split(" "):
        current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+idx+".lbl.conll"
        f_current, labels_current, v_current = feats_and_classify.collect_features(current_single_ann,vectorize=False,generateFeatures=False)
        for instance_index,l in enumerate(labels_current):
            all_labels[instance_index].append(l)
    current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_01.lbl.conll"
    feats, labels_current, v_current = feats_and_classify.collect_features(current_single_ann,vectorize=True,generateFeatures=True)

    for it in range(args.iterations):
        for TrainIndices, TestIndices in cross_validation.KFold(n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
            maxent = LogisticRegression(penalty='l2')

            TrainX_i = feats[TrainIndices]
            Trainy_i = [all_labels[x][random.randrange(0,20)] for x in TrainIndices]

            TestX_i = feats[TestIndices]
            Testy_i =  [all_labels[x][random.randrange(0,20)] for x in TestIndices]

            maxent.fit(TrainX_i,Trainy_i)
            ypred_i = maxent.predict(TestX_i)

            acc = accuracy_score(ypred_i, Testy_i)
            pre = precision_score(ypred_i, Testy_i)
            rec = recall_score(ypred_i, Testy_i)
            # shared task uses f1 of *accuracy* and recall!
            f1 = 2 * acc * rec / (acc + rec)

            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)
        #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
        print("--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
    print("--")

    sys.exit(0)
Beispiel #20
0
def my_module(rt, params, inputs, outputs):
    # TODO : Fill your code here
    X = pickle.load(open(inputs.X, 'r'))
    Y = pickle.load(open(inputs.Y, 'r'))

    model = LogisticRegression()
    model.fit(X, Y)
    pickle.dump(model, open(outputs.MODEL, 'w'))
    
    print "Done"
Beispiel #21
0
def prepare(imageFolder):
    print("preparing images...")
    inputData = []
    outputData = []
    for file in os.listdir(imageFolder):
        if (len(file) == 8):
            inputData += getInputFromPic(Image.open(imageFolder + file).convert("L"), file)
            outputData += getOutputFromFileName(file)
    print("training model...")
    model = LogisticRegression()
    model.fit(inputData, outputData) 
    return model
Beispiel #22
0
def test_liblinear_decision_function_zero():
    # Test negative prediction when decision_function values are zero.
    # Liblinear predicts the positive class when decision_function values
    # are zero. This is a test to verify that we do not do the same.
    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, y)

    # Dummy data such that the decision function becomes zero.
    X = np.zeros((5, 5))
    assert_array_equal(clf.predict(X), np.zeros(5))
Beispiel #23
0
    def fit_model_2(self, lol = .07, toWrite = False):
        model = LogisticRegression(C = lol, penalty = 'l1', tol = 1e-6)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            X_train,Y_train = self.balance_data(X_train,Y_train)
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 2 Score: %f" % (logloss(Y_test,pred),))

        if toWrite:
            f2 = open('model2/model.pkl','w')
            pickle.dump(model,f2)
            f2.close()
Beispiel #24
0
def test_logistic_regression_class_weights():
    # Multinomial case: remove 90% of class 0
    X = iris.data[45:, :]
    y = iris.target[45:]
    solvers = ("lbfgs", "newton-cg")
    class_weight_dict = _compute_class_weight_dictionary(y)

    for solver in solvers:
        clf1 = LogisticRegression(solver=solver, multi_class="multinomial",
                                  class_weight="balanced")
        clf2 = LogisticRegression(solver=solver, multi_class="multinomial",
                                  class_weight=class_weight_dict)
        clf1.fit(X, y)
        clf2.fit(X, y)
        assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=4)

    # Binary case: remove 90% of class 0 and 100% of class 2
    X = iris.data[45:100, :]
    y = iris.target[45:100]
    solvers = ("lbfgs", "newton-cg", "liblinear")
    class_weight_dict = _compute_class_weight_dictionary(y)

    for solver in solvers:
        clf1 = LogisticRegression(solver=solver, multi_class="ovr",
                                  class_weight="balanced")
        clf2 = LogisticRegression(solver=solver, multi_class="ovr",
                                  class_weight=class_weight_dict)
        clf1.fit(X, y)
        clf2.fit(X, y)
        assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=6)
Beispiel #25
0
def test_max_iter():
    # Test that the maximum number of iteration is reached
    X, y_bin = iris.data, iris.target.copy()
    y_bin[y_bin == 2] = 0

    solvers = ["newton-cg", "liblinear", "sag"]
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append("lbfgs")

    for max_iter in range(1, 5):
        for solver in solvers:
            lr = LogisticRegression(max_iter=max_iter, tol=1e-15, random_state=0, solver=solver)
            lr.fit(X, y_bin)
            assert_equal(lr.n_iter_[0], max_iter)
Beispiel #26
0
def test_multinomial_binary_probabilities():
    # Test multinomial LR gives expected probabilities based on the
    # decision function, for a binary problem.
    X, y = make_classification()
    clf = LogisticRegression(multi_class='multinomial', solver='saga')
    clf.fit(X, y)

    decision = clf.decision_function(X)
    proba = clf.predict_proba(X)

    expected_proba_class_1 = (np.exp(decision) /
                              (np.exp(decision) + np.exp(-decision)))
    expected_proba = np.c_[1-expected_proba_class_1, expected_proba_class_1]

    assert_almost_equal(proba, expected_proba)
Beispiel #27
0
def clazzify(train_mat, test_mat, true_train_labels):
    """
    """
    # learn
    logging.info('learning...')
    model = LogisticRegression(random_state=17, penalty='l1')
    model.fit(train_mat, true_train_labels)
    logging.info('finished learning.')

    # test
    logging.info('testing')
    predicted_test_labels = model.predict(test_mat)
    logging.info('finished testing')

    return predicted_test_labels, model
    def test_scikit_learn_exploded_data(self):
        # Check results with scikit learn

        betas = [0.001, 0.07, 0.4]
        matrix = create_random_observations(200, 2, betas)
        new_matrix = explode_matrix(matrix)
        X = new_matrix[:,:-2]
        y = new_matrix[:, -1]

        lib = LogisticRegression(fit_intercept=True)
        lib.fit(X, y)

        path = self.logitfitL1.fit(new_matrix, self.lambda_grid)

        skbetas = np.append(lib.intercept_[0], lib.coef_)
        np.testing.assert_almost_equal(skbetas, self.logitfitL1.coef_, 2)
def generate_submission():
    global alg, predictions, submission
    # The columns we'll use to predict the target
    # Initialize the algorithm class
    alg = LogisticRegression(random_state=1)
    # Train the algorithm using all the training data
    alg.fit(train[predictors], train["Survived"])
    # Make predictions using the test set.
    predictions = alg.predict(test[predictors])
    # Create a new dataframe with only the columns Kaggle wants from the dataset.
    submission = pandas.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv("kaggle.csv", index=False)
    print("kaggele.csv is generated")
def test_multinomial_binary():
    """Test multinomial LR on a binary problem."""
    target = (iris.target > 0).astype(np.intp)
    target = np.array(["setosa", "not-setosa"])[target]

    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(iris.data, target)

    assert_equal(clf.coef_.shape, (1, iris.data.shape[1]))
    assert_equal(clf.intercept_.shape, (1,))
    assert_array_equal(clf.predict(iris.data), target)

    mlr = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                             fit_intercept=False)
    mlr.fit(iris.data, target)
    pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), axis=1)]
    assert_greater(np.mean(pred == target), .9)
Beispiel #31
0
def test_multinomial_logistic_regression_string_inputs():
    # Test with string labels for LogisticRegression(CV)
    n_samples, n_features, n_classes = 50, 5, 3
    X_ref, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_classes=n_classes,
                                   n_informative=3)
    y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y)
    # For numerical labels, let y values be taken from set (-1, 0, 1)
    y = np.array(y) - 1
    # Test for string labels
    lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')
    lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')

    lr.fit(X_ref, y)
    lr_cv.fit(X_ref, y)
    lr_str.fit(X_ref, y_str)
    lr_cv_str.fit(X_ref, y_str)

    assert_array_almost_equal(lr.coef_, lr_str.coef_)
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_)
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo'])

    # The predictions should be in original labels
    assert_equal(sorted(np.unique(lr_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])

    # Make sure class weights can be given with string labels
    lr_cv_str = LogisticRegression(solver='lbfgs',
                                   class_weight={
                                       'bar': 1,
                                       'baz': 2,
                                       'foo': 0
                                   },
                                   multi_class='multinomial').fit(
                                       X_ref, y_str)
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
def test_logreg_predict_proba_multinomial():
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilites using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    clf_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    clf_multi.fit(X, y)
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    clf_ovr.fit(X, y)
    clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X))
    assert_greater(clf_ovr_loss, clf_multi_loss)

    # Predicted probabilites using the soft-max function should give a
    # smaller loss than those using the logistic function.
    clf_multi_loss = log_loss(y, clf_multi.predict_proba(X))
    clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X))
    assert_greater(clf_wrong_loss, clf_multi_loss)
def test_multinomial_binary():
    # Test multinomial LR on a binary problem.
    target = (iris.target > 0).astype(np.intp)
    target = np.array(["setosa", "not-setosa"])[target]

    for solver in ['lbfgs', 'newton-cg']:
        clf = LogisticRegression(solver=solver, multi_class='multinomial')
        clf.fit(iris.data, target)

        assert_equal(clf.coef_.shape, (1, iris.data.shape[1]))
        assert_equal(clf.intercept_.shape, (1,))
        assert_array_equal(clf.predict(iris.data), target)

        mlr = LogisticRegression(solver=solver, multi_class='multinomial',
                                 fit_intercept=False)
        mlr.fit(iris.data, target)
        pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data),
                                      axis=1)]
        assert_greater(np.mean(pred == target), .9)
Beispiel #34
0
def train():
    start = timeit.default_timer()
    df = pd.read_csv('train.csv', delimiter=',', header=None)

    print("start training ")

    train, test = train_test_split(df, test_size=0.2)

    X_train_raw = train[1]

    y_toxic = train[2]
    #y_severe_toxic       =    train[3]
    #y_obscene            =    train[4]
    #y_threat             =    train[5]
    #y_insult             =    train[6]
    #y_identity_hate      =    train[7]

    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)

    #classifier_toxic = LogisticRegression(solver ='lbfgs')
    #classifier_toxic.fit(X_train, y_toxic)

    #    classifier_toxic = OneVsOneClassifier(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
    #intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
    #random_state=None, tol=0.0001, verbose=0))

    classifier_toxic = LogisticRegression(random_state=0, multi_class='ovr')
    classifier_toxic.fit(X_train, y_toxic)

    f = open("logistic_complain_vectorizer.pickle", 'wb')
    pickle.dump(vectorizer, f)
    f.close()

    f = open("logistic_complain_classfier.pickle", 'wb')
    pickle.dump(classifier_toxic, f)
    f.close()

    stop = timeit.default_timer()

    print("training time took was : ")
    print stop - start
Beispiel #35
0
def logistic_regression(inp):
    arr = xml_parser.xml_parser('logistic_regression_Params.xml', inp)
    n = len(arr)
    if n != 14:
        return False
    try:
        X, y = make_classification(n_samples=arr[0],
                                   n_features=arr[1],
                                   n_informative=arr[2],
                                   n_classes=arr[3])
    except ValueError:
        #	pass
        #        print("here")
        return False
    print(arr)
    # print("here")
    # try:
    #     with parallel_backend(backend):
    #         print("here0")
    try:
        clf = LogisticRegression(penalty=arr[6],
                                 dual=arr[7],
                                 tol=arr[8],
                                 C=arr[9],
                                 fit_intercept=arr[10],
                                 intercept_scaling=arr[11],
                                 solver=arr[5],
                                 n_jobs=arr[4],
                                 max_iter=arr[12],
                                 multi_class=arr[13])
        clf.fit(X, y)


#        print("here1")
    except ValueError:
        #	pass
        #        print("here2")
        return False
    # except KeyError:
    #     # print("here3")
    #     return False
    return True
def test_max_iter():
    # Test that the maximum number of iteration is reached
    X, y_bin = iris.data, iris.target.copy()
    y_bin[y_bin == 2] = 0

    solvers = ['newton-cg', 'liblinear', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    for max_iter in range(1, 5):
        for solver in solvers:
            for multi_class in ['ovr', 'multinomial']:
                if solver == 'liblinear' and multi_class == 'multinomial':
                    continue
                lr = LogisticRegression(max_iter=max_iter, tol=1e-15,
                                        multi_class=multi_class,
                                        random_state=0, solver=solver)
                lr.fit(X, y_bin)
                assert_equal(lr.n_iter_[0], max_iter)
Beispiel #37
0
def main():
    classes = ['chimp', 'corvette', 'tokyo', 'goldengatebridge']

    images, labels = get_labels(classes)
    std_features = get_standard_features(images)

    k = 256
    surf_features = get_visual_words(images, k)
    tas_features = get_tas_features(images)

    feature_dict = {
        'Std': std_features,
        'SURF': surf_features,
        'TAS': tas_features
        #'Zernike': zernike_features
    }

    best_features = log_classify(feature_dict, labels)
    classifier = LogisticRegression()
    classifier.fit(best_features, labels)
def getBestThreshold(features, labels_pooled, labels_current):
    print("length of pooled and current", len(labels_pooled),
          len(labels_current))
    maxent = LogisticRegression(penalty='l1')
    scores = {"F1": [], "Recall": [], "Accuracy": [], "Precision": []}
    thresholds = []

    print('Finding best thresholds...')
    fold = 1
    #    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(labels_pooled, n_folds=2, shuffle=False, random_state=None):
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(
            labels_pooled, n_folds=10, shuffle=False, random_state=None):
        #    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        print('\r' + str(fold), end="")
        fold += 1
        TrainX_i = features[TrainIndices]
        Trainy_i = labels_pooled[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i = labels_current[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        #get prediction
        thresh_i, ypred_i, score = optimize_threshold(maxent, TestX_i, Testy_i)
        thresholds.append(thresh_i)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")

    return maxent, np.array(thresholds)
Beispiel #39
0
def getBestThreshold(X, y_current_tr, y_current_te, regularization='l2'):
    assert len(X) == len(y_current_tr) == len(
        y_current_te
    ), 'Number of features ({}), annotator1 labels ({}) and annotator2 labels ({}) is not equal!'.format(
        len(X), len(y_current_tr), len(y_current_te))
    maxent = LogisticRegression(penalty=regularization)
    scores = {"F1": [], "Recall": [], "Accuracy": [], "Precision": []}
    thresholds = []

    print('Finding best thresholds...')
    fold = 1
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(
            y_current_tr, n_folds=10, shuffle=False, random_state=None):
        print('\r' + str(fold), end="")
        fold += 1
        TrainX_i = X[TrainIndices]
        Trainy_i = y_current_tr[TrainIndices]

        TestX_i = X[TestIndices]
        Testy_i = y_current_te[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        #get prediction
        thresh_i, ypred_i, score = optimize_threshold(maxent, TestX_i, Testy_i)
        thresholds.append(thresh_i)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")

    return maxent, np.array(thresholds)
def predictions():
    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model.logistic import LogisticRegression
    from sklearn.cross_validation import train_test_split, cross_val_score

    df = pd.read_csv('data/SMSSpamCollection', delimiter='\t', header=None)
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])
    #75% goes to training set and 25% goes to test set

    vactorizer = TfidfVectorizer()
    X_train = vactorizer.fit_transform(X_train_raw)
    X_test = vactorizer.transform(X_test_raw)

    #the model
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    for i, prediction in enumerate(predictions[:5]):
        print 'Prediction: %s. Message: %s' % (prediction, X_test_raw[i])
Beispiel #41
0
    def regress_on_words(self, word_index, X):
        """
        word: The word that we are interested in
        text_corpus: input
        """
        labels = []
        tmp_X = X  # Avoid directly changing the variable
        for idx, sentence in enumerate(X):
            if (sentence[word_index] == 1):
                # tmp_X[idx][word_index] = 0
                labels.append(1)
            else:
                labels.append(0)

        # Build the logistic regression model
        log_reg = LogisticRegression()
        log_reg.fit(tmp_X, labels)

        probs = log_reg.predict_proba(tmp_X)[:, -1]

        return probs
Beispiel #42
0
def validate_model(X, y, N, digit, classifier):
    """
    This function validate the model by K-fold cross validation and print the ROC curves
    in the result folder

    :param X: nparray, one row is one sample
    :param y: nparray, labels
    :param out: output filename
    :param N: number of CPU cores to use
    :param digit: digit of the captcha
    :param classifier: which classifier to use
    :return: None
    """
    # K-fold cross validation
    folds = KFold(n_splits=5, shuffle=True, random_state=1234567).split(X)
    fold_r = fold_result()
    labels = np.unique(y)
    category_rs = [None] * len(labels)
    for label in labels:
        category_rs[label] = category_result()
    for train, test in folds:
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        if (classifier == 'Logistic'):
            clf = LogisticRegression(solver='sag', n_jobs=N)
        else:
            clf = RandomForestClassifier(n_estimators=200,
                                         random_state=1234567,
                                         n_jobs=N)
        clf.fit(X_train, y_train)
        probas = clf.predict_proba(X_test)
        fold_r.append(clf, X_train, y_train, X_test, y_test)
        for label in labels:
            category_rs[label].append(y_test, probas[:, label], label)
    fold_r.print_score(digit)
    # print and save ROCS
    for label in labels:
        category_rs[label].print_result(label, digit)
Beispiel #43
0
def test_dtype_match():
    # Test that np.float32 input data is not cast to np.float64 when possible

    X_32 = np.array(X).astype(np.float32)
    y_32 = np.array(Y1).astype(np.float32)
    X_64 = np.array(X).astype(np.float64)
    y_64 = np.array(Y1).astype(np.float64)
    X_sparse_32 = sp.csr_matrix(X, dtype=np.float32)

    for solver in ['newton-cg']:
        for multi_class in ['ovr', 'multinomial']:

            # Check type consistency
            lr_32 = LogisticRegression(solver=solver, multi_class=multi_class)
            lr_32.fit(X_32, y_32)
            assert_equal(lr_32.coef_.dtype, X_32.dtype)

            # check consistency with sparsity
            lr_32_sparse = LogisticRegression(solver=solver,
                                              multi_class=multi_class)
            lr_32_sparse.fit(X_sparse_32, y_32)
            assert_equal(lr_32_sparse.coef_.dtype, X_sparse_32.dtype)

            # Check accuracy consistency
            lr_64 = LogisticRegression(solver=solver, multi_class=multi_class)
            lr_64.fit(X_64, y_64)
            assert_equal(lr_64.coef_.dtype, X_64.dtype)
            assert_almost_equal(lr_32.coef_, lr_64.coef_.astype(np.float32))
Beispiel #44
0
def weighted_params():
    with open('../../../../baselines/GSBC/YELPNYC/finalgrps.json', 'r') as fp:
        grps = json.load(fp)
        X = []
        Y = []
        c = 0
        tot = 0
        mc = 0
        # for grp in grps:
        #     scorepred=grps[grp]['scorepred']
        #     if len(grps[grp]['users'])>1 and (sum(scorepred[:5]))/6.0>0.4:
        #         tot=tot+1
        for grp in grps:
            scorepred = grps[grp]['scorepred']
            if len(grps[grp]['users']) > 1:
                mc = mc + 1
        # mc=len(grps)
        for grp in grps:
            scorepred = grps[grp]['scorepred']
            if len(grps[grp]['users']) > 1:
                X.append(grps[grp]['scorepred'][:-2])
                if c < int(0.72 * (mc)):
                    Y.append(0)
                else:
                    Y.append(1)
                c = c + 1
        classifier = LogisticRegression()
        classifier.fit(X, Y)

        for grp in grps:
            scorepred = grps[grp]['scorepred']
            if len(grps[grp]['users']) > 1:
                if grps[grp]['id'] not in ccgroups:
                    ccgroups[grps[grp]['id']] = 0
                    gtgroups[grps[grp]['id']] = 0
                ccgroups[grps[grp]['id']] = sum([
                    grps[grp]['scorepred'][i] * classifier.coef_[0][i]
                    for i in range(8)
                ]) / 8.0
                gtgroups[grps[grp]['id']] = grps[grp]['scoregt']
Beispiel #45
0
def test_logreg_l1_sparse_data():
    # Because liblinear penalizes the intercept and saga does not, we do not
    # fit the intercept to make it possible to compare the coefficients of
    # the two models at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    X_noise = rng.normal(scale=0.1, size=(n_samples, 3))
    X_constant = np.zeros(shape=(n_samples, 2))
    X = np.concatenate((X, X_noise, X_constant), axis=1)
    X[X < 1] = 0
    X = sparse.csr_matrix(X)

    lr_liblinear = LogisticRegression(penalty="l1", C=1.0, solver='liblinear',
                                      fit_intercept=False,
                                      tol=1e-10)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                 fit_intercept=False,
                                 max_iter=1000, tol=1e-10)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)
    # Noise and constant features should be regularized to zero by the l1
    # penalty
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))

    # Check that solving on the sparse and dense data yield the same results
    lr_saga_dense = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                       fit_intercept=False,
                                       max_iter=1000, tol=1e-10)
    lr_saga_dense.fit(X.toarray(), y)
    assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_)
Beispiel #46
0
def gbdt_lr_train(libsvmFileName):

    # load样本数据
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # 训练/测试数据分割
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.3, random_state = 42)

    # 定义GBDT模型
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,max_features=0.5)

    # 训练学习
    gbdt.fit(X_train, y_train)

    # 预测及AUC评测
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # lr对原始特征样本模型训练
    lr = LogisticRegression()
    lr.fit(X_train, y_train)    # 预测及AUC评测
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # GBDT编码原有特征
    X_train_leaves = gbdt.apply(X_train)[:,:,0]
    X_test_leaves = gbdt.apply(X_test)[:,:,0]

    # 对所有特征进行ont-hot编码
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # 定义LR模型
    lr = LogisticRegression()
    # lr对gbdt特征编码后的样本模型训练
    lr.fit(X_trans[:train_rows, :], y_train)
    # 预测及AUC评测
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # 定义LR模型
    lr = LogisticRegression(n_jobs=-1)
    # 组合特征
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # lr对组合特征的样本模型训练
    lr.fit(X_train_ext, y_train)

    # 预测及AUC评测
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
def _randomized_logistic(X, y, weights, mask, C=1., verbose=False,
                         fit_intercept=True, tol=1e-3):
    X = X[safe_mask(X, mask)]
    y = y[mask]
    if issparse(X):
        size = len(weights)
        weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size))
        X = X * weight_dia
    else:
        X *= (1 - weights)

    C = np.atleast_1d(np.asarray(C, dtype=np.float64))
    scores = np.zeros((X.shape[1], len(C)), dtype=np.bool)

    for this_C, this_scores in zip(C, scores.T):
        # XXX : would be great to do it with a warm_start ...
        clf = LogisticRegression(C=this_C, tol=tol, penalty='l1', dual=False,
                                 fit_intercept=fit_intercept)
        clf.fit(X, y)
        this_scores[:] = np.any(
            np.abs(clf.coef_) > 10 * np.finfo(np.float).eps, axis=0)
    return scores
Beispiel #48
0
def main():
    X = df_train.drop(['cust_id', 'y', 'cust_group'], axis=1, inplace=False)
    y = df_train['y']
    X_train,X_test , y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(X_train.shape, X_test.shape)
    # X_train=extract_feature(X_train,y_train)
    clf=LogisticRegression(C=1.0,max_iter=100,random_state=10)

    print("===="*20)
    clf.fit(X_train, y_train)
    prob  = clf.predict_proba(X_test)
    pred = np.argmax(prob, axis=1)
    print("mean_squared_error:", mean_squared_error(y_test, prob[:, 1]))
    print("log_loss:", log_loss(y_test.astype(int), prob[:, 1]))
    print("roc_auc_score:", roc_auc_score(y_test, prob[:, 1]))
    # high_danger_prob=prob[:, 1]
    # print(high_danger_prob)

    # print("调参")
    # tune_params(X_test, y_test)

    predict(clf)
Beispiel #49
0
def test_consistency_path():
    """Test that the path algorithm is consistent"""
    rng = np.random.RandomState(0)
    X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2)))
    y = [1] * 100 + [-1] * 100
    Cs = np.logspace(0, 4, 10)

    f = ignore_warnings
    # can't test with fit_intercept=True since LIBLINEAR
    # penalizes the intercept
    for method in ('lbfgs', 'newton-cg', 'liblinear'):
        coefs, Cs = f(logistic_regression_path)(X,
                                                y,
                                                Cs=Cs,
                                                fit_intercept=False,
                                                tol=1e-16,
                                                solver=method)
        for i, C in enumerate(Cs):
            lr = LogisticRegression(C=C, fit_intercept=False, tol=1e-16)
            lr.fit(X, y)
            lr_coef = lr.coef_.ravel()
            assert_array_almost_equal(lr_coef, coefs[i], decimal=4)

    # test for fit_intercept=True
    for method in ('lbfgs', 'newton-cg', 'liblinear'):
        Cs = [1e3]
        coefs, Cs = f(logistic_regression_path)(X,
                                                y,
                                                Cs=Cs,
                                                fit_intercept=True,
                                                tol=1e-4,
                                                solver=method)
        lr = LogisticRegression(C=Cs[0],
                                fit_intercept=True,
                                tol=1e-4,
                                intercept_scaling=10000)
        lr.fit(X, y)
        lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_])
        assert_array_almost_equal(lr_coef, coefs[0], decimal=4)
Beispiel #50
0
def logistic_model_by_sk_learn():
    """
    使用sklearn实现
    """
    data = pd.read_csv('./data/03_data_model/train_data_woe.csv')
    # 应变量
    Y = data['SeriousDlqin2yrs']
    # 自变量,剔除对因变量影响不明显的变量
    X = data.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ],
                  axis=1)
    logistic = LogisticRegression()
    logistic.fit(X,
                 Y,
                 sample_weight=(np.array(
                     X['RevolvingUtilizationOfUnsecuredLines'],
                     X['NumberOfTimes90DaysLate'])))
    print(logistic.coef_[0])
    return logistic
Beispiel #51
0
def _randomized_logistic(X,
                         y,
                         weights,
                         mask,
                         C=1.,
                         verbose=False,
                         fit_intercept=True,
                         tol=1e-3):
    X = X[safe_mask(X, mask)]
    y = y[mask]
    if issparse(X):
        size = len(weights)
        weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size))
        X = X * weight_dia
    else:
        X *= (1 - weights)

    C = np.atleast_1d(np.asarray(C, dtype=np.float64))
    if C.ndim > 1:
        raise ValueError(
            "C should be 1-dimensional array-like, "
            "but got a {}-dimensional array-like instead: {}.".format(
                C.ndim, C))

    scores = np.zeros((X.shape[1], len(C)), dtype=np.bool)

    for this_C, this_scores in zip(C, scores.T):
        # XXX : would be great to do it with a warm_start ...
        clf = LogisticRegression(C=this_C,
                                 tol=tol,
                                 penalty='l1',
                                 dual=False,
                                 fit_intercept=fit_intercept,
                                 solver='liblinear',
                                 multi_class='ovr')
        clf.fit(X, y)
        this_scores[:] = np.any(
            np.abs(clf.coef_) > 10 * np.finfo(np.float).eps, axis=0)
    return scores
def test_warm_start():
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.
    # Warm starting does not work with liblinear solver.
    X, y = iris.data, iris.target

    solvers = ['newton-cg', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    for warm_start in [True, False]:
        for fit_intercept in [True, False]:
            for solver in solvers:
                for multi_class in ['ovr', 'multinomial']:
                    if solver == 'sag' and multi_class == 'multinomial':
                        break
                    clf = LogisticRegression(tol=1e-4,
                                             multi_class=multi_class,
                                             warm_start=warm_start,
                                             solver=solver,
                                             random_state=42,
                                             max_iter=100,
                                             fit_intercept=fit_intercept)
                    clf.fit(X, y)
                    coef_1 = clf.coef_

                    clf.max_iter = 1
                    with ignore_warnings():
                        clf.fit(X, y)
                    cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
                    msg = ("Warm starting issue with %s solver in %s mode "
                           "with fit_intercept=%s and warm_start=%s" %
                           (solver, multi_class, str(fit_intercept),
                            str(warm_start)))
                    if warm_start:
                        assert_greater(2.0, cum_diff, msg)
                    else:
                        assert_greater(cum_diff, 2.0, msg)
Beispiel #53
0
def test_saga_vs_liblinear():
    iris = load_iris()
    X, y = iris.data, iris.target
    X = np.concatenate([X] * 10)
    y = np.concatenate([y] * 10)

    X_bin = X[y <= 1]
    y_bin = y[y <= 1] * 2 - 1

    X_sparse, y_sparse = make_classification(n_samples=50, n_features=20,
                                             random_state=0)
    X_sparse = sparse.csr_matrix(X_sparse)

    for (X, y) in ((X_bin, y_bin), (X_sparse, y_sparse)):
        for penalty in ['l1', 'l2']:
            n_samples = X.shape[0]
            # alpha=1e-3 is time consuming
            for alpha in np.logspace(-1, 1, 3):
                saga = LogisticRegression(
                    C=1. / (n_samples * alpha),
                    solver='saga',
                    multi_class='ovr',
                    max_iter=200,
                    fit_intercept=False,
                    penalty=penalty, random_state=0, tol=1e-24)

                liblinear = LogisticRegression(
                    C=1. / (n_samples * alpha),
                    solver='liblinear',
                    multi_class='ovr',
                    max_iter=200,
                    fit_intercept=False,
                    penalty=penalty, random_state=0, tol=1e-24)

                saga.fit(X, y)
                liblinear.fit(X, y)
                # Convergence for alpha=1e-3 is very slow
                assert_array_almost_equal(saga.coef_, liblinear.coef_, 3)
Beispiel #54
0
class DetectMalicious():
    def __init__(self):
        benign_data = np.loadtxt('javascript-collection/benignjs.csv',
                                 delimiter=',',
                                 dtype=np.int32)
        evil_data = np.loadtxt('javascript-collection/eviljs.csv',
                               delimiter=',',
                               dtype=np.int32)
        train_data = np.concatenate((benign_data, evil_data), axis=0)
        self.score_template = 'TPR %(TPR)f\tFPR %(FPR)f\tAccuracy %(Accuracy)f\tAUC %(AUC)f'
        self.D = LogisticRegression()
        self.D.fit(train_data[:, :-1], train_data[:, -1])
        self.jsapi = []
        for line in open('javascript-collection/jsapi.txt'):
            self.jsapi.append(line.strip('\n'))

    def predict(self, X):
        flag = [0 for x in range(len(self.jsapi))]
        for i in range(len(self.jsapi)):
            if X.find(self.jsapi[i]) != -1:
                flag[i] = 1
        print self.D.predict([flag])
        return self.D.predict([flag])
def getLRAcc(my_data, train_ratio):

    total_rows = numpy.size(my_data, 0)
    total_cols = numpy.size(my_data, 1)
    train_rows = int(total_rows * train_ratio)

    numpy.random.shuffle(my_data)
    training, test = my_data[:train_rows, :], my_data[train_rows:, :]

    XTrain = training[:, 1:total_cols - 1]
    YTrain = training[:, total_cols - 1]

    XTest = test[:, 1:total_cols - 1]
    YTest = test[:, total_cols - 1]

    lrMulti = LogisticRegression()
    lrMulti.fit(XTrain, YTrain)
    pred_Y = lrMulti.predict(XTest)

    error = numpy.sum(YTest != pred_Y)
    nb_accuracy = (float(total_rows - error) / total_rows) * 100

    return nb_accuracy
def test_database_reconstruction_logistic_regression(get_iris_dataset):
    (x_train_iris, y_train_iris), (x_test_iris, y_test_iris) = get_iris_dataset
    y_train_iris = np.array([np.argmax(y) for y in y_train_iris])
    y_test_iris = np.array([np.argmax(y) for y in y_test_iris])

    x_private = x_test_iris[0, :].reshape(1, -1)
    y_private = y_test_iris[0]

    x_input = np.vstack((x_train_iris, x_private))
    y_input = np.hstack((y_train_iris, y_private))

    nb_private = LogisticRegression()
    nb_private.fit(x_input, y_input)
    estimator_private = ScikitlearnLogisticRegression(model=nb_private)

    recon = DatabaseReconstruction(estimator=estimator_private)
    x_recon, y_recon = recon.reconstruct(x_train_iris, y_train_iris)

    assert x_recon is not None
    assert x_recon.shape == (1, 4)
    assert y_recon.shape == (1, 3)
    assert np.isclose(x_recon, x_private, rtol=0.05).all()
    assert np.argmax(y_recon, axis=1) == y_private
def test_logistic_regression_multinomial():
    # Tests for the multinomial option in logistic regression

    # Some basic attributes of Logistic Regression
    n_samples, n_features, n_classes = 50, 20, 3
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=10,
                               n_classes=n_classes,
                               random_state=0)
    clf_int = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf_int.fit(X, y)
    assert_array_equal(clf_int.coef_.shape, (n_classes, n_features))

    clf_wint = LogisticRegression(solver='lbfgs',
                                  multi_class='multinomial',
                                  fit_intercept=False)
    clf_wint.fit(X, y)
    assert_array_equal(clf_wint.coef_.shape, (n_classes, n_features))

    # Similar tests for newton-cg solver option
    clf_ncg_int = LogisticRegression(solver='newton-cg',
                                     multi_class='multinomial')
    clf_ncg_int.fit(X, y)
    assert_array_equal(clf_ncg_int.coef_.shape, (n_classes, n_features))

    clf_ncg_wint = LogisticRegression(solver='newton-cg',
                                      fit_intercept=False,
                                      multi_class='multinomial')
    clf_ncg_wint.fit(X, y)
    assert_array_equal(clf_ncg_wint.coef_.shape, (n_classes, n_features))

    # Compare solutions between lbfgs and newton-cg
    assert_almost_equal(clf_int.coef_, clf_ncg_int.coef_, decimal=3)
    assert_almost_equal(clf_wint.coef_, clf_ncg_wint.coef_, decimal=3)
    assert_almost_equal(clf_int.intercept_, clf_ncg_int.intercept_, decimal=3)

    # Test that the path give almost the same results. However since in this
    # case we take the average of the coefs after fitting across all the
    # folds, it need not be exactly the same.
    for solver in ['lbfgs', 'newton-cg']:
        clf_path = LogisticRegressionCV(solver=solver,
                                        multi_class='multinomial',
                                        Cs=[1.])
        clf_path.fit(X, y)
        assert_array_almost_equal(clf_path.coef_, clf_int.coef_, decimal=3)
        assert_almost_equal(clf_path.intercept_, clf_int.intercept_, decimal=3)
Beispiel #58
0
def cvWithThreshold(X,
                    y_current_tr,
                    y_current_te,
                    threshold,
                    regularization='l2'):
    out_dict = {}
    scores = defaultdict(list)
    fold = 1
    maxent = LogisticRegression(penalty=regularization)
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(
            y_current_tr, n_folds=10, shuffle=False, random_state=None):
        print('\r' + str(fold), end="")
        fold += 1
        TrainX_i = X[TrainIndices]
        Trainy_i = y_current_tr[TrainIndices]
        TestX_i = X[TestIndices]
        Testy_i = y_current_te[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        ypred_i, score = pred_for_threshold(maxent, TestX_i, Testy_i,
                                            threshold)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        out_dict[key] = (currentmetric.mean(), currentmetric.std())
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")
    return out_dict
Beispiel #59
0
def predictWithThreshold(datadir, threshold, penalty_type='l2'):
    maxent = LogisticRegression(penalty=penalty_type)
    scores = defaultdict(list)
    for dir in sorted(os.listdir(datadir), reverse=True):
        trainfeatures, trainlabels, vec = feats_and_classify.collect_features(
            datadir + dir + '/train.conll')
        TrainIndices = np.array(range(len(trainfeatures)))
        features, labels, vec = feats_and_classify.collect_features(
            datadir + dir + '/all.conll')
        TestIndices = np.array(range(len(trainfeatures), len(features)))
        #		print('\r'+dir, end="")
        #		print(dir)
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        #		print('Finished fitting')
        ypred_i, score = pred_for_threshold(maxent, TestX_i, Testy_i,
                                            threshold)
        #		print('Predicting')

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")
Beispiel #60
0
 def gbdt_lr_train(self, Train_tab, Train_libsvm):
     # load样本数据
     X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
     # 训练/测试数据分割
     X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                         y_all,
                                                         test_size=0.1,
                                                         random_state=42)
     # 定义GBDT模型
     gbdt = GradientBoostingClassifier(n_estimators=40,
                                       max_depth=3,
                                       verbose=0,
                                       max_features=0.5)
     # 训练模型
     gbdt.fit(X_train, y_train)
     # GBDT编码原有特征
     X_train_leaves = gbdt.apply(X_train)[:, :, 0]
     X_test_leaves = gbdt.apply(X_test)[:, :, 0]
     # 对所有特征进行ont-hot编码
     (train_rows, cols) = X_train_leaves.shape
     gbdtenc = OneHotEncoder()
     X_trans = gbdtenc.fit_transform(
         np.concatenate((X_train_leaves, X_test_leaves), axis=0))
     # 定义LR模型
     lr = LogisticRegression(n_jobs=-1)
     # 组合特征
     X_train_ext = hstack([X_trans[:train_rows, :], X_train])
     X_test_ext = hstack([X_trans[train_rows:, :], X_test])
     # lr对组合特征的样本模型训练
     lr.fit(X_train_ext, y_train)
     # 预测及AUC评测
     filename = 'finalized_model.sav'
     pickle.dump(lr, open(filename, 'wb'))
     # load the model from disk
     loaded_model = pickle.load(open(filename, 'rb'))
     y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
     print(y_pred_gbdtlr2)