Ejemplo n.º 1
0
def gbdt_lr_train(libsvmFileName):
    """Train and compare GBDT, LR and two GBDT+LR hybrids on a libsvm file.

    The data is split 70/30 into train and test sets and the test-set AUC is
    printed for four models: the GBDT alone, LR on the raw features, LR on
    the one-hot encoded GBDT leaf indices, and LR on the encoded leaves
    concatenated with the raw features.

    Args:
        libsvmFileName: path of the svmlight/libsvm file to load.

    Returns:
        None. All results are printed.
    """
    # Load the samples and hold out 30% of them for evaluation.
    X_all, y_all = load_svmlight_file(libsvmFileName)
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # GBDT on the raw features.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode every sample as the leaf index it reaches in each tree.
    # NOTE(review): index 0 on the last axis presumably selects the single
    # output of a binary problem -- confirm for multi-class targets.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode the leaf indices. The encoder is fitted on the training
    # leaves only so that nothing about the test set leaks into the feature
    # space; a leaf index seen only at test time is encoded as all zeros.
    gbdtenc = OneHotEncoder(handle_unknown='ignore')
    X_train_enc = gbdtenc.fit_transform(X_train_leaves)
    X_test_enc = gbdtenc.transform(X_test_leaves)

    # LR on the GBDT-encoded features.
    lr = LogisticRegression()
    lr.fit(X_train_enc, y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_test_enc)[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on the encoded leaves combined with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_train_enc, X_train])
    X_test_ext = hstack([X_test_enc, X_test])

    print(X_train_ext.shape)
    lr.fit(X_train_ext, y_train)

    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
def main():
    """Train or evaluate a logistic-regression credit-scoring model.

    Steps:
      1. Load the training and test CSVs, mapping the sentinel values in
         ``col_nas`` to NaN (96/98 in the three past-due counters, 0 in age).
      2. Split the training data with a stratified shuffle split into a new
         training part and a held-out validation part.
      3. Replace NaN with the training-column mean and drop feature column 5
         ('NOCredit', given the ``colnames`` order once ID/label are popped).
      4. If ``lr_model.m`` does not exist, fit a class-balanced LR, persist it
         and report the AUC on the training part; otherwise load the saved
         model and report the AUC on the validation part.
    """
    colnames = [
        'ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 'DebtRatio',
        'Income', 'NOCredit', 'NOTimes90', 'NORealEstate', 'NOTime60-89',
        'NODependents'
    ]
    col_nas = [
        '', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA',
        [98, 96], 'NA'
    ]
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv",
                          names=colnames,
                          na_values=col_na_values,
                          skiprows=[0])
    # pop() also removes the column, so only features remain in the frame.
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    # DataFrame.as_matrix() was removed from pandas; .values is equivalent.
    x_train = dftrain.values

    dftest = pd.read_csv("./data/cs-test.csv",
                         names=colnames,
                         na_values=col_na_values,
                         skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.values

    # Stratified split of the training data (n_splits=1 yields one pair).
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    train_index, test_index = next(sss.split(x_train, y_train))
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train_new, x_test_new = x_train[train_index], x_train[test_index]
    y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new

    # Impute NaN with the column means learned on the new training part only.
    # NOTE(review): Imputer was removed from recent scikit-learn releases in
    # favour of sklearn.impute.SimpleImputer -- confirm the pinned version.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)

    # Drop the same feature column from every matrix so each one matches the
    # feature layout the model is trained on.
    x_train = np.delete(x_train, 5, axis=1)
    x_test_new = np.delete(x_test_new, 5, axis=1)
    x_test = np.delete(x_test, 5, axis=1)

    if not os.path.isfile("lr_model.m"):
        clf = LogisticRegression(class_weight="balanced")
        clf.fit(x_train, y_train)
        joblib.dump(clf, "lr_model.m")
        predicted_probs_train = clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
    else:
        clf = joblib.load("lr_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new)
Ejemplo n.º 3
0
def lr_training_and_test(X_train, X_test, y_train, y_test):
    """Fit a default logistic regression and evaluate it on both splits.

    Hard predictions and positive-class probabilities (column 1 of
    ``predict_proba``) for the train and test sets are handed to
    ``evaluate_model``.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on Python 3).
    print('model: logistic regression.')
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    y_train_pred_prob = model.predict_proba(X_train)[:, 1]
    y_test_pred_prob = model.predict_proba(X_test)[:, 1]

    evaluate_model(y_train, y_train_pred, y_train_pred_prob, y_test, y_test_pred, y_test_pred_prob)
    return model
class LogReg:
    """Class-balanced logistic regression that loads, fits and predicts on construction."""

    def __init__(self):
        # The whole pipeline runs eagerly: data first, then model, then scores.
        self.load_data()
        self.clf = LogisticRegression(class_weight='balanced')
        self.train()
        self.predict()

    def load_data(self):
        """Read the train/test CSVs; column 0 is the target (train) or the id (test)."""
        train_values = pd.read_csv('./data/train.csv', header=0).values
        test_values = pd.read_csv('./data/test.csv', header=0).values
        self.train_X = train_values[:, 1:]
        self.train_Y = train_values[:, 0]
        self.test_X = test_values[:, 1:]
        self.test_ID = test_values[:, 0]

    def train(self):
        """Fit the classifier on the loaded training data."""
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        """Store the class-probability matrix for the test rows in ``test_Y``."""
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        """Return the classifier's ``score`` on the training data."""
        accuracy = self.clf.score(self.train_X, self.train_Y)
        return accuracy

    def store_result(self):
        """Write test ids and the second probability column to the result CSV."""
        df_out = pd.DataFrame()
        df_out['Id'] = self.test_ID
        df_out['Action'] = self.test_Y[:, 1]
        df_out.to_csv('./data/results/c1_result.csv', index=False)
Ejemplo n.º 5
0
def get_second_level(train_dim, train_label, test_dim, num_class):
    """Fit a logistic regression on the first-level meta features.

    Returns:
        ``(meta_train, meta_test, pre_score)`` where the first two come from
        ``get_first_level`` and ``pre_score`` is the class-probability matrix
        predicted for ``meta_test``.
    """
    first_level = get_first_level(train_dim, train_label, test_dim, num_class)
    meta_train, meta_test = first_level

    stacker = LogisticRegression()
    stacker.fit(meta_train, train_label)
    return meta_train, meta_test, stacker.predict_proba(meta_test)
Ejemplo n.º 6
0
def logic_pca_standard(y, n):
    """Logistic regression after PCA reduction and standardisation.

    Reads the module-level ``train`` feature matrix, reduces it to ``n``
    principal components, splits 75/25, standardises, fits a logistic
    regression and prints accuracy, a classification report and the AUC.

    Args:
        y: target labels aligned with the rows of ``train``.
        n: value passed to ``PCA(n_components=...)``.
    """
    # Dimensionality reduction on the full feature matrix.
    reducer = PCA(n_components=n)
    reduced = reducer.fit_transform(train)

    # Train/test split.
    x_train, x_test, y_train, y_test = train_test_split(
        reduced, y, test_size=0.25, random_state=24)

    # Standardise using statistics of the training part only.
    scaler = StandardScaler()
    print(scaler)
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Estimator.
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Accuracy on the held-out part.
    accuracy = model.score(x_test, y_test)
    print("准确率(逻辑回归+降维+标准化):{}".format(accuracy))

    report = classification_report(y_test,
                                   model.predict(x_test),
                                   labels=[0, 1],
                                   target_names=["非高收入", "高收入"])
    print("精确率和召回率:", report)

    # ROC / AUC from the probability of the second class.
    probabilities = model.predict_proba(x_test)
    fpr, tpr, _ = metrics.roc_curve(y_test, probabilities[:, 1])
    print("auc值为:{}".format(metrics.auc(fpr, tpr)))
Ejemplo n.º 7
0
class Ensemble:
    """Stacking ensemble: a meta estimator trained on base-model probabilities.

    Only the first ``predict_proba`` column of every base estimator is fed to
    the meta estimator.
    """

    def __init__(self, base_estimators=None, random_state=0):
        self.base_estimators = base_estimators
        self.estimator = MetaEstimator()
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the meta estimator on 5-fold out-of-fold predictions, then refit the bases on all data."""
        folds = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
        first_columns = []
        for base in self.base_estimators:
            out_of_fold = cross_val_predict(base, X, y, cv=folds,
                                            method='predict_proba')
            print('prediction of', base.__class__.__name__)
            print(out_of_fold)
            first_columns.append(out_of_fold[:, 0])

        stacked = np.array(first_columns)
        print('all predictions')
        print(stacked, y)
        # One row per sample, one column per base estimator.
        self.estimator.fit(stacked.T, y)

        for base in self.base_estimators:
            base.fit(X, y)

    def predict(self, X, margin):
        """Return the element-wise comparison ``predict_proba(X) > margin``."""
        probabilities = np.array(self.predict_proba(X))
        return probabilities > margin

    def predict_proba(self, X):
        """Return the meta estimator's probabilities for the stacked base outputs."""
        first_columns = [
            base.predict_proba(X)[:, 0] for base in self.base_estimators
        ]
        return self.estimator.predict_proba(np.array(first_columns).T)
def main():
    """Train a maxent (logistic regression) CWI classifier and write predictions.

    Features of one annotator's training file and of the test file are
    vectorised together, the model is fitted on the training rows, and a
    "1"/"0" label per test row is written to ``<annotator>.pred``. A row is
    labelled "1" when the probability in column 1 of ``predict_proba`` is at
    least ``--threshold``. The process then exits with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--annotator', type=str, default="03")
    parser.add_argument('--penalty', type=str, choices=["l1", "l2"], default="l1")

    args = parser.parse_args()
    current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+args.annotator+".lbl.conll"
    testfile = scriptdir+"/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X__dict_train, y_train, v_train = feats_and_classify.collect_features(current_single_ann, vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(testfile, vectorize=False)

    # Vectorise train and test feature dicts in one pass so both share the
    # same columns, then split the matrix back at the train/test boundary.
    featdicts = list(X__dict_train + X_dict_test)
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train = X[:len(y_train)]
    X_test = X[len(y_train):]

    # liblinear handles both "l1" and "l2". Naming it explicitly keeps the
    # default --penalty=l1 working on scikit-learn releases whose default
    # solver (lbfgs) rejects the l1 penalty.
    maxent = LogisticRegression(penalty=args.penalty, solver="liblinear")
    maxent.fit(X_train, y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    ypred_i = ["1" if pair[1] >= args.threshold else "0" for pair in y_pred_proba]

    # The context manager closes the file even if writing fails.
    with open(args.annotator+".pred", mode="w") as fout:
        print("\n".join(ypred_i), file=fout)
    sys.exit(0)
Ejemplo n.º 9
0
class LogisticRegressionImpl():
    """Thin wrapper that builds an ``SKLModel`` from keyword hyperparameters and delegates to it."""

    def __init__(self, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='balanced', random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=None):
        # Keep the exact hyperparameters around; they are forwarded verbatim.
        self._hyperparams = dict(
            penalty=penalty,
            dual=dual,
            tol=tol,
            C=C,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            random_state=random_state,
            solver=solver,
            max_iter=max_iter,
            multi_class=multi_class,
            verbose=verbose,
            warm_start=warm_start,
            n_jobs=n_jobs,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (``y`` is forwarded only when given) and return ``self``."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)
Ejemplo n.º 10
0
def test_logreg_predict_proba_multinomial():
    """Multinomial LR should reach a lower log-loss than OvR and than per-class logistic probabilities."""
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    multinomial = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    multinomial.fit(X, y)
    multinomial_loss = log_loss(y, multinomial.predict_proba(X))

    one_vs_rest = LogisticRegression(multi_class="ovr", solver="lbfgs")
    one_vs_rest.fit(X, y)
    one_vs_rest_loss = log_loss(y, one_vs_rest.predict_proba(X))
    assert_greater(one_vs_rest_loss, multinomial_loss)

    # Predicted probabilities using the soft-max function should give a
    # smaller loss than those using the logistic function.
    softmax_loss = log_loss(y, multinomial.predict_proba(X))
    logistic_loss = log_loss(y, multinomial._predict_proba_lr(X))
    assert_greater(logistic_loss, softmax_loss)
Ejemplo n.º 11
0
def get_second_level(train_dim, train_label, test_dim, num_class):
    """Fit an L1-regularised LR on meta features fused with the raw features.

    The first-level outputs from ``get_first_level`` are concatenated
    column-wise with the original feature matrices before fitting.

    Returns:
        ``(meta_train_fusion, meta_test_fusion, pre_score)`` where
        ``pre_score`` is the class-probability matrix for the fused test set.
    """
    meta_train, meta_test = get_first_level(train_dim, train_label, test_dim,
                                            num_class)
    meta_train_fusion = np.concatenate((meta_train, train_dim), axis=1)
    meta_test_fusion = np.concatenate((meta_test, test_dim), axis=1)
    # liblinear supports the l1 penalty. Without an explicit solver, releases
    # of scikit-learn that default to lbfgs reject penalty="l1".
    LR = LogisticRegression(C=0.03125, penalty="l1", solver="liblinear")
    LR.fit(meta_train_fusion, train_label)
    pre_score = LR.predict_proba(meta_test_fusion)
    return meta_train_fusion, meta_test_fusion, pre_score
Ejemplo n.º 12
0
def main():
    """Compare LR, SVM and LightGBM on a 70/30 split of the training data.

    Data is loaded and preprocessed by ``load_data``, ``data_fillna`` and
    ``data_process``; an extra feature ``'My'`` (Age + Sex) is added. Each
    model is fitted on the training part and its validation AUC is printed.
    The LR coefficients, scaled so the largest is 100, are printed as
    feature importances.
    """
    train_data, test_data = load_data()
    train_data, test_data = data_fillna(train_data, test_data)
    train_data, test_data = data_process(train_data, test_data)
    # Feature selection.
    features = [
        'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'My'
    ]
    train_data['My'] = train_data['Age'] + train_data['Sex']
    test_data['My'] = test_data['Age'] + test_data['Sex']

    train_labels = train_data['Survived']
    train_features = train_data[features]
    # train_test_split returns (X_train, X_test, y_train, y_test).
    x_fit, x_val, y_fit, y_val = train_test_split(train_features,
                                                  train_labels,
                                                  test_size=0.3,
                                                  random_state=1)
    test_features = test_data[features]  # not used by the comparison below

    LR = LogisticRegression(max_iter=100,
                            verbose=True,
                            random_state=33,
                            tol=1e-4)
    LR.fit(x_fit, y_fit)
    predict = LR.predict_proba(x_val)[:, 1]
    feature_importance = LR.coef_[0]
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    print('feature importance is:')
    print(feature_importance)
    print("LR auc:%0.6lf" % metrics.roc_auc_score(y_val, predict))

    SVM = SVC(kernel='rbf', probability=True, C=0.2)
    SVM.fit(x_fit, y_fit)
    predict_svm = SVM.predict_proba(x_val)[:, 1]
    print("SVM auc:%0.6lf" % metrics.roc_auc_score(y_val, predict_svm))

    LGB = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        verbose=0,
        learning_rate=0.01,
        num_leaves=31,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=2,
        lambda_l1=0.8,
        lambda_l2=0,
        max_depth=5,
        cat_smooth=1)
    LGB.fit(x_fit, y_fit)
    predict_LGB = LGB.predict_proba(x_val)[:, 1]
    lgb.plot_importance(LGB, max_num_features=30)
    # Score the LightGBM predictions; the LR predictions were reported here
    # by mistake.
    print("LGB auc:%0.6lf" % metrics.roc_auc_score(y_val, predict_LGB))
Ejemplo n.º 13
0
def test_logreg_predict_proba_multinomial():
    """Compare log-losses: multinomial vs. OvR, and soft-max vs. logistic probabilities."""
    X, y = make_classification(
        n_samples=10, n_features=20, random_state=0,
        n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    model_multi = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    model_multi.fit(X, y)
    loss_multi = log_loss(y, model_multi.predict_proba(X))

    model_ovr = LogisticRegression(multi_class="ovr", solver="lbfgs")
    model_ovr.fit(X, y)
    loss_ovr = log_loss(y, model_ovr.predict_proba(X))

    assert_greater(loss_ovr, loss_multi)

    # Predicted probabilities using the soft-max function should give a
    # smaller loss than those using the logistic function.
    loss_softmax = log_loss(y, model_multi.predict_proba(X))
    loss_logistic = log_loss(y, model_multi._predict_proba_lr(X))

    assert_greater(loss_logistic, loss_softmax)
Ejemplo n.º 14
0
def test_nnet(n_samples=200, n_features=5, distance=0.5, complete=False):
    """Smoke-test the nnet classifiers on two well-separated blobs.

    Args:
        n_samples: number of samples to generate.
        n_features: dimensionality of the two blob centres.
        distance: each centre sits at +/- ``distance`` on every axis.
        complete: if True, every (loss, network type, trainer) combination
            is trained and its AUC printed, then the test fails on purpose so
            the output is shown. Otherwise four rotating combinations are
            trained and each must reach a training AUC above 0.8.
    """
    X, y = make_blobs(n_samples=n_samples,
                      n_features=n_features,
                      centers=[
                          numpy.ones(n_features) * distance,
                          -numpy.ones(n_features) * distance
                      ])

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MLPClassifier,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5],
                                loss=loss,
                                trainer=trainer,
                                random_state=42)
                    nn.fit(X, y, epochs=100)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types, trainers, most of them are used once during tests.
        attempts = 4
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        # dict views cannot be indexed on Python 3, so materialise the keys.
        loss_names = list(nnet.losses.keys())
        trainer_names = list(nnet.trainers.keys())
        for attempt in range(attempts):
            loss = loss_names[(attempt + losses_shift) % len(loss_names)]
            trainer = trainer_names[(attempt + trainers_shift) %
                                    len(trainer_names)]

            nn_type = nn_types[attempt % len(nn_types)]

            nn = nn_type(layers=[5],
                         loss=loss,
                         trainer=trainer,
                         random_state=42)
            print(nn)
            nn.fit(X, y, epochs=200)
            assert roc_auc_score(y, nn.predict_proba(X)[:, 1]) > 0.8, \
                'quality of model is too low: {}'.format(nn)
Ejemplo n.º 15
0
def compare_nnets_quality(n_samples=200, n_features=7, distance=0.8):
    """Print the training AUC of every network type / trainer pair and of a plain LR.

    Only the 'log_loss' loss is exercised; network classes come from the
    module-level ``nn_types`` and trainers from ``nnet.trainers``.
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    loss = 'log_loss'  # the other entries of nnet.losses are not compared here
    for network_cls in nn_types:
        for trainer in nnet.trainers:
            net = network_cls(layers=[5], loss=loss, trainer=trainer, epochs=100, random_state=42)
            net.fit(X, y)
            net_auc = roc_auc_score(y, net.predict_proba(X)[:, 1])
            print(net_auc, net)

    # Logistic-regression baseline on the same sample.
    baseline = LogisticRegression()
    baseline.fit(X, y)
    baseline_auc = roc_auc_score(y, baseline.predict_proba(X)[:, 1])
    print(baseline_auc, baseline)
Ejemplo n.º 16
0
def test_nnet(n_samples=200, n_features=7, distance=0.8, complete=False):
    """
    Train nnet classifiers on a generated sample and check their training AUC.

    :param complete: if True, all possible combinations will be checked, and quality is printed
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features, distance=distance)

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MLPClassifier,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # Exhaustive sweep; fails on purpose afterwards so the output is shown.
        for loss in nnet.losses:
            for network_cls in nn_types:
                for trainer in nnet.trainers:
                    net = network_cls(layers=[5], loss=loss, trainer=trainer, random_state=42, epochs=100)
                    net.fit(X, y)
                    print(roc_auc_score(y, net.predict_proba(X)[:, 1]), net)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
        return

    # Rotate through losses, trainers and network types so that most of them
    # are used once; the starting offsets are random.
    attempts = max(len(nnet.losses), len(nnet.trainers), len(nn_types))
    losses_shift = numpy.random.randint(10)
    trainers_shift = numpy.random.randint(10)
    retry_attempts = 3  # each combination gets three differently seeded tries
    for attempt in range(attempts):
        loss = list(nnet.losses.keys())[(attempt + losses_shift) % len(nnet.losses)]
        trainer = list(nnet.trainers.keys())[(attempt + trainers_shift) % len(nnet.trainers)]
        nn_type = nn_types[attempt % len(nn_types)]

        for retry_attempt in range(retry_attempts):
            nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42 + retry_attempt, epochs=200)
            print(nn)
            nn.fit(X, y)
            quality = roc_auc_score(y, nn.predict_proba(X)[:, 1])
            # Called for its own sake: the value is not checked.
            nn.compute_loss(X, y)
            if quality > 0.8:
                break
            print('attempt {} : {}'.format(retry_attempt, quality))
            if retry_attempt == retry_attempts - 1:
                raise RuntimeError('quality of model is too low: {} {}'.format(quality, nn))
def classifierPrecission():
    """Evaluate a TF-IDF + logistic regression classifier on ``data/sms.csv``.

    Prints the 5-fold cross-validated precision, recall and F1 on the
    training split (mean followed by the per-fold scores), then plots the
    ROC curve of the held-out split using the second ``predict_proba``
    column.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_curve, auc
    try:
        from sklearn.model_selection import train_test_split, cross_val_score
    except ImportError:
        # Older scikit-learn releases only ship the cross_validation module.
        from sklearn.cross_validation import train_test_split, cross_val_score

    df = pd.read_csv('data/sms.csv')
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        df['message'], df['label'])
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)
    X_test = vectorizer.transform(X_test_raw)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # f1 = 2*P*R / (P + R); a perfect classifier scores 1.
    # Single-argument print(...) emits the same line on Python 2 and 3.
    for caption, scoring in (('precission:', 'precision'),
                             ('recalls:', 'recall'),
                             ('f1s:', 'f1')):
        scores = cross_val_score(classifier, X_train, y_train, cv=5,
                                 scoring=scoring)
        print('%s %s %s' % (caption, np.mean(scores), scores))

    # ROC curve: recall plotted against fall-out, F = FP / (TN + FP).
    predictions = classifier.predict_proba(X_test)
    false_positive_rate, recall, thresholds = roc_curve(
        y_test, predictions[:, 1])
    roc_auc = auc(false_positive_rate, recall)
    plt.title('ROC')
    plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('fall-out')
    plt.show()
Ejemplo n.º 18
0
    def fit_model_2(self, lol = .07, toWrite = False):
        """Fit an L1-regularised logistic regression on every CV fold and print its log-loss.

        Args:
            lol: value passed as ``C`` to ``LogisticRegression``.
            toWrite: if True, pickle the model (as fitted on the last fold)
                to ``model2/model.pkl``.
        """
        # liblinear supports the l1 penalty; scikit-learn releases whose
        # default solver is lbfgs reject penalty='l1' without it.
        model = LogisticRegression(C=lol, penalty='l1', tol=1e-6,
                                   solver='liblinear')

        for X_train, X_test, Y_train, Y_test in self.cv_data:
            X_train, Y_train = self.balance_data(X_train, Y_train)
            model.fit(X_train, Y_train)
            pred = model.predict_proba(X_test)[:, 1]
            print("Model 2 Score: %f" % (logloss(Y_test, pred),))

        if toWrite:
            # Pickle data is binary: text mode fails on Python 3 and can
            # corrupt the file on Windows. `with` closes the file on error.
            with open('model2/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Ejemplo n.º 19
0
def test_multinomial_binary_probabilities():
    """Multinomial LR on a binary problem must yield probabilities consistent with its decision function."""
    X, y = make_classification()
    clf = LogisticRegression(multi_class='multinomial', solver='saga')
    clf.fit(X, y)

    scores = clf.decision_function(X)
    actual = clf.predict_proba(X)

    # Expected probability of class 1: exp(d) / (exp(d) + exp(-d)).
    positive = np.exp(scores)
    negative = np.exp(-scores)
    class_1 = positive / (positive + negative)
    expected = np.c_[1 - class_1, class_1]

    assert_almost_equal(actual, expected)
Ejemplo n.º 20
0
def test_predict_iris():
    """Test logistic regression with the iris dataset"""
    n_samples = iris.data.shape[0]

    # Use the species names (strings) rather than the integer codes as targets.
    species = iris.target_names[iris.target]
    model = LogisticRegression(C=len(iris.data))
    model.fit(iris.data, species)
    assert_array_equal(np.unique(species), model.classes_)

    # Hard predictions must be right on more than 95% of the samples.
    hard_pred = model.predict(iris.data)
    assert_greater(np.mean(hard_pred == species), .95)

    # Each row of probabilities must sum to one.
    proba = model.predict_proba(iris.data)
    assert_array_almost_equal(proba.sum(axis=1), np.ones(n_samples))

    # The most probable class must be right equally often.
    argmax_pred = iris.target_names[proba.argmax(axis=1)]
    assert_greater(np.mean(argmax_pred == species), .95)
Ejemplo n.º 21
0
def test_predict_iris():
    """Test logistic regression with the iris dataset"""
    sample_count = iris.data.shape[0]
    labels = iris.target_names[iris.target]

    classifier = LogisticRegression(C=len(iris.data))
    classifier.fit(iris.data, labels)
    assert_array_equal(np.unique(labels), classifier.classes_)

    # Accuracy of predict() on the training data.
    predicted = classifier.predict(iris.data)
    accuracy = np.mean(predicted == labels)
    assert_greater(accuracy, .95)

    # predict_proba() rows are proper distributions ...
    class_proba = classifier.predict_proba(iris.data)
    row_totals = class_proba.sum(axis=1)
    assert_array_almost_equal(row_totals, np.ones(sample_count))

    # ... and their arg-max is accurate as well.
    most_likely = iris.target_names[class_proba.argmax(axis=1)]
    assert_greater(np.mean(most_likely == labels), .95)
Ejemplo n.º 22
0
def train_and_predics(featuredicts, labels, trainsize):
    """Train a maxent model on the first ``trainsize`` items and predict the rest.

    Args:
        featuredicts: sequence of feature dicts, training items first.
        labels: sequence of labels; only the first ``trainsize`` are used.
        trainsize: number of leading items used for training.

    Returns:
        A list with one tab-separated line per test item: the predicted label
        followed by every class probability formatted with two decimals. No
        header line is emitted.
    """
    vec = DictVectorizer()
    y_train = labels[:trainsize]
    X_train = vec.fit_transform(featuredicts[:trainsize])
    X_test = vec.transform(featuredicts[trainsize:])
    maxent = LogisticRegression(penalty='l2')
    maxent.fit(X_train, y_train)

    probabilities = maxent.predict_proba(X_test)
    predicted_labels = maxent.predict(X_test)
    # str() keeps the join working when the labels are not strings.
    return [
        "\t".join([str(label)] + ["{0:.2f}".format(p) for p in row])
        for row, label in zip(probabilities, predicted_labels)
    ]
Ejemplo n.º 23
0
def test_nnet(n_samples=200, n_features=5, distance=0.5, complete=False):
    """Smoke-test the nnet classifiers on two well-separated blobs.

    Args:
        n_samples: number of samples to generate.
        n_features: dimensionality of the two blob centres.
        distance: each centre sits at +/- ``distance`` on every axis.
        complete: if True, every (loss, network type, trainer) combination
            is trained and its AUC printed, then the test fails on purpose so
            the output is shown. Otherwise four rotating combinations are
            trained and each must reach a training AUC above 0.8.
    """
    X, y = make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=[numpy.ones(n_features) * distance, -numpy.ones(n_features) * distance],
    )

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MultiLayerNetwork,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42)
                    nn.fit(X, y, epochs=100)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types, trainers, most of them are used once during tests.
        attempts = 4
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        # dict views cannot be indexed on Python 3, so materialise the keys.
        loss_names = list(nnet.losses.keys())
        trainer_names = list(nnet.trainers.keys())
        for attempt in range(attempts):
            loss = loss_names[(attempt + losses_shift) % len(loss_names)]
            trainer = trainer_names[(attempt + trainers_shift) % len(trainer_names)]

            nn_type = nn_types[attempt % len(nn_types)]

            nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42)
            print(nn)
            nn.fit(X, y, epochs=200)
            assert roc_auc_score(y, nn.predict_proba(X)[:, 1]) > 0.8, "quality of model is too low: {}".format(nn)
Ejemplo n.º 24
0
class WebsiteMatchConfidencePredictor(object):
    """Logistic-regression model scoring whether a scraped website matches a url."""

    def __init__(self):
        self.model = LogisticRegression()

    def fit(self, urls, websites, y):
        """
        Fit the model on one feature vector per (url, website) pair.

        :param urls: list of urls
        :param websites: list of corresponding scraped websites
        :param y: list of corresponding booleans - matches or not
        """
        X = [make_features(url, web) for url, web in zip(urls, websites)]
        self.model.fit(X, y)

    def predict(self, url, website):
        """
        Return the class probabilities for a single (url, website) pair.

        :param url: the url to score
        :param website: the corresponding scraped website
        :return: the ``predict_proba`` output for a one-row input
        """
        features = make_features(url, website)
        # fit() trains on a list of feature vectors, so one sample has to be
        # wrapped into a one-row batch; a bare 1-D vector is rejected by
        # current scikit-learn.
        return self.model.predict_proba([features])
Ejemplo n.º 25
0
def predictTestSet():
	"""Fit a maxent model on the training rows of the combined file and score the remaining rows.

	Features are collected twice: from the training file alone (only its
	length is used) and from the combined train+test file, whose leading rows
	are taken as the training set.
	"""
	# Training features and labels; only their lengths are used below.
	trainfile='/home/natschluter/GroupAlgorithms/cwi2016/data/cwi_training/cwi_training_cat.lbl.conll'
	trainfeatures, trainlabels, vec = feats_and_classify_py2.collect_features(trainfile)

	# Features for training and test data together.
	bothfiles='/home/natschluter/GroupAlgorithms/cwi2016/data/train_and_test1.conll'
	bothfeatures, bothlabels, bothvec = feats_and_classify_py2.collect_features(bothfiles)

	# Median of a fixed list of thresholds (not used further in this function).
	threshold_values = [ 0.145,  0.85,   0.12,   0.657,  0.71,   0.824,  0.506,  0.461,  0.662,  0.888]
	thresholds_med=np.median(np.array(threshold_values))

	# Row selections: the leading rows of the combined data are the training set.
	train_rows = np.array(range(len(trainfeatures)))
	TrainX=bothfeatures[train_rows]
	label_rows = np.array(range(len(trainlabels)))
	TrainY=bothlabels[label_rows]
	test_rows = np.array(range(len(trainfeatures),len(bothfeatures)))
	TestX=bothfeatures[test_rows]

	maxent = LogisticRegression(penalty='l2')
	print('training...')
	maxent.fit(TrainX,TrainY)
	print('predicting...')
	ypred_probs=maxent.predict_proba(TestX)
Ejemplo n.º 26
0
class Ensemble:
    """Stacking ensemble with a configurable number of CV folds.

    The meta estimator is trained on the first ``predict_proba`` column of
    every base estimator.
    """

    def __init__(self, base_estimators=None, random_state=0, cv=3):
        self.base_estimators = base_estimators
        self.estimator = MetaEstimator()
        self.random_state = random_state
        self.fit_cv = cv

    def fit(self, X, y):
        """Fit the meta estimator on out-of-fold predictions, then refit every base estimator on all data."""
        folds = KFold(n_splits=self.fit_cv,
                      shuffle=True,
                      random_state=self.random_state)
        first_columns = []
        for base in self.base_estimators:
            label = base.__class__.__name__
            log(0x25, 'cross_val_predict start', label)
            out_of_fold = cross_val_predict(base, X, y, cv=folds,
                                            method='predict_proba')
            log(0x25, 'cross_val_predict end', label)
            log(0x25, 'CV Score', label, check_result(y, out_of_fold))
            first_columns.append(out_of_fold[:, 0])

        # One row per sample, one column per base estimator.
        meta_features = np.array(first_columns).T
        self.estimator.fit(meta_features, y)

        for base in self.base_estimators:
            label = base.__class__.__name__
            log(0x25, 'fit start', label)
            base.fit(X, y)
            log(0x25, 'fit end:', label)

    def predict(self, X, margin):
        """Return whether the first probability column exceeds ``margin``, per sample."""
        first_column = self.predict_proba(X)[:, 0]
        return np.array(first_column) > margin

    def predict_proba(self, X):
        """Return the meta estimator's probabilities for the stacked base outputs."""
        first_columns = [
            base.predict_proba(X)[:, 0] for base in self.base_estimators
        ]
        return self.estimator.predict_proba(np.array(first_columns).T)
def main():
    """Train one logistic-regression model per feature variant and print a TSV.

    For each variant ("bcd", "cd") the training features are vectorized with
    a fresh ``DictVectorizer``, a model is fitted, and labels/probabilities
    are predicted for the dump file.  The result is printed to stdout as a
    tab-separated table whose columns follow the printed header.
    """
    parser = argparse.ArgumentParser(description="""Export AMT""")
    parser.add_argument('--input', default="../res/dga_extendedamt_simplemajority.tsv")
    parser.add_argument('--dump_to_predict', default="../res/dga_data_october2016.tsv")
    parser.add_argument('--embeddings', default="/Users/hmartine/data/glove.6B/glove.6B.50d.txt")
    args = parser.parse_args()

    E = load_embeddings(args.embeddings)

    predarrays = {}
    variants = ["bcd", "cd"]
    for variant in variants:
        # The vectorizer returned by collect_features is not used; a fresh
        # one is fitted below on the training feature dicts.
        trainfeatures, labels, _ = collect_features(
            args.input, embeddings=E, variant=variant, vectorize=False)
        # NOTE(review): `trainingbow` is not defined in this function; it is
        # presumably a module-level name -- confirm it exists.
        dumpfeatdicts = features_from_dump(
            args.dump_to_predict, variant=variant, embeddings=E,
            bowfilter=trainingbow)

        vec = DictVectorizer()
        X_train = vec.fit_transform(trainfeatures)
        X_test = vec.transform(dumpfeatdicts)

        maxent = LogisticRegression(penalty='l2')
        maxent.fit(X_train, labels)

        predarrays[variant + "_pred_label"] = [
            "SAME" if x == 0 else "OMISSION" for x in maxent.predict(X_test)
        ]
        # Second column of predict_proba, formatted with two significant digits.
        predarrays[variant + "_pred_prob"] = [
            '{:.2}'.format(y) for x, y in maxent.predict_proba(X_test)
        ]

    frame = read_dump(args.dump_to_predict)

    header = "Index Ref TitleRef URLRef Target TitleTarget URLTarget Source Contains BCD_label BCD_prob CD_label CD_prob".replace(" ", "\t")
    print(header)

    # Columns are listed in exactly the order announced by the header
    # (previously `Target` was emitted right after `Ref`, shifting the
    # TitleRef/URLRef/Target cells under the wrong headings).
    columns = [
        frame.Ref, frame.TitleRef, frame.URLRef,
        frame.Target, frame.TitleTarget, frame.URLTarget,
        frame.Source, frame.Contains,
    ]
    for variant in variants:
        columns.append(predarrays[variant + "_pred_label"])
        columns.append(predarrays[variant + "_pred_prob"])

    for index, row in enumerate(zip(*columns)):
        # str() keeps the join working for non-string cells.
        print("\t".join([str(index)] + [str(cell) for cell in row]))
Ejemplo n.º 28
0
	def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
		'''
		Build the GBDT + logistic-regression combination model.
		Args:
			X_train_d: discrete-feature training data
			X_train_c: continuous-feature training data
			X_test_d: discrete-feature test data
			X_test_c: continuous-feature test data
			y_train: training labels {-1, 1}
			y_test: test labels {-1, 1}
		Returns:
			gbc_enc: OneHotEncoder fitted on the GBDT leaf indices
			gbc: the GBDT model
			comb_model: the trained combination model
			threshold: decision threshold; Pred_Prob >= threshold is positive, otherwise negative
			comb_model_auc: AUC of the combination model
			precision: precision at the chosen threshold
			recall: recall at the chosen threshold
		'''
		gbdt_kwargs = dict(
			n_estimators=self._n_estimators,
			learning_rate=self._gbdt_learning_rate,
			max_depth=self._max_depth,
		)
		# A seed is only forwarded when one was configured.
		if self._random_state is not None:
			gbdt_kwargs['random_state'] = self._random_state
		gbc = GradientBoostingClassifier(**gbdt_kwargs).fit(X_train_c, y_train)

		# Leaf index reached in every tree, for train and test rows.
		train_leaves = gbc.apply(X_train_c)[:, :, 0]
		test_leaves = gbc.apply(X_test_c)[:, :, 0]
		n_train = train_leaves.shape[0]

		# The encoder is fitted on train and test leaves together.
		stacked_leaves = np.concatenate([train_leaves, test_leaves], axis = 0)
		gbc_enc = OneHotEncoder().fit(stacked_leaves)
		encoded = gbc_enc.transform(stacked_leaves)

		# Encoded leaves are combined with the discrete features.
		X_train_ext = hstack([encoded[:n_train, :], X_train_d])
		X_test_ext = hstack([encoded[n_train:, :], X_test_d])
		log.debug("Combine features done.")

		comb_model = LogisticRegression().fit(X_train_ext, y_train)
		log.debug("Training done.")

		comb_model_pred = comb_model.predict_proba(X_test_ext)[:, 1]
		precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)
		ap = average_precision_score(y_test, comb_model_pred)

		# Count the curve points whose recall reaches the configured rate and
		# use the last of them as the operating point.
		meets_recall = recall >= self._recall_rate
		picked = sum(1 for flag in meets_recall if flag) - 1
		threshold = thresholds[picked]
		picked_precision = precision[picked]
		picked_recall = recall[picked]
		log.debug("threshold: %f - precision: %f - recall: %f", threshold, picked_precision, picked_recall)

		comb_model_auc = roc_auc_score(y_test, comb_model_pred)
		log.debug("AUC score is: %f", comb_model_auc)
		return gbc_enc, gbc, comb_model, threshold, comb_model_auc, picked_precision, picked_recall
Ejemplo n.º 29
0
    def regress_on_words(self, word_index, X):
        """
        Fit a logistic regression that predicts whether column ``word_index``
        equals 1, using the unmodified rows of ``X`` as features.

        word_index: position of the word of interest within each row
        X: iterable of rows, also used as the feature matrix
        returns: last column of ``predict_proba`` evaluated on ``X``
        """
        # 1 where the word column equals 1, otherwise 0.
        targets = [1 if row[word_index] == 1 else 0 for row in X]

        model = LogisticRegression()
        model.fit(X, targets)

        return model.predict_proba(X)[:, -1]
Ejemplo n.º 30
0
def validate_model(X, y, N, digit, classifier):
    """
    Run 5-fold cross validation and report the per-fold and per-label results.

    :param X: nparray, one row is one sample
    :param y: nparray, labels (used directly as list and column indices)
    :param N: number of CPU cores to use
    :param digit: digit of the captcha
    :param classifier: 'Logistic' selects logistic regression, anything else
                       a random forest
    :return: None
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=1234567)
    fold_r = fold_result()
    labels = np.unique(y)

    # One result collector per label, stored at the label's own index.
    category_rs = [None] * len(labels)
    for label in labels:
        category_rs[label] = category_result()

    for train_idx, test_idx in splitter.split(X):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        clf = (LogisticRegression(solver='sag', n_jobs=N)
               if classifier == 'Logistic'
               else RandomForestClassifier(n_estimators=200,
                                           random_state=1234567,
                                           n_jobs=N))
        clf.fit(X_train, y_train)
        probas = clf.predict_proba(X_test)

        fold_r.append(clf, X_train, y_train, X_test, y_test)
        for label in labels:
            category_rs[label].append(y_test, probas[:, label], label)

    fold_r.print_score(digit)
    # Print and save the per-label results.
    for label in labels:
        category_rs[label].print_result(label, digit)
Ejemplo n.º 31
0
def main():
    """Fit a logistic regression on an 80/20 split of ``df_train`` and print its metrics.

    Prints the split shapes, the mean squared error, log loss and ROC AUC of
    the class-1 probability on the hold-out part, then calls ``predict`` with
    the fitted classifier.
    """
    non_feature_columns = ['cust_id', 'y', 'cust_group']
    X = df_train.drop(non_feature_columns, axis=1, inplace=False)
    y = df_train['y']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    print(X_train.shape, X_test.shape)

    clf = LogisticRegression(C=1.0, max_iter=100, random_state=10)

    print("===="*20)
    clf.fit(X_train, y_train)
    prob = clf.predict_proba(X_test)
    pred = np.argmax(prob, axis=1)
    positive_prob = prob[:, 1]
    print("mean_squared_error:", mean_squared_error(y_test, positive_prob))
    print("log_loss:", log_loss(y_test.astype(int), positive_prob))
    print("roc_auc_score:", roc_auc_score(y_test, positive_prob))

    predict(clf)
Ejemplo n.º 32
0
def gbdt_lr_clf(X_train_data,Y_train_data):
    """Cross-validate GBDT + LR for several tree counts and pick the best one.

    For every candidate ``n_estimators`` a 5-fold stratified split is run.
    Within each fold the training part is halved: one half fits the GBDT and
    the one-hot encoder of its leaf indices, the other half fits the logistic
    regression on the encoded leaves.  Precision, recall and F1 are computed
    on the fold's held-out part with a 0.5 probability cut-off.

    :param X_train_data: feature array, indexable by integer index arrays
    :param Y_train_data: label array, indexable by integer index arrays
    :return: (recalls, precisions, f1 scores, best n_estimators) where the
             three lists hold one value per fold for every candidate, in
             evaluation order, and the best candidate is the one with the
             highest mean F1
    """
    n_estimators = [10, 20, 30, 40, 50]
    estimator_best = 0
    # Start below any possible F1 so that the first candidate is always
    # accepted and the best mean is tracked from there on.
    f1_best = -1.0
    gblr_p_t = []
    gblr_r_t = []
    gblr_f1_t = []
    # shuffle=False gives a deterministic split; passing random_state in
    # that case is rejected by current scikit-learn, so it is omitted.
    stratified_folder = StratifiedKFold(n_splits=5, shuffle=False)
    for item in n_estimators:
        grd = ensemble.GradientBoostingClassifier(n_estimators=item)
        print('gbdt_classifier + LR:')
        gblr_p = []
        gblr_r = []
        gblr_f1 = []
        for X_train_index, X_test_index in stratified_folder.split(X_train_data, Y_train_data):
            X_train, X_train_lr, y_train, y_train_lr = train_test_split(
                X_train_data[X_train_index], Y_train_data[X_train_index], test_size=0.5)
            grd_enc = OneHotEncoder()
            grd_lm = LogisticRegression()
            grd.fit(X_train, y_train)
            grd_enc.fit(grd.apply(X_train)[:, :, 0])
            grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

            y_true = Y_train_data[X_test_index]
            y_prob = grd_lm.predict_proba(
                grd_enc.transform(grd.apply(X_train_data[X_test_index])[:, :, 0]))[:, 1]
            y_pred_grd_lm = (y_prob >= 0.5) * 1

            gblr_p_tmp = precision_score(y_true, y_pred_grd_lm)
            gblr_p.append(gblr_p_tmp)
            gblr_p_t.append(gblr_p_tmp)
            gblr_r_tmp = recall_score(y_true, y_pred_grd_lm)
            gblr_r.append(gblr_r_tmp)
            gblr_r_t.append(gblr_r_tmp)
            gblr_f1_tmp = f1_score(y_true, y_pred_grd_lm)
            gblr_f1.append(gblr_f1_tmp)
            gblr_f1_t.append(gblr_f1_tmp)

        f1_mean = sum(gblr_f1) / len(gblr_f1)
        print("n_estimators:%f,gblr_p:%f,gblr_r:%f,gblr_f1:%f" %
              (item, sum(gblr_p) / len(gblr_p), sum(gblr_r) / len(gblr_r), f1_mean))
        if f1_best < f1_mean:
            # Remember the score as well; previously only the candidate was
            # stored, so every later candidate with a non-zero F1 replaced it.
            f1_best = f1_mean
            estimator_best = item
    return gblr_r_t, gblr_p_t, gblr_f1_t,estimator_best
Ejemplo n.º 33
0
def GBDT_LR_test(X_train,Y_train,X_test,glr_norm,dim_para):
    '''
    Train the GBDT + LR model and return the card numbers predicted positive.

    :param X_train: train data
    :param Y_train: train label
    :param X_test:  test data; the card number is in the first column and the
                    features start at the second column
    :param glr_norm: 0: no normalisation  1: scale to (0,1)  2: standardise
    :param dim_para: PCA dimensionality reduction; 0: none, otherwise: reduce
    :return: card numbers (first column of X_test) of the rows predicted positive
    '''

    print("GBDT_LR_model:")
    X_test_org = X_test
    X_test = X_test_org[:, 1:]

    # `glr_norm == 1 or 2` was always true, so data were normalised even
    # when glr_norm == 0 asked for no normalisation.
    if glr_norm in (1, 2):
        X_train, X_test = norm_data(X_train, X_test, glr_norm)

    if dim_para != 0:
        X_train, X_test = dim_reduction(X_train, X_test, dim_para)

    # Only the selected tree count is used; the per-fold score lists are not.
    _, _, _, estimator_best = gbdt_lr_clf(X_train, Y_train)
    grd = ensemble.GradientBoostingClassifier(n_estimators = estimator_best)
    # One half trains the GBDT and its leaf encoder, the other half the LR.
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, Y_train, test_size=0.5)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
    gblr_predictions = grd_lm.predict_proba(grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
    gblr_label = (gblr_predictions>=0.5) * 1
    print("gbdt_lr predict positive label number: %d" % (sum(gblr_label == 1)))
    card_list = X_test_org[gblr_label == 1,0]
    return card_list
def main():
    """Train a logistic regression on one annotator's data and write test predictions.

    Train and test feature dicts are vectorized together so both share one
    feature space.  Every test item whose second ``predict_proba`` column
    reaches ``--threshold`` is written as "1", all others as "0", one per
    line, to ``<annotator>.pred``.  The process then exits with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(
        description=
        "Skeleton for features and classifier for CWI-2016--optimisation of threshhold"
    )
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--annotator', type=str, default="03")
    parser.add_argument('--penalty',
                        type=str,
                        choices=["l1", "l2"],
                        default="l1")

    args = parser.parse_args()
    current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_" + args.annotator + ".lbl.conll"
    testfile = scriptdir + "/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X__dict_train, y_train, v_train = feats_and_classify.collect_features(
        current_single_ann, vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(
        testfile, vectorize=False)

    # Fit one vectorizer on train + test, then split the rows back apart.
    featdicts = list(X__dict_train + X_dict_test)
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train = X[:len(y_train)]
    X_test = X[len(y_train):]

    maxent = LogisticRegression(penalty=args.penalty)
    maxent.fit(X_train, y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    ypred_i = [
        "1" if pair[1] >= args.threshold else "0" for pair in y_pred_proba
    ]
    # The context manager closes the file even if writing fails.
    with open(args.annotator + ".pred", mode="w") as fout:
        print("\n".join(ypred_i), file=fout)
    sys.exit(0)
Ejemplo n.º 35
0
def test_nnet(n_samples=200, n_features=5, distance=0.5):
    """Fit every network type / loss / trainer combination on two blobs and print ROC AUC.

    A logistic regression fitted on the same data is printed last for
    comparison.
    """
    # Two opposite centres at +distance and -distance in every coordinate.
    # n_features now follows the parameter instead of a hard-coded 5, so it
    # always agrees with the width of the centres.
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=[numpy.ones(n_features) * distance, - numpy.ones(n_features) * distance])

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MultiLayerNetwork,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    for loss in nnet.losses:
        for NNType in nn_types:
            for trainer in nnet.trainers:
                nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42)
                nn.fit(X, y, stages=100, verbose=nnet.SILENT)
                print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

    lr = LogisticRegression().fit(X, y)
    print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

    # NOTE(review): this always fails -- presumably on purpose so the test
    # runner displays the printed scores; confirm before relying on it.
    assert 0 == 1
Ejemplo n.º 36
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc

# Train a TF-IDF + logistic-regression classifier on sms.csv and plot its ROC curve.
df = pd.read_csv('sms.csv')
# Default train/test split of the 'message' texts and their 'label' values.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['message'], df['label'])
# The vectorizer is fitted on the training texts only and reused for the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict_proba(X_test)
# The second probability column is used as the ROC score.
false_positive_rate, recall, thresholds = roc_curve(y_test, predictions[:, 1])
roc_auc = auc(false_positive_rate, recall)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out')
plt.show()
Ejemplo n.º 37
0
def main():
    """Preprocess the income data, compare three classifiers by AUC and export predictions.

    Operates on the module-level ``train`` and ``test`` frames (both are
    rebound here after dropping columns).  Steps: split off the target
    ``Y``, rename the columns, encode/bin the features, train a logistic
    regression, a decision tree and a random forest on the same 75/25 split,
    plot their ROC curves into ``./count_auc.png``, and use the model with
    the highest AUC to score ``test``.  Results go to ``Results_1.csv`` and
    ``./my_results.csv``.
    """
    global train
    global test
    # Keep the target column aside and remove it from the training features.
    y = train['Y']
    del train['Y']

    # Rename the column headers.
    origin = [
        "年龄", "工作天数", "职业类型", "投资收入", "投资损失", "省份", "教育", "家庭角色", "婚姻状况",
        "教育时间", "民族", "工作情况", "性别"
    ]
    target = [
        "age", "work_days", "job", "invest_income", "invest_loss", "province",
        "education", "home_role", "marital_status", "education_time", "nation",
        "work_type", "gender"
    ]
    rename_dict = dict(zip(origin, target))
    train.rename(columns=rename_dict, inplace=True)
    test.rename(columns=rename_dict, inplace=True)

    # Report missing values.
    print("===================统计缺失数据-训练集====================")
    print(train.isnull().sum(axis=0))
    print(train.isnull().any())
    print("===================统计缺失数据-测试集====================")
    print(test.isnull().sum(axis=0))
    print(test.isnull().any())

    full_data = [train, test]
    # Encode gender as a number.
    for dataset in full_data:
        dataset['gender'] = dataset['gender'].map({'女': 0, '男': 1}).astype(int)

    # Net investment result, binned into five classes.  All masks are taken
    # from the raw value BEFORE any code is written: assigning the codes one
    # after another on the same column re-matched the small codes 0 and 1
    # against the later "0 < x <= 5000" rule and turned them all into 2.
    for dataset in full_data:
        net = dataset['invest_income'] - dataset['invest_loss']
        invest_bins = [
            (net < 0, 0),
            (net == 0, 1),
            ((net > 0) & (net <= 5000), 2),
            ((net > 5000) & (net <= 10000), 3),
            (net > 10000, 4),
        ]
        dataset['invest'] = net
        for mask, code in invest_bins:
            dataset.loc[mask, 'invest'] = code

    # Turn the province name into a number (numeric suffix divided by 2).
    for dataset in full_data:
        dataset['province'] = np.array([
            int(province_name.replace("省份", "")) / 2
            for province_name in dataset['province']
        ])

    # Categorical features to dummy variables.
    dumb_columns('job')  # job type
    dumb_columns('education')  # education
    dumb_columns('nation')  # ethnicity
    dumb_columns('home_role')  # role in the family
    dumb_columns('marital_status')  # marital status
    dumb_columns('work_type')  # employment situation

    # Age in five classes.  The codes 0-4 are all <= 22 and the first rule
    # runs first, so the sequential assignment is safe here.
    for dataset in full_data:
        dataset.loc[dataset['age'] <= 22, 'age'] = 0
        dataset.loc[(dataset['age'] > 22) & (dataset['age'] <= 32), 'age'] = 1
        dataset.loc[(dataset['age'] > 32) & (dataset['age'] <= 48), 'age'] = 2
        dataset.loc[(dataset['age'] > 48) & (dataset['age'] <= 64), 'age'] = 3
        dataset.loc[dataset['age'] > 64, 'age'] = 4

    # Scale the working days down so the feature ranges differ less.
    for dataset in full_data:
        dataset['work_days'] = dataset['work_days'] / 10

    # Drop the columns that are no longer needed.
    drop_elements = ['invest_income', 'invest_loss', 'education']
    train = train.drop(drop_elements, axis=1)
    test = test.drop(drop_elements, axis=1)

    # Show the resulting features.
    print(train.head(3))
    print("===")
    print(test.head(3))

    # One deterministic split shared by all three models (the same arguments
    # and seed were previously passed three times, giving the same split).
    x_train_df, x_test_df, y_train, y_test = train_test_split(train,
                                                              y,
                                                              test_size=0.25,
                                                              random_state=24)

    # Each entry: fitted model, its AUC, and the vectorizer (or None) that
    # must be applied to a frame before the model can score it.
    model_list = []

    def evaluate(model, features, vectorizer, zh_name, title, color,
                 legend_name):
        """Print report/accuracy/AUC for `model`, record it and draw its ROC curve."""
        print(
            "精确率和召回率({}):".format(zh_name),
            classification_report(y_test,
                                  model.predict(features),
                                  labels=[0, 1],
                                  target_names=["非高收入", "高收入"]))
        pre_score = model.score(features, y_test)
        print("准确率({}):{}".format(zh_name, pre_score))
        predictions = model.predict_proba(features)
        fpr, tpr, _ = metrics.roc_curve(y_test, predictions[:, 1])
        auc_value = metrics.auc(fpr, tpr)
        print("auc值为:{}".format(auc_value))
        model_list.append({
            "model": model,
            "auc": auc_value,
            "vectorizer": vectorizer
        })
        # All curves are drawn on the same figure; the title is overwritten
        # by each call.
        plt.title(title)
        plt.plot(fpr, tpr, color,
                 label='%s = %0.3f' % (legend_name, auc_value))
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.ylabel('tpr')
        plt.xlabel('fpr')

    # ===================== logistic regression =====================
    logic = LogisticRegression()
    logic.fit(x_train_df, y_train)
    evaluate(logic, x_test_df, None, "逻辑回归", 'LogisticRegression AUC', 'r',
             'AUC_LOGIC')

    # Tree models are trained on DictVectorizer output.
    dc = DictVectorizer(sparse=False)
    x_train = dc.fit_transform(x_train_df.to_dict(orient="records"))
    features = dc.get_feature_names()
    x_test = dc.transform(x_test_df.to_dict(orient="records"))

    # ===================== decision tree =====================
    dec = DecisionTreeClassifier(max_depth=4)
    dec.fit(x_train, y_train)
    # Save the tree; render with: dot -Tpng -o tree.png tree.dot
    export_graphviz(dec, out_file="./tree.dot", feature_names=features)
    evaluate(dec, x_test, dc, "决策树", 'DecisionTreeClassifier AUC', 'b',
             'AUC_DTC')

    # ===================== random forest =====================
    rf = RandomForestClassifier(n_estimators=5)
    rf.fit(x_train, y_train)
    evaluate(rf, x_test, dc, "随机森林", 'RandomForestClassifier AUC', 'y',
             'AUC_RF')
    plt.savefig("./count_auc.png")

    # Pick the model with the highest AUC (first one wins on ties).
    best = max(model_list, key=lambda entry: entry['auc'])
    model = best['model']
    auc_v = best['auc']

    print("选择模型 {}".format(model))
    print("AUC值为 {}".format(auc_v))
    # Feed the model the same representation it was trained on: the tree
    # models expect the vectorizer's column layout, not the raw frame.
    if best['vectorizer'] is None:
        test_features = test
    else:
        test_features = best['vectorizer'].transform(
            test.to_dict(orient="records"))
    pre_data = model.predict_proba(test_features)
    # NOTE(review): column 0 is exported while the AUC above is computed on
    # column 1 -- confirm which class probability the output should contain.
    test['Y'] = pre_data[:, 0]
    test['Y'].to_csv('Results_1.csv',
                     encoding='utf-8',
                     index=False,
                     header=False)
    # Full version; `test_origin` is expected to be a module-level frame.
    test_origin['Y'] = pre_data[:, 0]
    test_origin.to_csv("./my_results.csv", encoding='utf-8', index=False)
def train():
    """Fit a logistic regression on the assembled training data.

    Prints the ROC AUC measured on a random 33% hold-out, then refits the
    classifier on all (shuffled) data.

    Returns the fitted classifier together with the ``mean`` and ``std``
    values returned by ``normalize``.
    """
    weather_data = load_weather()
    training_data = load_training()

    X, y = assemble_X_y(training_data, weather_data)
    mean, std = normalize(X)

    # A theano-based NeuralNet configuration used to sit here as an inert
    # string literal: two 256-unit dense layers with 0.4 dropout each, a
    # single sigmoid output, nesterov momentum (learning rate 0.1, momentum
    # 0.9, decayed per epoch), binary cross-entropy, 32 epochs, 10% eval
    # split.  It was never executed.
    clf = LogisticRegression(C = 10)
    X, y = shuffle(X, y, random_state=123)

    # Hold-out evaluation; this split is not seeded.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size = 0.33)
    clf.fit(X_fit, y_fit)
    holdout_scores = clf.predict_proba(X_holdout)[:,1]
    print("ROC score", metrics.roc_auc_score(np.ravel(y_holdout), holdout_scores))

    # Final model: refit on everything.
    print("fitting...")
    clf.fit(X, y)

    return clf, mean, std
Ejemplo n.º 39
0
def train_model(X_train, y_train, X_test, y_test, name, plot=False):
    """
        train_model(X_train, y_train, X_test, y_test, name[, plot=False])

        Trains a logistic regression on the given split, prints the train and
        test accuracy and the test predictions, optionally plots one
        one-vs-rest ROC curve per label, and saves the model to
        'saved_model/model_ceps.pkl'.

        Labels are used directly as column indices into predict_proba and as
        indices into genre_list.

        Returns (train error, test error, array holding the confusion matrix).
    """
    labels = np.unique(y_train)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("train_score:: " + str(train_score))
    print("test_score:: " + str(test_score))

    train_errors = [1 - train_score]
    test_errors = [1 - test_score]

    y_pred = clf.predict(X_test)
    print(y_pred)
    cms = [confusion_matrix(y_test, y_pred)]

    # The probabilities do not depend on the label, so compute them once.
    proba = clf.predict_proba(X_test)
    for label in labels:
        y_label_test = np.asarray(y_test == label, dtype=int)
        fpr, tpr, _ = roc_curve(y_label_test, proba[:, label])
        roc_scores[label].append(auc(fpr, tpr))
        tprs[label].append(tpr)
        fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an int on Python 3 as well.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    # The former precision/recall "summary" was computed from an always-empty
    # dict, never used, and raised on Python 3; it has been removed.

    # Save the trained model to disk.
    joblib.dump(clf, 'saved_model/model_ceps.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def train_model(X,
                Y,
                name,
                plot=False,
                outModelName=outModelName,
                testSize=0.3):
    """
    train_model(vector, vector, name[, plot=False])
    Trains a logistic regression on one shuffled split and saves it to disk.

    Labels are used directly as column indices into predict_proba and as
    indices into genre_list.

    Parameters
    ----------
    X, Y : samples and labels, indexable by integer index arrays
    name : prefix for the ROC plot descriptions
    plot : when true, plot one one-vs-rest ROC curve per label
    outModelName : path to save the trained model (*.pkl); nothing is saved
        when it is empty
    testSize : fraction of the data used for testing

    Returns
    -------
    np.mean(train_errors)
    np.mean(test_errors)
    np.asarray(cms)

    """
    labels = np.unique(Y)

    # NOTE(review): this is the legacy ShuffleSplit signature (n / n_iter,
    # directly iterable); it is kept because the ShuffleSplit imported by
    # this file is not visible here.
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=testSize, random_state=0)

    train_errors = []
    test_errors = []

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    cms = []

    for train_idx, test_idx in cv:
        X_train, y_train = X[train_idx], Y[train_idx]
        X_test, y_test = X[test_idx], Y[test_idx]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_errors.append(1 - clf.score(X_train, y_train))
        test_errors.append(1 - clf.score(X_test, y_test))

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

        # The probabilities do not depend on the label, so compute them once
        # per split.
        proba = clf.predict_proba(X_test)
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            fpr, tpr, _ = roc_curve(y_label_test, proba[:, label])
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an int on Python 3 as well.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    # The former precision/recall "summary" was computed from an always-empty
    # dict, never used, and raised on Python 3; it has been removed.

    # Save the classifier of the (single) split to disk.
    if outModelName:
        joblib.dump(clf, outModelName)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Ejemplo n.º 41
0
def test_fit_credit_backupsklearn():
    """Compare h2o4gpu's LogisticRegression wrapper with scikit-learn's.

    Loads the credit-card CSV (last column is the target), exercises the
    public methods of both implementations while printing their results, and
    asserts that coefficients, intercept, iteration count and predictions
    agree.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    target_col = df.shape[1] - 1
    X = np.array(df.iloc[:, :target_col], dtype='float32', order='C')
    y = np.array(df.iloc[:, target_col], dtype='float32', order='C')
    Solver = h2o4gpu.LogisticRegression

    def show_calls(model, fit_label, labelled_calls):
        # Print the fit label, fit the model, then print each label followed
        # by the result of the named method call.
        print(fit_label)
        model.fit(X, y)
        for label, method_name, call_args in labelled_calls:
            print(label)
            print(getattr(model, method_name)(*call_args))

    # Native h2o4gpu solver.
    enet_h2o4gpu = Solver(glm_stop_early=False)
    show_calls(enet_h2o4gpu, "h2o4gpu fit()", [
        ("h2o4gpu predict()", "predict", (X,)),
        ("h2o4gpu score()", "score", (X, y)),
    ])

    # Same hyper-parameters for the wrapper and for scikit-learn.
    shared_kwargs = dict(dual=True, max_iter=100, tol=1E-4,
                         intercept_scaling=0.99, random_state=1234)

    enet = Solver(**shared_kwargs)
    show_calls(enet, "h2o4gpu scikit wrapper fit()", [
        ("h2o4gpu scikit wrapper predict()", "predict", (X,)),
        ("h2o4gpu scikit wrapper predict_proba()", "predict_proba", (X,)),
        ("h2o4gpu scikit wrapper predict_log_proba()", "predict_log_proba", (X,)),
        ("h2o4gpu scikit wrapper score()", "score", (X, y)),
        ("h2o4gpu scikit wrapper decision_function()", "decision_function", (X,)),
        ("h2o4gpu scikit wrapper densify()", "densify", ()),
        ("h2o4gpu scikit wrapper sparsify", "sparsify", ()),
    ])

    from sklearn.linear_model.logistic import  LogisticRegression
    enet_sk = LogisticRegression(**shared_kwargs)
    show_calls(enet_sk, "Scikit fit()", [
        ("Scikit predict()", "predict", (X,)),
        ("Scikit predict_proba()", "predict_proba", (X,)),
        ("Scikit predict_log_proba()", "predict_log_proba", (X,)),
        ("Scikit score()", "score", (X, y)),
        ("Scikit decision_function()", "decision_function", (X,)),
        ("Scikit densify()", "densify", ()),
        ("Sciki sparsify", "sparsify", ()),
    ])

    # Dense float32 copy of the scikit-learn coefficients for the comparison.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    for value in (enet_sk.coef_, enet_sk_coef, enet.coef_, enet_sk.intercept_):
        print(value)

    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
Ejemplo n.º 42
0
def train_model(X, Y, name, plot=False):
    """
    Train a LogisticRegression on a shuffled train/test split and save it.

    For every split yielded by ShuffleSplit (a single split with 30% of the
    samples held out, random_state=0) a fresh classifier is fitted, its
    train/test error and confusion matrix are recorded, and a one-vs-rest
    ROC curve is computed for each distinct label in Y.  The classifier
    fitted on the last split is written to 'saved_model/model_ceps.pkl'.

    Args:
        X: feature array; indexed with the train/test selections produced
            by the splitter.
        Y: label array aligned with X.
        name: prefix used in the plot description when ``plot`` is true.
        plot: when true, draw the ROC curve of the median-scoring run for
            every label through ``plot_roc_curves``.

    Returns:
        Tuple ``(mean train error, mean test error, confusion matrices)``
        where the confusion matrices are stacked into one numpy array.
    """
    labels = np.unique(Y)
    # Call form prints the same text on Python 2 and Python 3.
    print(labels)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_errors.append(1 - clf.score(X_train, y_train))
        test_errors.append(1 - clf.score(X_test, y_test))

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

        # The probabilities do not depend on the label, so compute them once
        # per split instead of once per label.
        proba = clf.predict_proba(X_test)

        for label in labels:
            # One-vs-rest target: 1 where the sample carries this label.
            y_label_test = np.asarray(y_test == label, dtype=int)
            # NOTE(review): the label value is used directly as the column
            # index, which presumably relies on labels being 0..k-1 -- confirm.
            proba_label = proba[:, label]

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an int on Python 3 as well.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median], desc,
                            tprs[label][median], fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    # Save the trained model (the classifier from the last split) to disk.
    joblib.dump(clf, 'saved_model/model_ceps.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Ejemplo n.º 43
0
def train_reg(reg, clazz, X, X_val, two_class_y, two_class_y_val):
    """
    Fit an L1-penalised logistic regression and score it on validation data.

    Args:
        reg: value passed as ``C`` to LogisticRegression.
        clazz: identifier of the class being trained; only used in the
            progress message.
        X: training features.
        X_val: validation features.
        two_class_y: training targets for this class.
        two_class_y_val: validation targets for this class.

    Returns:
        Tuple ``(model, precision)`` where ``precision`` is whatever
        ``functions.precision`` returns for the validation probabilities
        and the validation targets.
    """
    # %-formatting prints the same space-separated text on Python 2 and 3.
    print('Training clazz %s with C= %s' % (clazz, reg))
    # Keyword arguments: newer scikit-learn releases accept only ``penalty``
    # positionally.  liblinear is named explicitly because it supports the
    # L1 penalty and was the default solver in older releases.
    model = LogisticRegression(penalty='l1', dual=False, C=reg,
                               solver='liblinear')
    model.fit(X, two_class_y)
    precision = functions.precision(model.predict_proba(X_val), two_class_y_val)
    return model, precision
def train_model(clf_factory, X, Y, name, plot=False):
    """
    Train a LogisticRegression on a shuffled train/test split and save it.

    For every split yielded by ShuffleSplit (a single split with 30% of the
    samples held out, random_state=0) a fresh classifier is fitted, its
    train/test error and confusion matrix are recorded, and one-vs-rest
    precision/recall and ROC curves are computed for each distinct label in
    Y.  A tab-separated summary line (mean and standard deviation of the
    test accuracy and of the precision/recall AUCs) is printed, and the
    classifier from the last split is written to
    'saved_model_fft/my_model.pkl'.

    Args:
        clf_factory: accepted for interface compatibility; this
            implementation never calls it and always builds a
            LogisticRegression.
        X: feature array; indexed with the train/test selections produced
            by the splitter.
        Y: label array aligned with X.
        name: prefix used for the plot description when ``plot`` is true.
        plot: when true, the median-scoring run of each label is located;
            the actual plotting calls are currently disabled.

    Returns:
        Tuple ``(mean train error, mean test error, confusion matrices)``
        where the confusion matrices are stacked into one numpy array.

    Side effects:
        Rebinds the module-level name ``clf`` to the last fitted classifier.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit(n=len(X), n_iterations=1, test_fraction=0.3,
                      indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []

    pr_scores = defaultdict(list)
    precisions = defaultdict(list)
    recalls = defaultdict(list)
    thresholds = defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        # Kept global on purpose: other code may read the module-level
        # ``clf`` after training.
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

        # The probabilities do not depend on the label, so compute them once
        # per split instead of once per label.
        proba = clf.predict_proba(X_test)

        for label in labels:
            # One-vs-rest target: 1 where the sample carries this label.
            y_label_test = np.asarray(y_test == label, dtype=int)
            # NOTE(review): the label value is used directly as the column
            # index, which presumably relies on labels being 0..k-1 -- confirm.
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an int on Python 3 as well.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            # Plotting is currently disabled:
            #plot_pr(pr_scores[label][median], desc, precisions[label][median],recalls[label][median], label='%s vs rest' % genre_list[label])
            #plot_roc(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])

    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy would otherwise wrap as a single opaque object.
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    # Save the trained model (the classifier from the last split) to disk.
    joblib.dump(clf, 'saved_model_fft/my_model.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
    train_visit_df = pd.read_csv("%s/../input/coupon_visit_train.csv" %
                                 script_path)
    test_coupon_df = pd.read_csv("%s/../input/coupon_list_test.csv" %
                                 script_path)
    # create train_df
    train_df = pd.merge(train_visit_df, train_coupon_df,
                        left_on="VIEW_COUPON_ID_hash", right_on="COUPON_ID_hash")
    train_df = pd.merge(train_df, user_df,
                        left_on="USER_ID_hash", right_on="USER_ID_hash")
    # create train feature
    fu_obj = FeatureUnion(transformer_list=feature_list)
    X_train = fu_obj.fit_transform(train_df)
    y_train = train_df["PURCHASE_FLG"]
    assert X_train.shape[0] == y_train.size
    # fit model
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    # create test_df
    test_coupon_df["cross"] = 1
    user_df["cross"] = 1
    test_df = pd.merge(test_coupon_df, user_df, on="cross")
    # create test Feature
    X_test = fu_obj.transform(test_df)
    # predict test data
    predict_proba = clf.predict_proba(X_test)
    pos_idx = np.where(clf.classes_ == True)[0][0]
    test_df["predict"] = predict_proba[:, pos_idx]
    top10_coupon = test_df.groupby("USER_ID_hash").apply(top_merge)
    top10_coupon.name = "PURCHASED_COUPONS"
    top10_coupon.to_csv("submission.csv", header=True)
Ejemplo n.º 46
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc

# Load the SMS data set and split messages/labels into train and test parts
# (train_test_split is called with its default split settings).
df = pd.read_csv('data/sms.csv')
messages, targets = df['message'], df['label']
X_train_raw, X_test_raw, y_train, y_test = train_test_split(messages, targets)

# TF-IDF features: the vectorizer is fitted on the training messages only and
# then applied unchanged to the test messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# Fit the classifier (fit() returns the estimator itself).
classifier = LogisticRegression().fit(X_train, y_train)

# ROC curve from the second probability column; "recall" holds the true
# positive rate returned by roc_curve.
predictions = classifier.predict_proba(X_test)
positive_scores = predictions[:, 1]
false_positive_rate, recall, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(false_positive_rate, recall)

# Draw the curve (blue) together with the diagonal chance line (red, dashed).
auc_caption = 'AUC = %0.2f' % roc_auc
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, recall, 'b', label=auc_caption)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('Fall-out')
plt.ylabel('Recall')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.show()


################# Sample 8 #################
"""
Ejemplo n.º 47
0
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import csv

# Read the CSV: every column but the last is a feature, the last is the label.
data = []
mark = []
# newline='' is what the csv module expects of the file object it reads.
with open('/Users/hhy/Desktop/1/test.csv', 'r', encoding='utf-8_sig',
          newline='') as f:
    for row in csv.reader(f):
        if not row:
            # Blank lines come back as empty rows; row[-1] would fail on them.
            continue
        data.append([float(value) for value in row[:-1]])
        mark.append(float(row[-1]))

# Metrics collected over 10 random 95/5 train/test splits.
auc = []
acc = []
f1 = []
for i in range(10):
    # The split seed changes per iteration; the classifier seed stays fixed.
    X_train, X_test, y_train, y_test = train_test_split(
        data, mark, test_size=0.05, random_state=i)
    clf = LogisticRegression(C=4.8, random_state=1113)
    clf.fit(X_train, y_train)
    y_predict = clf.predict_proba(X_test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, y_predict)  # AUC on the held-out split
    auc.append(test_auc)
    y_pred = clf.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, y_pred))
    f1.append(metrics.f1_score(y_test, y_pred))

# Report the mean of each metric across the splits.
print("acc==", sum(acc) / len(acc))
print("auc==", sum(auc) / len(auc))
print("f1==", sum(f1) / len(f1))
Ejemplo n.º 48
0
from sklearn.metrics import classification_report  
import time  
# Measure the total running time of the script.
start_time = time.time()


path = "E:/Desktop/Image/SVMData/gender_wechat_scale.txt"
x, y = readData(path)

average = 0
testNum = 10
clf = LogisticRegression()
# Call form of print: the statement form is a syntax error on Python 3, and
# the rest of this script already uses print(...).
print(clf)
for i in range(0, testNum):
    # A new random split and a freshly fitted classifier on every round.
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Fraction of test samples whose prediction equals the true label.
    p = np.mean(y_pred == y_test)
    print(p)
    average += p


# Everything below uses the classifier and the split of the LAST round only.
answer = clf.predict_proba(x_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, answer)
# Threshold the probability of the second class at 0.5.
report = answer > 0.5
print(classification_report(y_test, report, target_names = ['neg', 'pos']))
# NOTE(review): the value printed as "average precision" is the mean of the
# per-round match fractions computed above (i.e. accuracy), not precision.
print("average precision:", average/testNum)
print("time spent:", time.time() - start_time)