def run_LR(lmbd, preprocess_method, train_validation_frame, test_frame,
           columns):
    result = {}
    minimum_cv_error = 1
    for lamda in lmbd:
        cv_accuracy = []
        for times in range(5):
            # generate train set and validation set
            train_frame, validation_frame = split_train_set(
                train_validation_frame, columns)
            train_set, validation_set, test_set = transform(
                preprocess_method, train_frame, validation_frame, test_frame,
                columns)

            # Logistic regression
            L2_classify = LogisticRegression(C=1 / lamda,
                                             penalty='l2',
                                             solver='newton-cg')
            L2_classify.fit(train_set.drop(['y'], axis=1), train_set['y'])

            # cv for model selection
            cv_result_list = cross_val_score(L2_classify,
                                             validation_set.drop(['y'],
                                                                 axis=1),
                                             validation_set['y'],
                                             cv=5)
            cv_accuracy.append(np.mean(cv_result_list))
        result[lamda] = 1 - np.mean(cv_accuracy)
        minimum_cv_error = min(minimum_cv_error, result[lamda])

    # print all lambda and cv error
    for key in result.keys():
        print('when lambda =', key, 'cv error rate =', result[key])

    # select the best model
    for key in result.keys():
        if result[key] == minimum_cv_error:
            print('the best model is when lambda =', key)
            print('cv error rate =', minimum_cv_error)

            train_set, no_use_set, test_set = transform(
                preprocess_method, train_validation_frame,
                train_validation_frame, test_frame, columns)
            L2_classify = LogisticRegression(C=1 / key,
                                             penalty='l2',
                                             solver='newton-cg')
            L2_classify.fit(train_set.drop(['y'], axis=1), train_set['y'])

            # print test error
            print(
                'test error =', 1 -
                L2_classify.score(test_set.drop(['y'], axis=1), test_set['y']))

            # print train error for whole train set
            print(
                'train error =', 1 - L2_classify.score(
                    train_set.drop(['y'], axis=1), train_set['y']))
            break
    print('')
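# A minimal usage sketch for run_LR, assuming the split_train_set/transform
# helpers and the data frames referenced above are already defined; the
# 'standardize' preprocessing name and the lambda grid below are illustrative.
lambda_grid = [10 ** p for p in np.arange(-2, 2, 0.5)]
run_LR(lambda_grid, 'standardize', train_validation_frame, test_frame, columns)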
Example #2
def logic_pca_standard(y, n):
    # Logistic regression + PCA dimensionality reduction + standardization
    pa = PCA(n_components=n)
    data = pa.fit_transform(train)
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # Standardize
    std = StandardScaler()
    print(std)
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # estimator
    logic = LogisticRegression()
    logic.fit(x_train, y_train)
    # Predict
    pre_score = logic.score(x_test, y_test)
    print("Accuracy (logistic regression + PCA + standardization): {}".format(pre_score))
    print(
        "Precision and recall:",
        classification_report(y_test,
                              logic.predict(x_test),
                              labels=[0, 1],
                              target_names=["non-high income", "high income"]))
    # Predicted probabilities
    predictions = logic.predict_proba(x_test)
    # Compute Receiver operating characteristic (ROC)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("AUC: {}".format(auc_value))
class LogReg:
    def __init__(self):
        self.load_data()
        self.clf = LogisticRegression(class_weight='balanced')
        self.train()
        self.predict()

    def load_data(self):
        train_csv = './data/train.csv'
        test_csv = './data/test.csv'
        df_train = pd.read_csv(train_csv, header=0)
        df_test = pd.read_csv(test_csv, header=0)
        arr_train = df_train.values
        arr_test = df_test.values
        self.train_X = arr_train[:, 1:]
        self.train_Y = arr_train[:, 0]
        self.test_X = arr_test[:, 1:]
        self.test_ID = arr_test[:, 0]

    def train(self):
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        return (self.clf.score(self.train_X, self.train_Y))

    def store_result(self):
        df_out = pd.DataFrame()
        df_out['Id'] = self.test_ID
        df_out['Action'] = self.test_Y[0::,1]
        df_out.to_csv('./data/results/c1_result.csv',index=False)
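# A minimal usage sketch for the LogReg class, assuming ./data/train.csv and
# ./data/test.csv exist with the label in the first column as load_data expects.
logreg_model = LogReg()
print('training accuracy:', logreg_model.get_training_accuracy())
logreg_model.store_result()  # writes ./data/results/c1_result.csv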
Example #4
def regressiontest(list0Posts, listClasses):
    digits = datasets.load_digits()
    digits.data = list0Posts
    digits.target = listClasses
    length = len(digits.data)
    # Multiple linear regression (not used; logistic regression is used instead)
    # model = LinearRegression()
    model = LogisticRegression()
    model.fit(digits.data, digits.target)
    joblib.dump(model, 'models/regressiontest/regressiontest.pkl')
    model_load = joblib.load('models/regressiontest/regressiontest.pkl')
    testInput = digits.data
    testResult = model_load.predict(testInput)
    realResult = digits.target
    testLen = len(testResult)
    errorCount = 0
    totalError = 0
    for i in range(len(testResult)):
        if fabs(testResult[i] - realResult[i]) > 10:
            errorCount += 1
            # print i+2
    totalError += float(errorCount) / testLen
    totalRight = 1 - totalError
    print "number of training:", length, "errorCount :", errorCount, "total error rate", totalError, "total right rate", totalRight
    print "R-squared:", model.score(testInput, testResult)
Example #5
def score(id):
    data = []
    mark = []
    with open(id, 'r', encoding='utf-8_sig') as f:
        csv_reader = csv.reader(f)
        for x in csv_reader:
            data.append(list(map(float, x[0:-1])))
            mark.append(float(x[-1]))
    acc = []
    auc = []
    f1 = []
    for i in range(10):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            data, mark, test_size=0.05, random_state=i)
        clf = LogisticRegression(C=4.8, random_state=1113)
        clf.fit(X_train, y_train)
        # print('accuracy:', clf.score(X_test, y_test))
        acc.append(round(clf.score(X_test, y_test), 3))
        y_pred = clf.predict(X_test)
        # print ('ACC: %.4f' % metrics.accuracy_score(y_test,y_pred))
        auc.append(round(metrics.roc_auc_score(y_test, y_pred), 3))
        # print ('F1-score: %.4f' %metrics.f1_score(y_test,y_predict))
        f1.append(round(metrics.f1_score(y_test, y_pred), 3))
    acc.append(round(sum(acc) / len(acc), 3))
    auc.append(round(sum(auc) / len(auc), 3))
    f1.append(round(sum(f1) / len(f1), 3))
    return [auc, acc, f1]
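# A minimal usage sketch for score(), assuming a CSV whose last column is the
# binary label; 'features.csv' is a placeholder path, not from the original.
auc_list, acc_list, f1_list = score('features.csv')
print('mean AUC:', auc_list[-1], 'mean ACC:', acc_list[-1], 'mean F1:', f1_list[-1])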
Example #6
def train_model(X, Y, name, plot=False):
    """
        train_model(vector, vector, name[, plot=False])
        
        Trains and saves model to disk.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)

    clfs = []  # for the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

    #save the trained model to disk
    joblib.dump(clf, 'C:\\Users\\hp\\Desktop\\project\\logregdata.rar')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #7
def model(train_data, new_label):
    train_x, test_x, train_y, test_y = train_test_split(train_data,
                                                        new_label,
                                                        test_size=0.5,
                                                        random_state=0)
    clf = LogisticRegression()
    clf.fit(train_x, train_y)
    score = clf.score(test_x, test_y)
    return score
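# A minimal usage sketch with sklearn's built-in breast-cancer data standing in
# for train_data/new_label (an assumption; the original data source is not shown).
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_demo, y_demo = load_breast_cancer(return_X_y=True)
print(model(X_demo, y_demo))  # held-out accuracy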
Example #8
all_real_y = []
all_pred_y = []
for train_inds, test_inds in folder.split(X):
    print(n_fold)
    n_fold += 1
    sel_w_pvals = fwd_stepwise_selection(
        pd.DataFrame(X[train_inds, :], columns=immu_cols),
        y[train_inds], verbose=True, top_n=10)  # ~1/3 of available samples
    print('Forward-stepwise selection: ' + ' -> '.join(sel_w_pvals))
    sel_vars_inds = np.isin(immu_cols, sel_w_pvals)

    clf.fit(X[train_inds, :][:, sel_vars_inds], y[train_inds])

    sel_frequencies.append(sel_vars_inds)
    coefs.append(clf.coef_)
    accs.append(clf.score(X[test_inds, :][:, sel_vars_inds], y[test_inds]))
    all_pred_y += list(clf.predict(X[test_inds, :][:, sel_vars_inds]))
    all_real_y += list(y[test_inds])
    all_sel_cols += sel_w_pvals
print(np.mean(accs))
# print(accs)

from sklearn.metrics import f1_score
print(f1_score(y_true=all_real_y, y_pred=all_pred_y))

#Confusion matrix, Accuracy, sensitivity and specificity
from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix(y_true=all_real_y, y_pred=all_pred_y)
print('Confusion Matrix : \n', cm1)
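# The heading above also mentions accuracy, sensitivity and specificity; a small
# sketch deriving them from the 2x2 matrix, assuming confusion_matrix's default
# [0, 1] label ordering.
tn, fp, fn, tp = cm1.ravel()
print('Accuracy   :', (tp + tn) / cm1.sum())
print('Sensitivity:', tp / (tp + fn))
print('Specificity:', tn / (tn + fp))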
def train_model(clf_factory, X, Y, name, plot=False):
    """
        Trains and saves model to disk.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit( n=len(X), n_iterations=1, test_fraction=0.3, indices=True, random_state=0)
    #print "cv = ",cv
    train_errors = []
    test_errors = []

    scores = []

    pr_scores, precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)

    roc_scores, tprs, fprs = defaultdict(list), defaultdict(list) ,defaultdict(list)

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            #print("Plotting %s"%genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            #plot_pr(pr_scores[label][median], desc, precisions[label][median],recalls[label][median], label='%s vs rest' % genre_list[label])
            #plot_roc(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    #save the trained model to disk
    joblib.dump(clf, 'saved_model_fft/my_model.pkl')
    
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def main():
    global train
    global test
    # Store the training-set labels in y and drop the Y column from train
    y = train['Y']
    del train['Y']

    # Rename the column headers (map the Chinese column names to English)
    origin = [
        "年龄", "工作天数", "职业类型", "投资收入", "投资损失", "省份", "教育", "家庭角色", "婚姻状况",
        "教育时间", "民族", "工作情况", "性别"
    ]
    target = [
        "age", "work_days", "job", "invest_income", "invest_loss", "province",
        "education", "home_role", "marital_status", "education_time", "nation",
        "work_type", "gender"
    ]
    rename_dict = dict()
    for i in range(len(origin)):
        rename_dict[origin[i]] = target[i]
    train.rename(columns=rename_dict, inplace=True)
    test.rename(columns=rename_dict, inplace=True)

    # Check for missing data
    print("=================== Missing data summary: training set ====================")
    print(train.isnull().sum(axis=0))
    print(train.isnull().any())
    print("=================== Missing data summary: test set ====================")
    print(test.isnull().sum(axis=0))
    print(test.isnull().any())

    full_data = [train, test]
    # Convert the gender feature to numeric
    for dataset in full_data:
        dataset['gender'] = dataset['gender'].map({'女': 0, '男': 1}).astype(int)

    # Process net investment income into five categories
    for dataset in full_data:
        dataset['invest'] = dataset['invest_income'] - dataset['invest_loss']
    for dataset in full_data:
        dataset.loc[dataset['invest'] < 0, 'invest'] = 0
        dataset.loc[dataset['invest'] == 0, 'invest'] = 1
        dataset.loc[(dataset['invest'] > 0) & (dataset['invest'] <= 5000),
                    'invest'] = 2
        dataset.loc[(dataset['invest'] > 5000) & (dataset['invest'] <= 10000),
                    'invest'] = 3
        dataset.loc[dataset['invest'] > 10000, 'invest'] = 4

    # Convert province to a numeric code
    for dataset in full_data:
        province_list = []
        for province_name in dataset['province']:
            province_list.append(int(province_name.replace("省份", "")) / 2)
        dataset['province'] = np.array(province_list)

    # Convert categorical features to dummy (numeric) codes
    dumb_columns('job')  # job type
    dumb_columns('education')  # education
    dumb_columns('nation')  # ethnicity
    dumb_columns('home_role')  # household role
    dumb_columns('marital_status')  # marital status
    dumb_columns('work_type')  # employment type

    # Bin age into five categories
    for dataset in full_data:
        # Mapping Age
        dataset.loc[dataset['age'] <= 22, 'age'] = 0
        dataset.loc[(dataset['age'] > 22) & (dataset['age'] <= 32), 'age'] = 1
        dataset.loc[(dataset['age'] > 32) & (dataset['age'] <= 48), 'age'] = 2
        dataset.loc[(dataset['age'] > 48) & (dataset['age'] <= 64), 'age'] = 3
        dataset.loc[dataset['age'] > 64, 'age'] = 4

    # Scale down working days so feature magnitudes stay comparable
    for dataset in full_data:
        dataset['work_days'] = dataset['work_days'] / 10

    # Drop unnecessary columns
    drop_elements = ['invest_income', 'invest_loss', 'education']
    train = train.drop(drop_elements, axis=1)
    test = test.drop(drop_elements, axis=1)

    # Show the training and test set features
    print(train.head(3))
    print("===")
    print(test.head(3))

    # List of candidate models
    model_list = []

    # ===================== Logistic regression ===================
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # estimator
    logic = LogisticRegression()
    logic.fit(x_train, y_train)
    # Predict
    print(
        "Precision and recall (logistic regression):",
        classification_report(y_test,
                              logic.predict(x_test),
                              labels=[0, 1],
                              target_names=["non-high income", "high income"]))
    pre_score = logic.score(x_test, y_test)
    print("Accuracy (logistic regression): {}".format(pre_score))
    # Predicted probabilities
    predictions = logic.predict_proba(x_test)
    # Compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("AUC: {}".format(auc_value))
    model_list.append({"model": logic, "auc": auc_value})
    # Plot
    plt.title('LogisticRegression AUC')
    plt.plot(fpr, tpr, 'r', label='AUC_LOGIC = %0.3f' % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')
    # plt.savefig("./LogisticRegression_auc.png")

    # ============== Decision tree ===========
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # Convert rows to dict records and extract features
    dc = DictVectorizer(sparse=False)
    x_train = dc.fit_transform(x_train.to_dict(orient="records"))
    features = dc.get_feature_names()
    x_test = dc.transform(x_test.to_dict(orient="records"))
    # estimator
    dec = DecisionTreeClassifier(max_depth=4)
    dec.fit(x_train, y_train)
    # Save the decision tree locally
    # dot -Tpng -o tree.png tree.dot
    export_graphviz(dec, out_file="./tree.dot", feature_names=features)
    # Predict
    print(
        "Precision and recall (decision tree):",
        classification_report(y_test,
                              dec.predict(x_test),
                              labels=[0, 1],
                              target_names=["non-high income", "high income"]))
    pre_score = dec.score(x_test, y_test)
    print("Accuracy (decision tree): {}".format(pre_score))
    # Predicted probabilities
    predictions = dec.predict_proba(x_test)
    # Compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("AUC: {}".format(auc_value))
    model_list.append({"model": dec, "auc": auc_value})
    # Plot
    plt.title('DecisionTreeClassifier AUC')
    plt.plot(fpr, tpr, 'b', label='AUC_DTC = %0.3f' % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')
    # plt.savefig("./DecisionTreeClassifier_auc.png")

    # ============= Random forest ==============
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # Convert rows to dict records and extract features
    dc = DictVectorizer(sparse=False)
    x_train = dc.fit_transform(x_train.to_dict(orient="records"))
    # print(dc.get_feature_names())
    x_test = dc.transform(x_test.to_dict(orient="records"))
    # estimator
    rf = RandomForestClassifier(n_estimators=5)
    rf.fit(x_train, y_train)
    # Predict
    print(
        "Precision and recall (random forest):",
        classification_report(y_test,
                              rf.predict(x_test),
                              labels=[0, 1],
                              target_names=["non-high income", "high income"]))
    pre_score = rf.score(x_test, y_test)
    print("Accuracy (random forest): {}".format(pre_score))
    # Predicted probabilities
    predictions = rf.predict_proba(x_test)
    # Compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("AUC: {}".format(auc_value))
    model_list.append({"model": rf, "auc": auc_value})
    # Plot
    plt.title('RandomForestClassifier AUC')
    plt.plot(fpr, tpr, 'y', label='AUC_RF = %0.3f' % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')
    plt.savefig("./count_auc.png")

    # Compare the models and predict with the one that has the highest AUC
    sorted_key_list = sorted(model_list, key=lambda x: x['auc'], reverse=True)
    model = sorted_key_list[0]['model']
    auc_v = sorted_key_list[0]['auc']

    print("选择模型 {}".format(model))
    print("AUC值为 {}".format(auc_v))
    pre_data = model.predict_proba(test)
    # 保存目标值
    test['Y'] = pre_data[:, 0]
    test['Y'].to_csv('Results_1.csv',
                     encoding='utf-8',
                     index=False,
                     header=False)
    # Save the full version
    test_origin['Y'] = pre_data[:, 0]
    test_origin.to_csv("./my_results.csv", encoding='utf-8', index=False)
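# Hypothetical driver sketch: main() expects module-level `train`, `test` and
# `test_origin` DataFrames plus the dumb_columns() helper used above; the file
# names below are placeholders, not from the original.
if __name__ == '__main__':
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    test_origin = test.copy()
    main()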
def test_fit_credit_backupsklearn():
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.LogisticRegression

    enet_h2o4gpu = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet_h2o4gpu.fit(X, y)
    print("h2o4gpu predict()")
    print(enet_h2o4gpu.predict(X))
    print("h2o4gpu score()")
    print(enet_h2o4gpu.score(X,y))

    enet = Solver(dual=True, max_iter=100, tol=1E-4, intercept_scaling=0.99, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet.predict(X))
    print("h2o4gpu scikit wrapper predict_proba()")
    print(enet.predict_proba(X))
    print("h2o4gpu scikit wrapper predict_log_proba()")
    print(enet.predict_log_proba(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet.score(X,y))
    print("h2o4gpu scikit wrapper decision_function()")
    print(enet.decision_function(X))
    print("h2o4gpu scikit wrapper densify()")
    print(enet.densify())
    print("h2o4gpu scikit wrapper sparsify")
    print(enet.sparsify())
    
    from sklearn.linear_model import LogisticRegression
    enet_sk = LogisticRegression(dual=True, max_iter=100, tol=1E-4, intercept_scaling=0.99, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit predict_proba()")
    print(enet_sk.predict_proba(X))
    print("Scikit predict_log_proba()")
    print(enet_sk.predict_log_proba(X))
    print("Scikit score()")
    print(enet_sk.score(X,y))
    print("Scikit decision_function()")
    print(enet_sk.decision_function(X))
    print("Scikit densify()")
    print(enet_sk.densify())
    print("Sciki sparsify")
    print(enet_sk.sparsify())

    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet.coef_)
    print(enet_sk.intercept_)
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
def train_model(clf_factory, X, Y, name, plot=False):
    """
        Trains and saves model to disk.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)
    #print "cv = ",cv
    train_errors = []
    test_errors = []

    scores = []

    # Keep these as defaultdicts; wrapping them in list() would produce empty
    # lists and break the per-label appends used later in this function.
    pr_scores, precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list), defaultdict(list)

    roc_scores, tprs, fprs = defaultdict(list), defaultdict(list), defaultdict(
        list)

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
    """ for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)"""

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median],
                    desc,
                    precisions[label][median],
                    recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median],
                     desc,
                     tprs[label][median],
                     fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores))
    print(summary)

    #save the trained model to disk
    joblib.dump(
        clf,
        r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl'
    )

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
#df_german=pd.read_excel("df_after_vif.xlsx")
y = df_german["target"]
#x=df_german.ix[:,"Account Balance":"Foreign Worker"]
x = df_german.loc[:, "Account Balance":"Foreign Worker"]
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#solver='liblinear'
classifier = LogisticRegression(solver='liblinear')
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

print("accuracy on the training subset:{:.3f}".format(
    classifier.score(X_train, y_train)))
print("accuracy on the test subset:{:.3f}".format(
    classifier.score(X_test, y_test)))
'''
P0 = 50
PDO = 10
theta0 = 1.0/20
B = PDO/np.log(2)
A = P0 + B*np.log(theta0)
'''


def Score(probability):
    # Scorecard scaling: A and B come from the P0/PDO/theta0 block quoted above,
    # which is commented out and must be defined before this function is called.
    score = A - B * np.log(probability / (1 - probability))
    return score
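# A minimal sketch wiring the commented-out scorecard parameters quoted above
# into Score(); it reuses classifier/X_test from this example and assumes the
# positive class (column 1 of predict_proba) is the outcome being scored.
import numpy as np

P0 = 50
PDO = 10
theta0 = 1.0 / 20
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)

prob_bad = classifier.predict_proba(X_test)[:, 1]
print(Score(prob_bad)[:5])  # first few scores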
Example #14
pl.plot(x[:, 0], x[:, 1], '.')
pl.show()

pca = PCA(n_components=1)
pca.fit(x)

print(pca.explained_variance_)
print(np.cov(np.dot(x, v).T))

x = load_iris()['data']
y = load_iris()['target']

lr = LR()
lr.fit(x, y)

print(lr.score(x, y))  # accuracy

tx = np.dot(x, lr.coef_.T)

pl.plot(tx)
pl.show()

pca2 = PCA(n_components=2)
pca2.fit(x)
px = pca2.transform(x)

pl.plot(px[0:50, 0], px[0:50, 1], 'r.')
pl.plot(px[50:100, 0], px[50:100, 1], 'gx')
pl.plot(px[100:150, 0], px[100:150, 1], 'ko')
pl.show()
print(root_path)
file_path = os.path.abspath(
    os.path.join(root_path, "python", "datasets", "datasets", "bankloan.xls"))
print(file_path)

data = pd.read_excel(file_path)

x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

features_columns = data.columns[:len(data.columns) - 1]

rlr = RandomizedLogisticRegression()  # random logistics regression model
rlr.fit(x, y)  # training
rlr.get_support()  # get feature selection results

print(features_columns)
print(rlr.get_support())  # get feature selection result.
print(rlr.scores_)  # get each feature score

print(u'RandomizedLogisticRegression feature selection finished.')
print(u'The effective features are:\n\t %s' %
      ', '.join(features_columns[rlr.get_support()]))

x = data[features_columns[rlr.get_support()]].values  # selected features

lr = LogisticRegression()  # create logistic regression model
lr.fit(x, y)  # using effective features training model

print(u'Accuracy:%s' % lr.score(x, y))
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size=0.33)

lgr = LogisticRegression()
lgr.fit(X_train, Y_train)
score = lgr.score(X_test, Y_test)

print(score)
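# If a mean and standard deviation across folds was the intent, a cross-validated
# sketch (5 folds and max_iter=1000 are assumptions, not from the original):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), iris.data, iris.target, cv=5)
print(cv_scores.mean())
print(cv_scores.std())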
Example #17
acc = clf.score(XX, yy) * 100

y_pre = clf.predict(XX)
d = np.equal(y_pre, yy)
dd = np.sum(np.equal(y_pre, yy) == True)
print('matchs:{0}/{1}'.format(np.sum(np.equal(y_pre, yy) == True),
                              yy.shape[0]))
print(dd / yy.shape[0])

t = np.array([1, 2, 3])
u = np.array([4, 5, 6])
print(t * u.T)

claa = LogisticRegression(max_iter=7000)
claa.fit(XX, yy)
acc2 = claa.score(XX, yy)

y_pre = claa.predict(XX)
d = np.equal(y_pre, yy)
dd = np.sum(np.equal(y_pre, yy) == True)
print('matchs:{0}/{1}'.format(np.sum(np.equal(y_pre, yy) == True),
                              yy.shape[0]))
print(dd / yy.shape[0])

data_reg = []  # obtain data for linear regression
with open('linear-regression.txt', 'r') as file:
    line = file.readline().strip('\n').split(',')
    while line != ['']:
        line = list(map(float, line))
        data_reg.append(line)
        line = file.readline().strip('\n').split(',')
Example #18
combine = [train_df, test_df]
"""
It's part of Model construction

"""

X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()

# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred1 = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
print('acc_log:', acc_log)

coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(
    logreg.coef_[0]
)  # Each  logistic reg has a coef result and use this way could get values .

# print(coeff_df.sort_values(by='Correlation', ascending=False))

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred2 = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
print('acc_svc:', acc_svc)
Example #19
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, test_size=0.2, random_state=31)

train_sizes = range(10,len(X_train),25)


lr = LogisticRegression()
nb = GaussianNB()

lr_scores = []
nb_scores = []

# Train on progressively larger training subsets
for train_size in train_sizes:
    X_slice, _, y_slice, _ = train_test_split(
        X_train, y_train, train_size=train_size, stratify=y_train, random_state=31)
    nb.fit(X_slice, y_slice)
    nb_scores.append(nb.score(X_test, y_test))
    lr.fit(X_slice, y_slice)
    lr_scores.append(lr.score(X_test, y_test))

# Figure
plt.figure()
plt.title("Naive Bayes and Logistic Regression Accuracies")
plt.xlabel("Number of training instances")
plt.ylabel("Test set accuracy")
plt.grid(True)
plt.plot(train_sizes,nb_scores,label='Naive Bayes')
plt.plot(train_sizes,lr_scores,label='Logistic Regression',linestyle='--')
plt.legend()
plt.savefig('Naive Bayes and Logistic Regression Accuracies.png')
# plt.show()
# print all lambda and cv error
for key in result.keys():
    print('when lambda =', key, 'cv error rate =', result[key])

# select the best model
for key in result.keys():
    if result[key] == minimum_cv_error:
        print('the best model is when lambda =', key)
        print('cv error rate =', minimum_cv_error)
        train, validation = train_test_split(train_data_frame, test_size=0.33, random_state=42)

        classify = LogisticRegression(C=1 / key, penalty='l1', solver='liblinear')
        classify.fit(train.drop(['default payment next month'], axis=1),
                     train['default payment next month'])

        # print test error
        print('test error =', 1 - classify.score(
            test_data_frame.drop(['default payment next month'], axis=1),
            test_data_frame['default payment next month']))

        # print train error for whole train set
        print('train error =', 1 - classify.score(
            train.drop(['default payment next month'], axis=1),
            train['default payment next month']))
        break
print('')



# logis-l2
print('logis-l2')
result = {}
minimum_cv_error = 1
lmbd = [10 ** x for x in np.arange(-2, 2, 0.1)]
for lamda in lmbd:
    cv_accuracy = []
Example #21
def train_model(X, Y, name, plot=False):
    """
        Training the model and saving it to disk.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X),
                      n_iterations=1,
                      test_fraction=0.3,
                      indices=True,
                      random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    #saving the trained model to disk
    joblib.dump(clf, 'saved_model/model_ceps.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #22
x_test = digits[N_train:, :]
y_test = dig_label[N_train:]

# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)

pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)

# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy_test = %f" % (acc_train, acc_test))

score_train = lr.score(x_train, y_train)
score_test = lr.score(x_test, y_test)
print("score_train = %f, score_test = %f" % (score_train, score_test))

# +
from sklearn.metrics import confusion_matrix

# plot confusion matrix
cm = confusion_matrix(y_test, pred_test)

plt.matshow(cm)
plt.title(u'Confusion Matrix')
plt.colorbar()
plt.ylabel(u'Groundtruth')
plt.xlabel(u'Predict')
plt.show()
Example #23
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i, :] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i, :] = xy
    i += 1

np.random.shuffle(data)

X = data[:, :-1]
Y = data[:, -1]

Xtrain = X[:-100, ]
Ytrain = Y[:-100, ]
Xtest = X[-100:, ]
Ytest = Y[-100:, ]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification rate:", model.score(Xtest, Ytest))

threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, " : ", weight)
Example #24
def plotshow(solver, trainingSet, testSet):

    data = []
    label = []
    data_test = []
    label_test = []

    for i in range(len(trainingSet)):
        data.append(list(map(eval, trainingSet[i][:-1])))
        label.append(list(map(eval, trainingSet[i][-1])))
    for n in range(len(testSet)):
        data_test.append(list(map(eval, testSet[n][:-1])))
        label_test.append(list(map(eval, testSet[n][-1])))

    global clf
    clf = LogisticRegression(C=1000.0, solver=solver, multi_class="ovr")

    clf.fit(data, label)
    score = clf.score(data_test, label_test)
    data = np.array(data)
    label = np.array(label)

    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
    h = .02  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Logistic')
    #plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot also the training points
    plt.scatter(data[:, 0],
                data[:, 1],
                c=np.squeeze(label),
                edgecolors='k',
                cmap=plt.cm.Paired)
    plt.xlabel('petal length')
    plt.ylabel('petal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

    plt.subplot(1, 2, 2)
    z = np.arange(-5, 5, 0.05)
    phi_z = sigmoid(z)
    plt.title('Sigmoid')
    plt.plot(z, phi_z)
    plt.axvline(0.0, color='k')
    plt.ylim(-0.1, 1.1)

    plt.yticks([0.0, 0.5, 1.0])
    ax = plt.gca()
    ax.yaxis.grid(True)
    plt.tight_layout()
    plt.savefig(
        "E:/Anaconda/Scripts/CorsApi/snippets/static/picture/logistic.jpg")
    return score
def train_model(X, Y, name, plot=False):
    """
        train_model(vector, vector, name[, plot=False])
        
        Trains and saves model to disk.
    """
    labels = np.unique(Y)
    print(labels)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)
    
    
    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # for the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    #save the trained model to disk
    joblib.dump(clf, 'saved_model/model_ceps.pkl')
    
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #26
clf = SVC()
clf.fit(X_train, y_train)
print("Classification result using support vector classification:")
print(clf.score(X_test, y_test))  # support vector classification
# NuSVC
clf = NuSVC()
clf.fit(X_train, y_train)
print("Classification result using support vector classification:")
print(clf.score(X_test, y_test))  # kernel (nu) support vector classification

clf = GaussianNB()
clf.fit(X_train, y_train)
print("Classification result using naive Bayes:")
print(clf.score(X_test, y_test))  # naive Bayes classification

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print("Classification result using logistic regression:")
print(classifier.score(X_test, y_test))  # logistic regression

classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
print("Classification result using a decision tree:")
print(classifier.score(X_test, y_test))

classifier = GradientBoostingClassifier(n_estimators=200)
classifier.fit(X_train, y_train)
print("Classification result using GBDT:")
print(classifier.score(X_test, y_test))

def train_model(X,
                Y,
                name,
                plot=False,
                outModelName=outModelName,
                testSize=0.3):
    """
    train_model(vector, vector, name[, plot=False])
    Trains and saves model to disk.
    Parameters
    ----------
    outModelName : path to save the trained model (*.pkl)
    testSize : fraction of the data used for testing
    Returns
    -------
    outModelName, 
    np.mean(train_errors)
    np.mean(test_errors)
    np.asarray(cms)
    
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=testSize, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # for the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    #save the trained model to disk
    if outModelName: joblib.dump(clf, outModelName)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def Logistic_Regression(iter_argument=200):
    classifier_LG = LogisticRegression(max_iter=iter_argument)
    classifier_LG.fit(X_train, y_train)
    test_accur = (classifier_LG.score(X_test, y_test))
    train_accur = (classifier_LG.score(X_train, y_train))
    return test_accur, train_accur
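# A minimal usage sketch, assuming the X_train/X_test/y_train/y_test globals
# used by Logistic_Regression() are already defined in this scope.
test_acc, train_acc = Logistic_Regression()
print('test accuracy:', test_acc, 'train accuracy:', train_acc)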
def test_fit_credit_backupsklearn():
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.LogisticRegression

    enet_h2o4gpu = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet_h2o4gpu.fit(X, y)
    print("h2o4gpu predict()")
    print(enet_h2o4gpu.predict(X))
    print("h2o4gpu score()")
    print(enet_h2o4gpu.score(X, y))

    enet = Solver(dual=True, max_iter=100, tol=1E-4, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet.predict(X))
    print("h2o4gpu scikit wrapper predict_proba()")
    print(enet.predict_proba(X))
    print("h2o4gpu scikit wrapper predict_log_proba()")
    print(enet.predict_log_proba(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet.score(X, y))
    print("h2o4gpu scikit wrapper decision_function()")
    print(enet.decision_function(X))
    print("h2o4gpu scikit wrapper densify()")
    print(enet.densify())
    print("h2o4gpu scikit wrapper sparsify")
    print(enet.sparsify())

    from sklearn.linear_model import LogisticRegression
    enet_sk = LogisticRegression(dual=True,
                                 max_iter=100,
                                 tol=1E-4,
                                 random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit predict_proba()")
    print(enet_sk.predict_proba(X))
    print("Scikit predict_log_proba()")
    print(enet_sk.predict_log_proba(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))
    print("Scikit decision_function()")
    print(enet_sk.decision_function(X))
    print("Scikit densify()")
    print(enet_sk.densify())
    print("Sciki sparsify")
    print(enet_sk.sparsify())

    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet.coef_)
    print(enet_sk.intercept_)
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
Example #30
# get word vector
x = np.zeros((len(raw_x), 100))
for i in range(len(raw_x)):
    x[i] = model[raw_x[i]]
print(x.shape, y.shape)
print(np.unique(y))
# training set and test set split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y)

# logistic regression
lr = LogisticRegression(C=1.0, solver='sag', max_iter=400, n_jobs=-1)
lr.fit(x_train, y_train)
print('Logistic regression accuracy: ', lr.score(x_test, y_test))
y_pred = lr.predict(x_test)
ac = np.sum([1 if y_pred[i] == y_test[i] else 0
             for i in range(len(y_pred))]) / len(y_pred)

# svm
sigma = 0.1  # sigma controls how localized the RBF kernel is, and hence generalization:
# a small sigma means the Gaussian acts mostly near the support vectors, a larger sigma
# widens its reach and improves generalization; too small a sigma causes overfitting
gamma = np.power(sigma, -2.) / 2.
svm_model = SVC(C=1.0, kernel='rbf', gamma=gamma)
svm_model.fit(x_train, y_train)
print('svm accuracy: ', svm_model.score(x_test, y_test))


# neural networks
# First convert y to one-hot form, i.e. (len(y), np.unique(y).shape[0]) ==> (30804, 3)
	vect__norm: 'l2'
	vect__use_idf: True
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix

__author__ = 'gavin'
import pandas as pd

df = pd.read_csv('sms/sms.csv')

X_train_r, X_test_r, y_train, y_test = train_test_split(
    df['message'], df['label'])

vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=None,
                             ngram_range=(1, 1),
                             norm='l2',
                             use_idf=True)
X_train = vectorizer.fit_transform(X_train_r)
X_test = vectorizer.transform(X_test_r)
classifier = LogisticRegression(penalty='l2', C=7)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print('score', classifier.score(X_test, y_test))
print('precision', precision_score(y_test, predictions))
print('recall', recall_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
# plot and show the confusion matrix with a color bar
plt.matshow(confusion_matrix(Y_test, regressor.predict(X_test)))
plt.title('Sentiment Analysis from reviews')
plt.ylabel('True Values')
plt.xlabel('Predicted Values')
plt.colorbar()
plt.show()

# %%
#recall precision accuracy and f1 scores
print('Accuracy: %s' % accuracy_score(Y_test, regressor.predict(X_test)))
#print('Recall: %s'%recall_score(Y_test, regressor.predict(X_test), average='macro'))
#print('Precision: %s'%precision_score(Y_test, regressor.predict(X_test), average='macro'))
#print('F1: %s'%f1_score(Y_test, regressor.predict(X_test), average='macro'))
print('CR: %s' % classification_report(Y_test, regressor.predict(X_test)))
print('Score (mean accuracy): %s' % regressor.score(X_test, Y_test))
print('Mean squared error: %s' % msq(Y_test, regressor.predict(X_test)))

### USING GRID SEARCH ### (called only when the function main is called)
# %%
#CROSS VAL SCORE
#print('Cross Val Score: %s'%cross_val_score(regressor, X_vec, Y, cv=5))


# %%
def main():
    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),
                         ('reg', LogisticRegression())])
    parameters = {
        'vect__max_df': (0.25, 0.5, 0.75),
        'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
Example #33
	clf__C: 7.0
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 2)
	vect__norm: 'l2'
	vect__use_idf: True
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix

__author__ = 'gavin'
import pandas as pd

df = pd.read_csv('sms/sms.csv')

X_train_r, X_test_r, y_train, y_test = train_test_split(df['message'], df['label'])

vectorizer = TfidfVectorizer(max_df=0.5, max_features=None, ngram_range=(1, 1), norm='l2', use_idf=True)
X_train = vectorizer.fit_transform(X_train_r)
X_test = vectorizer.transform(X_test_r)
classifier = LogisticRegression(penalty='l2', C=7)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print('score', classifier.score(X_test, y_test))
print('precision', precision_score(y_test, predictions))
print('recall', recall_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
    for num in number:
        data = []
        mark = []
        #with open('/Users/hhy/Desktop/1/node/final/cos/herb_only_meansum_'+str(threshold)+'.csv','r',encoding='utf-8_sig') as f:
        with open('/Users/hhy/Desktop/1/node/final/cmp/' + str(num) + 'emb' +
                  str(threshold) + '.csv',
                  'r',
                  encoding='utf-8_sig') as f:
            csv_reader = csv.reader(f)
            for x in csv_reader:
                data.append(list(map(float, x[0:-1])))
                mark.append(float(x[-1]))
        tmp_acc = []
        tmp_auc = []
        name = str(num) + 'emb' + str(threshold)
        for i in range(10):
            X_train, X_test, y_train, y_test = cross_validation.train_test_split(
                data, mark, test_size=0.05, random_state=i)
            clf = LogisticRegression(C=4.8, random_state=1113)
            clf.fit(X_train, y_train)
            tmp_acc.append(clf.score(X_test, y_test))
            y_predict = clf.predict_proba(X_test)[:, 1]
            tmp_auc.append(metrics.roc_auc_score(y_test,
                                                 y_predict))  # AUC on the held-out split
        final_acc[name] = (sum(tmp_acc) / len(tmp_acc))
        final_auc[name] = (sum(tmp_auc) / len(tmp_auc))
final_acc = sorted(final_acc.items(), key=lambda x: x[1], reverse=True)
final_auc = sorted(final_auc.items(), key=lambda x: x[1], reverse=True)
print('final acc:', final_acc)
print('final auc:', final_auc)
Example #35
def train(data):
    X, Y, _ = vectorize(data)
    classifier = LogisticRegression()
    classifier.fit(X,Y)
    print(classifier.score(X,Y))
def train_model(X_train, y_train, X_test, y_test, name, plot=False):
    """
        train_model(vector, vector, name[, plot=False])
        
        Trains and saves model to disk.
    """
    labels = np.unique(y_train)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # for the median

    cms = []

    #    print "X_train::"
    #    print X_train
    #    print "X_test::"
    #    print X_test
    #    print "y_train::"
    #    print y_train
    #    print "y_test::"
    #    print y_test

    clf = LogisticRegression()
    #clf=GaussianNB()
    #clf=SVC(probability=True)
    clf.fit(X_train, y_train)
    clfs.append(clf)

    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print "train_score:: " + str(train_score)
    print "test_score:: " + str(test_score)
    scores.append(test_score)

    train_errors.append(1 - train_score)
    test_errors.append(1 - test_score)

    y_pred = clf.predict(X_test)
    print(y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cms.append(cm)
    #    cms = np.asarray(cms)
    #    cm_avg = np.mean(cms, axis=0)
    #    cm_norm = cm_avg / np.sum(cm_avg, axis=0)
    #    plot_confusion_matrix(cm_norm, genre_list, "ceps","CEPS classifier - Confusion matrix")

    for label in labels:
        #print "label "+str(label)
        y_label_test = np.asarray(y_test == label, dtype=int)
        #print "y_label_test "+str(y_label_test)
        proba = clf.predict_proba(X_test)
        #print str(len(proba))+"proba "+str(proba)
        proba_label = proba[:, label]

        fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
        roc_scores[label].append(auc(fpr, tpr))
        tprs[label].append(tpr)
        fprs[label].append(fpr)

    #sys.exit(1)
    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    #save the trained model to disk
    joblib.dump(clf, 'saved_model/model_ceps.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #37
data_frame=pd.read_csv('data.csv')

X = data_frame[['speed_p','speed_r','speed_d','distance_d_p','distance_d_r','distance_d1_p','distance_d2_r' ,'angle_d','angle_d1_p','angle_d2_r' , 'possTimePre','possessionTime']]
Y = data_frame[['flag']]

# Load the .csv data with pandas, then split into a training set (75%) and test set (25%) with train_test_split:
X_train, X_test, y_train, y_test = train_test_split(X,Y.values.T[0],random_state=1)

# LogisticRegression likewise implements the fit() and predict() methods
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
score = classifier.score(X_test, y_test)

print("probabilities", probabilities)
print("accuracy", score)  # score() returns mean accuracy for classifiers

def rmse(y_test, y):
    return np.sqrt(np.mean((y_test - y) ** 2))

# root-mean-square error and log-loss
print("rmse", rmse(predictions, y_test))
# log_loss expects predicted probabilities rather than hard class labels
print("log_loss", log_loss(y_test, probabilities))

# array of coefficients of the linear combination
coef = classifier.coef_
print(coef)
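# Small illustrative addition (not in the original example): pair each feature
# name from the X DataFrame above with its learned weight to see which inputs
# push the prediction toward flag = 1 (binary case, so coef_ has a single row).
for feature_name, weight in zip(X.columns, classifier.coef_[0]):
    print(feature_name, weight)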
Beispiel #38
0
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender',
             classifier=None, reg_param=1.0, selection=False, num_feat=20, sel_method='LR',
             cv=10):
    instance_num = len(data_set_df.columns)
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered if features is None else df_filtered.loc[features]

    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
        # x_imp = dense_df.loc[x.index, x.columns]
    else:
        x_imp = x
    y_filtered = y_v[list(map(int, x.columns.values))]

    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        # skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]

            if selection:
                if sel_method == 'LR' or 'RF' in sel_method:
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                else:
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values

            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score

                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                              % (label, True if param.FILL_SUFFIX in feat_set_name else False,
                                 True if param.SCALING_SUFFIX in feat_set_name else False, selection, 'LR',
                                 reg_param, cv, fold, x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                pass
                # traceback.print_exc()
                # print i, "why error? skip!"

        print(result_str)
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)

        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)

        if fold > 0:
            score_mean = score_mean / fold
            miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate
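# Note (an addition): KFold(n, n_folds=...) above is the pre-0.18 scikit-learn
# API. With the current sklearn.model_selection module, the same fold loop would
# look roughly like this sketch, reusing x_imp, y_filtered and cv_num from the
# function body above.
from sklearn.model_selection import KFold

kf = KFold(n_splits=cv_num, shuffle=True)
for tr_index, te_index in kf.split(x_imp.T):
    x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
    y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
    # ... fit and score the classifier exactly as in the loop above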
Beispiel #39
0
def Logistic_train(X_in, y_in, X_out, cs, file_log=None):    
    if file_log:        
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    M = len(X_in[0])   #Number of features
    seed(time())
    
    # To prevent data snooping, break the input set into train, cross-validation
    # and test sets, with sizes proportional to 8-1-1.

    # First put aside 10% of the data for the final test
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    
    
    
    # scale the data first
    scaler = Scaler(copy=False)  # in-place modification; Scaler is the old name for StandardScaler
    # Fit the scaler (storing mean and standard deviation as inner parameters) on the
    # held-out 10% split, then apply the same transformation to every set
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)  # reuses the same mean_ and std_ fit above
    
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    
    # Remove features with (near-)zero variance
    
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]   
    
    M = len(X_in[0])
    # Then, on the remaining data, perform a ten-fold cross-validation over the
    # regularization parameters in cs
    best_cv_accuracy = 0.
    best_c = 0.

    for c in cs:
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        lrc = LogisticRegression(C=c, tol=1e-5)
                            
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
    
            X_train = array([X_in[i][:] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i][:] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]            
            
            lrc.fit(X_train, y_train)
            in_accuracy += lrc.score(X_train, y_train)
            cv_accuracy += lrc.score(X_cv, y_cv)
              
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        
        if file_log:
            file_log.writelines('C: {}\n'.format(c))  
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        if (cv_accuracy > best_cv_accuracy):
            best_c = c
            best_cv_accuracy = cv_accuracy
            
    # Refit on the full training data and estimate the out-of-sample error
    if file_log:        
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(1. - best_cv_accuracy, best_c)) 
    
    lrc = LogisticRegression(C=best_c, tol=1e-5)

    lrc.fit(X_in, y_in)
    if file_log:        
        file_log.writelines('Ein= {}\n'.format(1. - lrc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - lrc.score(X_scaler, y_scaler)))     
        
    y_out = lrc.predict(X_out)
    return y_out
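# For reference (an addition, not part of the original example): Scaler and
# cross_validation.StratifiedKFold(y, k=10) come from very old scikit-learn
# releases. A rough modern equivalent of the scaling and fold loop, shown here
# on toy stand-in data rather than the arrays used above:
import numpy as np
from sklearn.preprocessing import StandardScaler      # Scaler was renamed to StandardScaler
from sklearn.model_selection import StratifiedKFold   # replaces cross_validation.StratifiedKFold
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_train = rng.rand(100, 5)             # toy feature matrix
y_train = rng.randint(0, 2, size=100)  # toy binary labels

X_train_scaled = StandardScaler().fit_transform(X_train)

skf = StratifiedKFold(n_splits=10)
for t_idx, cv_idx in skf.split(X_train_scaled, y_train):
    lrc = LogisticRegression(C=1.0, tol=1e-5)
    lrc.fit(X_train_scaled[t_idx], y_train[t_idx])
    print(lrc.score(X_train_scaled[cv_idx], y_train[cv_idx]))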