Example #1
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

X, y = load_digits(return_X_y=True)

pipe = make_pipeline(MinMaxScaler(),
                     LogisticRegression(random_state=42, max_iter=1000))

scores = cross_validate(pipe, X, y, cv=3, return_train_score=True)

import pandas as pd

df_scores = pd.DataFrame(scores)

df_scores

df_scores.mean()

df_scores[['train_score', 'test_score']].boxplot()

scores = cross_validate(pipe, X, y, cv=10, return_train_score=True)

df_scores = pd.DataFrame(scores)

df_scores

df_scores[['train_score', 'test_score']].boxplot()

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

df = pd.read_csv('../datasets/diabetes.csv')

X = df.drop(['diabetes'], axis=1)
y = df['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

logreg = LogisticRegression()

logreg.fit(X_train, y_train)

'''
INSTRUCTIONS

*   Import roc_curve from sklearn.metrics.
*   Using the logreg classifier, which has been fit to the training data, compute the predicted probabilities of the labels of the test set X_test. Save the result as y_pred_prob.
*   Use the roc_curve() function with y_test and y_pred_prob and unpack the result into the variables fpr, tpr, and thresholds.
*   Plot the ROC curve with fpr on the x-axis and tpr on the y-axis.
'''

# Import necessary modules
from sklearn.metrics import roc_curve
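# A sketch of the steps listed in the INSTRUCTIONS block above, reusing the
# logreg model and the X_test/y_test split created earlier (illustrative only).
y_pred_prob = logreg.predict_proba(X_test)[:, 1]   # probability of the positive class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')                    # chance level
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()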
def get_dataset():
    x, y = load_iris(return_X_y=True)
    random_state = np.random.RandomState(2020)
    n_samples, n_features = x.shape
    # Add noise dimensions to the data so the PR curve is easier to observe
    x = np.concatenate([x, random_state.randn(n_samples, 200 * n_features)],
                       axis=1)
    # PR curve for the binary-classification case (keep only classes 0 and 1)
    x_train, x_test, y_train, y_test = train_test_split(
        x[y < 2], y[y < 2], test_size=0.5, random_state=random_state)
    return x_train, x_test, y_train, y_test


if __name__ == '__main__':
    x_train, x_test, y_train, y_test = get_dataset()
    model = LogisticRegression()
    model.fit(x_train, y_train)
    y_scores = model.predict_proba(x_test)
    # p_r_curve and compute_ap are helper functions assumed to be defined
    # earlier in the original module (not shown in this excerpt)
    precision, recall, _ = p_r_curve(y_test, y_scores[:, 1])
    ap = compute_ap(recall, precision)
    plt.plot(recall,
             precision,
             drawstyle="steps-post",
             label=f'LogisticRegression (AP={ap})')
    plt.legend(loc="lower left")
    plt.xlabel("Recall (Positive label: 1)")
    plt.ylabel("Precision (Positive label: 1)")
    # Plot with scikit-learn's built-in helper
    # (plot_precision_recall_curve is deprecated; newer versions use
    #  PrecisionRecallDisplay.from_estimator)
    plot_precision_recall_curve(model, x_test, y_test)
    plt.show()
# Create a pipeline that extracts features from the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# create feature union
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)
# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression(solver='liblinear')))
model = Pipeline(estimators)
# evaluate pipeline
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
Example #5
# Binarize the data with a threshold of 0.5; returns the binarized data
b_data=Binarizer(threshold=0.5).fit_transform(boston.data)
print(b_data[0:5,:])


# One-hot encode the boston target values; returns the encoded data
# (OneHotEncoder expects a 2-D array, hence the reshape)
o_target=OneHotEncoder().fit_transform(boston.target.reshape(-1, 1))
print(o_target[0:5])



### Feature selection ###


# Variance-threshold selection; returns the data after feature selection
# The threshold parameter is the variance cutoff
VarianceThreshold(threshold=3).fit_transform(iris.data)

# Chi-squared test: select the K best features; returns the selected features
select_data=SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

# Recursive feature elimination (RFE); returns the data after feature selection
# estimator is the base model
# n_features_to_select is the number of features to keep
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)

# Feature selection with an L1-penalized logistic regression as the base model
# (the 'liblinear' solver is needed for the L1 penalty in current scikit-learn)
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(iris.data, iris.target)

# Feature selection with GBDT (gradient boosting) as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
Example #6
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', svm.LinearSVC(C=1.0))
                   ])
svm_clf.fit(X_train, y_train)
svm_predicted = svm_clf.predict(X_test)

print(metrics.confusion_matrix(y_test, svm_predicted))
print(np.mean(svm_predicted==y_test))
print(metrics.classification_report(y_test, svm_predicted))

from sklearn.linear_model import LogisticRegression
lr_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', LogisticRegression())
                  ])
lr_clf.fit(X_train, y_train)
lr_predicted = lr_clf.predict(X_test)

print(metrics.confusion_matrix(y_test, lr_predicted))
print(np.mean(lr_predicted==y_test))
print(metrics.classification_report(y_test, lr_predicted))

lr_clf = LogisticRegression()
# lr_clf.fit(dtm, y_train)
lr_clf.fit(X_train_tf, y_train)

lr_clf_coef = (
    pd.DataFrame(lr_clf.coef_[0], index=dtm.columns)
    .rename(columns={0: 'Coefficient'})
)
Example #7
def main():
    p = optparse.OptionParser()
    p.add_option('--attr', '-a', type = str, help = 'attribute')
    p.add_option('--attr_type', '-t', type = str, help = 'attribute type')
    p.add_option('--num_train_each', '-n', type = int, help = 'number of training samples of True and False for the attribute (for total of 2n training samples)')
    p.add_option('--embedding', '-e', type = str, help = 'embedding (adj, normlap, regnormlap)')
    p.add_option('-k', type = int, help = 'number of eigenvalues')
    p.add_option('--sphere', '-s', action = 'store_true', default = False, help = 'normalize in sphere')
    p.add_option('--num_samples', '-S', type = int, default = 50, help = 'number of Monte Carlo samples')
    p.add_option('-v', action = 'store_true', default = False, help = 'save plot')
    p.add_option('--jobs', '-j', type = int, default = -1, help = 'number of jobs')
    opts, args = p.parse_args()

    attr, attr_type, num_train_each, embedding, k, sphere, num_samples, save_plot, jobs = opts.attr, opts.attr_type, opts.num_train_each, opts.embedding, opts.k, opts.sphere, opts.num_samples, opts.v, opts.jobs

    folder = 'gplus0_lcc/baseline5/'
    agg_precision_filename = folder + '%s_%s_n%d_%s_k%d%s_precision.csv' % (attr_type, attr, num_train_each, embedding, k, '_normalize' if sphere else '')
    plot_filename = folder + '%s_%s_n%d_%s_k%d%s_precision.png' % (attr_type, attr, num_train_each, embedding, k, '_normalize' if sphere else '')
    top_attrs_filename = folder + '%s_%s_n%d_%s_k%d%s_top_attrs.txt' % (attr_type, attr, num_train_each, embedding, k, '_normalize' if sphere else '')

    print("\nNominating nodes with whose '%s' attribute is '%s' (%d pos/neg seeds)..." % (attr_type, attr, num_train_each))
    print("\nLoading AttributeAnalyzer...")
    a = AttributeAnalyzer(load_data = False)
    sqrt_samples = np.sqrt(num_samples)

    try:
        agg_precision_df = pd.read_csv(agg_precision_filename)
        print("\nLoaded data from '%s'." % agg_precision_filename)
        selected_attrs = pd.read_csv('selected_attrs.csv')
        if (attr in list(selected_attrs['attribute'])):
            row = selected_attrs[selected_attrs['attribute'] == attr].iloc[0]
            num_true_in_test = row['freq'] - num_train_each
            num_test = row['totalKnown'] - 2 * num_train_each
        else:
            ind = a.get_attribute_indicator(attr, attr_type)
            num_true_in_test = len(ind[ind == 1]) - num_train_each
            num_test = ind.count() - 2 * num_train_each

    except OSError:
        print("\nLoading attribute data...")
        timeit(a.load_data)()
        a.make_joint_attr_embedding_matrix(attr_type, sim = sim, embedding = embedding, delta = delta, tau = tau, k = k, sphere = 2 if sphere else 0)

        # get attribute indicator for all the nodes
        attr_indicator = a.get_attribute_indicator(attr, attr_type)

        # prepare the classifiers
        rfc = RandomForestClassifier(n_estimators = num_rf_trees, n_jobs = jobs)
        boost = AdaBoostClassifier(n_estimators = num_boost_trees)
        logreg = LogisticRegression(n_jobs = jobs)
        gnb = GaussianNB()
        rfc_precision_df = pd.DataFrame(columns = range(num_samples))
        boost_precision_df = pd.DataFrame(columns = range(num_samples))
        logreg_precision_df = pd.DataFrame(columns = range(num_samples))
        gnb_precision_df = pd.DataFrame(columns = range(num_samples))

        # maintain top nominee attributes dictionary
        top_attrs = defaultdict(float)

        for s in range(num_samples):
            print("\nSEED = %d" % s)
            np.random.seed(s)
            print("\nObtaining feature vectors for random training and test sets...")
            ((train_in, train_out), (test_in, test_out)) = timeit(a.get_joint_PMI_training_and_test)(attr, attr_type, num_train_each)

            # train and predict
            print("\nTraining %d random forest trees..." % num_rf_trees)
            timeit(rfc.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_rfc = timeit(rfc.predict_proba)(test_in)[:, 1]

            print("\nTraining %d AdaBoost trees..." % num_boost_trees)
            timeit(boost.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_boost = timeit(boost.predict_proba)(test_in)[:, 1]

            print("\nTraining logistic regression...")
            timeit(logreg.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_logreg = timeit(logreg.predict_proba)(test_in)[:, 1]

            print("\nTraining Naive Bayes...")
            timeit(gnb.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_gnb = timeit(gnb.predict_proba)(test_in)[:, 1]            

            test_df = pd.DataFrame(columns = ['test', 'probs_rfc', 'probs_boost', 'probs_logreg', 'probs_gnb'])
            test_df['test'] = test_out
            test_df['probs_rfc'] = probs_rfc
            test_df['probs_boost'] = probs_boost
            test_df['probs_logreg'] = probs_logreg
            test_df['probs_gnb'] = probs_gnb

            # do vertex nomination
            test_df = test_df.sort_values(by = 'probs_rfc', ascending = False)
            rfc_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)
            test_df = test_df.sort_values(by = 'probs_boost', ascending = False)
            boost_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)
            test_df = test_df.sort_values(by = 'probs_logreg', ascending = False)
            logreg_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)
            test_df = test_df.sort_values(by = 'probs_gnb', ascending = False)
            gnb_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)

            # determine top attributes
            best_i, best_prec = -1, -1.0
            for (i, prec_series) in enumerate([rfc_precision_df[s], boost_precision_df[s], logreg_precision_df[s], gnb_precision_df[s]]):
                if (prec_series[topN_nominees - 1] > best_prec):
                    best_i, best_prec = i, prec_series[topN_nominees - 1]
            # sort by the best-performing classifier found above
            test_df = test_df.sort_values(by = 'probs_%s' % classifiers[best_i], ascending = False)
            for node in test_df.index[:topN_nominees]:
                attrs = a.attrs_by_node_by_type[attr_type][node]
                for at in attrs:
                    top_attrs[at] += 1.0 / len(attrs)  # divide the vote equally among all attributes

            sys.stdout.flush()  # flush the output buffer

        # compute means and standard errors over all the samples
        agg_precision_df = pd.DataFrame(columns = ['mean_rfc_prec', 'stderr_rfc_prec', 'mean_boost_prec', 'stderr_boost_prec', 'mean_logreg_prec', 'stderr_logreg_prec', 'mean_gnb_prec', 'stderr_gnb_prec', 'max_mean_prec'])
        agg_precision_df['mean_rfc_prec'] = rfc_precision_df.mean(axis = 1)
        agg_precision_df['stderr_rfc_prec'] = rfc_precision_df.std(axis = 1) / sqrt_samples
        agg_precision_df['mean_boost_prec'] = boost_precision_df.mean(axis = 1)
        agg_precision_df['stderr_boost_prec'] = boost_precision_df.std(axis = 1) / sqrt_samples
        agg_precision_df['mean_logreg_prec'] = logreg_precision_df.mean(axis = 1)
        agg_precision_df['stderr_logreg_prec'] = logreg_precision_df.std(axis = 1) / sqrt_samples
        agg_precision_df['mean_gnb_prec'] = gnb_precision_df.mean(axis = 1)
        agg_precision_df['stderr_gnb_prec'] = gnb_precision_df.std(axis = 1) / sqrt_samples
        agg_precision_df['max_mean_prec'] = agg_precision_df[['mean_rfc_prec', 'mean_boost_prec', 'mean_logreg_prec', 'mean_gnb_prec']].max(axis = 1)

        # save the aggregate data frames
        N_save = min(len(test_out), topN_save)
        agg_precision_df = agg_precision_df[:N_save]

        agg_precision_df.to_csv(agg_precision_filename, index = False)

        top_attrs_df = pd.DataFrame(list(top_attrs.items()), columns = ['attribute', 'voteProportion'])
        top_attrs_df = top_attrs_df.set_index('attribute')
        top_attrs_df['voteProportion'] /= top_attrs_df['voteProportion'].sum()
        top_attrs_df = top_attrs_df.sort_values(by = 'voteProportion', ascending = False)
        open(top_attrs_filename, 'w').write(str(top_attrs_df))

        num_true_in_test = test_out.sum()
        num_test = len(test_out)

    # plot the nomination precision 
    if save_plot:
        N_plot = min(len(agg_precision_df), topN_plot)
        plt.fill_between(agg_precision_df.index, agg_precision_df['mean_rfc_prec'] - 2 * agg_precision_df['stderr_rfc_prec'], agg_precision_df['mean_rfc_prec'] + 2 * agg_precision_df['stderr_rfc_prec'], color = 'green', alpha = 0.25)
        rfc_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_rfc_prec'], color = 'green', linewidth = 2, label = 'Random Forest')
        plt.fill_between(agg_precision_df.index, agg_precision_df['mean_boost_prec'] - 2 * agg_precision_df['stderr_boost_prec'], agg_precision_df['mean_boost_prec'] + 2 * agg_precision_df['stderr_boost_prec'], color = 'blue', alpha = 0.25)
        boost_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_boost_prec'], color = 'blue', linewidth = 2, label = 'AdaBoost')
        plt.fill_between(agg_precision_df.index, agg_precision_df['mean_logreg_prec'] - 2 * agg_precision_df['stderr_logreg_prec'], agg_precision_df['mean_logreg_prec'] + 2 * agg_precision_df['stderr_logreg_prec'], color = 'red', alpha = 0.25)
        logreg_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_logreg_prec'], color = 'red', linewidth = 2, label = 'Logistic Regression')
        plt.fill_between(agg_precision_df.index, agg_precision_df['mean_gnb_prec'] - 2 * agg_precision_df['stderr_gnb_prec'], agg_precision_df['mean_gnb_prec'] + 2 * agg_precision_df['stderr_gnb_prec'], color = 'orange', alpha = 0.25)
        gnb_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_gnb_prec'], color = 'orange', linewidth = 2, label = 'Naive Bayes')

        guess_rate = num_true_in_test / num_test
        guess, = plt.plot([guess_rate for i in range(N_plot)], linestyle = 'dashed', linewidth = 2, color = 'black', label = 'Guess')
        plt.xlabel('rank')
        plt.ylabel('precision')
        plt.xlim((0.0, N_plot))
        plt.ylim((0.0, 1.0))
        plt.title('Vertex Nomination Precision')
        plt.legend(handles = [rfc_plot, boost_plot, logreg_plot, gnb_plot, guess])
        plt.savefig(plot_filename)

    print("\nDone!")
robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(X)
y=df[target_name]
for i in range(10,55):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=i/100, random_state=123, stratify=y)
    print(i/10)

    # Check accuracy of base rate model
    y_base_rate = base_rate_model(X_test)
    from sklearn.metrics import accuracy_score
    acc_score=accuracy_score(y_test, y_base_rate)
    print("Base rate accuracy is %2.2f" % acc_score)

    # Check accuracy of Logistic Model
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2', C=1)

    model.fit(X_train, y_train)
    print ("Logistic accuracy is %2.2f" % accuracy_score(y_test, model.predict(X_test)))


    # Using 10 fold Cross-Validation to train our Logistic Regression Model
    from sklearn import model_selection
    from sklearn.linear_model import LogisticRegression
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)  # shuffle=True is required when random_state is set
    modelCV = LogisticRegression(class_weight = "balanced")
    scoring = 'roc_auc'
    results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
    print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))
    print("Model Updated")
Example #9
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Load the iris dataset (1936!)- https://archive.ics.uci.edu/ml/datasets/iris
# 150 samples for 3 different types of irises (Setosa, Versicolour and Virginica)
# The rows are the samples and the columns are: Sepal Length, Sepal Width, Petal Length and Petal Width.
dataset = datasets.load_iris()

print(dataset.data.shape)
print(dataset.data[:10])
print(dataset.target.shape)
print(dataset.target[:10])

# Fit a logistic regression model to the data
model = LogisticRegression(solver='liblinear', multi_class='auto')
model.fit(dataset.data, dataset.target)

# Save model for future use
# (sklearn.externals.joblib was removed in newer scikit-learn; import joblib directly)
import joblib
joblib.dump(model, 'irismodel.pkl')

# Make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# Display metrics
# Precision measures the impact of false positives: TP/(TP+FP)
# Recall measures the impact of false negatives : TP/(TP+FN)
# F1 is the weighted average of precision and recall: (2*Recall*Precision)/(Recall+Precision)
print(metrics.classification_report(expected, predicted))
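# A quick sanity check of the formulas in the comments above, assuming the
# per-class counts are read off the confusion matrix (illustrative sketch only):
import numpy as np
cm = metrics.confusion_matrix(expected, predicted)
for i in range(cm.shape[0]):
    tp = cm[i, i]                 # true positives for class i
    fp = cm[:, i].sum() - tp      # false positives: predicted i, actually something else
    fn = cm[i, :].sum() - tp      # false negatives: actually i, predicted something else
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * recall * precision / (recall + precision)
    print("class %d: precision=%.2f recall=%.2f f1=%.2f" % (i, precision, recall, f1))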
################# matplotlib Korean font setup #####################
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
####################################################################

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X, y = mglearn.datasets.make_forge()

fig, axes = plt.subplots(1, 2, figsize=(10, 3))

for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf,
                                    X,
                                    fill=False,
                                    eps=0.5,
                                    ax=ax,
                                    alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("특성 0")
    ax.set_ylabel("특성 1")
axes[0].legend()

plt.show()
Example #11
# knn
model_knn = KNeighborsClassifier(n_neighbors=10)
model_knn.fit(train_vectors, training_task_a_labels_list)
prediction_knn = model_knn.predict(test_vectors)

print("\nClassification report for K Nearest Neighbour")
print(classification_report(test_task_a_labels_list, prediction_knn))
accuracy_knn = round(
    accuracy_score(test_task_a_labels_list, prediction_knn) * 100, 2)
print("Accuracy (KNN) = " + str(accuracy_knn) + " %")
print("\nConfusion Matrix (KNN)")
cf_matrix_knn = confusion_matrix(test_task_a_labels_list, prediction_knn)
print(cf_matrix_knn)

# logistic regression
model_lr = LogisticRegression()
model_lr.fit(train_vectors, training_task_a_labels_list)
prediction_lr = model_lr.predict(test_vectors)

print("\nClassification report for Logistic Regression")
print(classification_report(test_task_a_labels_list, prediction_lr))
accuracy_lr = round(
    accuracy_score(test_task_a_labels_list, prediction_lr) * 100, 2)
print("Accuracy (LR)= " + str(accuracy_lr) + " %")
print("\nConfusion Matrix (LR)")
cf_matrix_lr = confusion_matrix(test_task_a_labels_list, prediction_lr)
print(cf_matrix_lr)

labels = ['NOT', 'OFF']

# graph plot svm
Example #12
# sklearn.cross_validation was removed; these now live in sklearn.model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from time import time
import numpy as np

smote = SMOTE(kind='borderline1', ratio='auto', k=10)  # arguments follow the older imbalanced-learn API used here

param_grid = [{
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100],
    'class_weight': ['balanced'],
    'max_iter': [100, 200, 500, 800, 1000],
    'solver': ['liblinear', 'newton-cg', 'lbfgs'],
    'multi_class': ['ovr'],
    'tol': [1e-4, 1e-3, 1e-2]
}]

clf = LogisticRegression()
start = time()

f1_scorer = make_scorer(f1_score)

# Oversampling
X_tr, X_te, y_tr, y_te = train_test_split(df_reduced_train.values,
                                          y_train,
                                          test_size=0.3,
                                          stratify=y_train)
X_tr, y_tr = smote.fit_sample(X_tr, y_tr)  # fit_resample() in newer imbalanced-learn releases

gs = GridSearchCV(clf, param_grid, scoring=f1_scorer, n_jobs=-1)
gs.fit(X_tr, y_tr)

y_pred = gs.predict(X_te)
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9643563265868098
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    LogisticRegression(C=0.001, dual=False, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #14
clf_ext = ExtraTreesClassifier(
    n_estimators=240,
    max_depth=15,
    min_samples_split=0.03,
    min_samples_leaf=5,
    max_features=24,
    class_weight='balanced',
    n_jobs=4,
    bootstrap=True,
    oob_score=True,
)

# meta_classifier as logistic regression
lr_stack = LogisticRegression(class_weight='balanced',
                              solver='sag',
                              max_iter=10000,
                              n_jobs=4,
                              verbose=2)

xgb_stack = xgb.XGBClassifier(learning_rate=0.1,
                              n_estimators=600,
                              max_depth=5,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              scale_pos_weight=1,
                              n_jobs=4)


def train_SVM(estimator, trainX, trainY, method, n_jobs=4, skip=False):
    # SVM
    logger = misc.init_logger(method)
Example #15
import pandas as pd
from create_dataset import df
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

X= df['data']
y = df['labels']

# X_train,X_test,y_train,y_test = train_test_split(X,pd.get_dummies(y),random_state=2)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2)

model_names = ['LogisticRegression()']
pattern = '[A-Za-z0-9]+(?=\\s+)'
for model in model_names:
    """
    TO-DOs
    1) convert labels to numbers using pd.get_dummies() ?
    2) convert text data to numeric features.
    3) Add multiple models
    """
    pl = Pipeline([
        ('vectorizer',CountVectorizer()),
        ('clf',LogisticRegression())
    ])

    pl.fit(X_train,y_train)

    accuracy = pl.score(X_test,y_test)
    print("Acuuracy for {} is {}".format(model,accuracy))
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([('vectorizer', vectorizer), ('lr', lr)])
Example #17
#Encoding the categorical variables
#for SVM classification
train_y_svm = train_y
test_y_svm = test_y
#for other types of classification
train_y = pd.get_dummies(train_y)
# train_y_binary=pd.get_dummies(train_y_binary)
# train_y_binary=train_y_svm['benign']
test_y = pd.get_dummies(test_y)
# test_y_binary_num=pd.get_dummies(test_y_binary)

#Applying Logistic regression for multiclass classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
ovr_clf = OneVsRestClassifier(LogisticRegression())
ovr_clf.fit(train_x, train_y)
ovr_predicted = ovr_clf.predict(test_x)

from sklearn.metrics import confusion_matrix, precision_score, f1_score
ovr_confusion_matrix = confusion_matrix(test_y.values.argmax(axis=1),
                                        ovr_predicted.argmax(axis=1))
precision_ovr = precision_score(
    test_y, ovr_predicted, average='micro')  #gives 95.76 percent of accuracy
precision_ovr_all = precision_score(test_y, ovr_predicted,
                                    average=None)  #gives 94.98,96.52
f1_lgd = f1_score(test_y, ovr_predicted,
                  average='micro')  #gives 95.76 percent of accuracy

#Applying SVM for multiclass classification
from sklearn.svm import LinearSVC
Example #18
#print(len(X_test))
#knnn.fit(X_train, y_train)
#y_pred3 = knnn.predict(X_test)
#accuracy3 = ((y_test==y_pred3).sum()/len(y_test)*100)
#print('accuracy %.2f' % accuracy3)
#print("---------Logestic Refression---------")
#logreg = LogisticRegression(multi_class='auto')
#logreg.fit(X_train, y_train)
#y_pred4 = logreg.predict(X_test)
#y_pred_lr_prob = logreg.predict_log_proba(X_test)
#print(y_pred_lr_prob.shape)
#print(y_pred_lr_prob)
#accuracy4 = ((y_test==y_pred4).sum()/len(y_test)*100)

	print("---------Logestic Regression---------")
	logreg = LogisticRegression(multi_class='auto')
	parameter_grid = { 'tol': [0.0001, 0.001, 0.01, 0.1,1.0],'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter': [50,100,200,500,1000,200,5000,10000]}
	gs = GridSearchCV(logreg, param_grid = parameter_grid,cv = 3)
	gs.fit(X_train, y_train)

	clf = gs.best_estimator_
	clf.fit(X_train, y_train)
	print('Train accuracy: %.3f' % gs.best_score_)
	print('Best Parameter: ', gs.best_params_)
	print('Test accuracy: %.3f' % clf.score(X_test, y_test))
	print("--------- SVM ---------")
	svc = svm.SVC(kernel="linear", random_state=1, C=1)
	parameter_grid = [{'C': [0.0001, 0.001, 0.01, 0.1,1.0, 10.0, 50,100,200,500,1000], 'gamma': [0.0001, 0.001, 0.01, 0.1,1.0, 10.0, 50,100,200,500,1000],'kernel': ['rbf']}, {'C': [0.0001, 0.001, 0.01, 0.1,1.0, 10.0, 50,100,200,500,1000],'kernel': ['linear']}]
	gs = GridSearchCV(svc, param_grid = parameter_grid,cv = 3)
	gs.fit(X_train, y_train)
"""KNN Classifier"""

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski', p=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score    
Accuracy=accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(Accuracy*100,'%')

"""Logistic Regression"""

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score    
Accuracy=accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(Accuracy*100,'%')

"""Support Vector Machine"""

from sklearn.svm import SVC
classifier = SVC(kernel = 'linear')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score    
Accuracy=accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(Accuracy*100,'%')
Example #20
def estimate(estimator, name, cv=5):
    # (wrapper reconstructed from context: the original excerpt starts mid-function;
    #  auc is assumed to come from a cross-validated 'roc_auc' score)
    auc = cross_val_score(estimator, X_train, y_train, scoring='roc_auc', cv=cv).mean()
    accuracy = cross_val_score(estimator, X_train, y_train, scoring='accuracy', cv=cv).mean()
    recall = cross_val_score(estimator, X_train, y_train, scoring='recall', cv=cv).mean()

    print("{}: auc:{:f}, recall:{:f}, accuracy:{:f}".format(name, auc, recall, accuracy))

#     skplt.plot_learning_curve(estimator, X_train, y_train)
#     plt.show()

#     estimator.fit(X_train, y_train)
#     y_probas = estimator.predict_proba(X_train)
#     skplt.plot_roc_curve(y_true=y_train, y_probas=y_probas)
#     plt.show()

estimate(XGBClassifier(learning_rate=0.1, n_estimators=20, objective='binary:logistic'), 'XGBClassifier')
estimate(RidgeClassifier(), 'RidgeClassifier')
estimate(LogisticRegression(), 'LogisticRegression')
# estimate(RandomForestClassifier(), 'RandomForestClassifier')
estimate(AdaBoostClassifier(), 'AdaBoostClassifier')
# estimate(SVC(), 'SVC')# too long to wait
# estimate(LinearSVC(), 'LinearSVC')

# XGBClassifier: auc:0.747668, recall:0.000000, accuracy:0.944575
# RidgeClassifier: auc:0.754218, recall:0.000000, accuracy:0.944433
# LogisticRegression: auc:0.758454, recall:0.015424, accuracy:0.942010
# AdaBoostClassifier: auc:0.784086, recall:0.013495, accuracy:0.943791

from sklearn.ensemble import VotingClassifier

estimators = []
# estimators.append(('RidgeClassifier', RidgeClassifier()))
estimators.append(('LogisticRegression', LogisticRegression()))
#To try: XG Boost or boosting - tree tech.
#"""
# Compare Algorithms
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
#rename data
X=attribute
Y=drugs.iloc[:,:17]
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
count=0      
#n=0     
#val=[]
Example #22
#        c40 += 1

    x.append(list(dict['BlogVector']))
    y.append(age)

#print c13, c20, c30, c40

x_train = x[:16000]
y_train = y[:16000]
x_test = x[16000:]
y_test = y[16000:]

#print len(x)
#print len(y)

lr1_age_clf = LogisticRegression()

lr1_age_clf.fit(x_train, y_train)

blog = ""

print lr1_age_clf.predict([blog])

#y_pred = lr1_age_clf.predict(x_test)

from sklearn import metrics

print "\nLogistic Regression Accuracy: ", lr1_age_clf.score(x_test, y_test)
print "\nConfusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred, labels=['teens', 'twenties', 'thirties', 'forties'])

print "\nClassification Report:\n", metrics.classification_report(y_test, y_pred, labels=['teens', 'twenties', 'thirties', 'forties'])
Example #23
def main():
    parser = argparse.ArgumentParser()
    ##required parameter
    parser.add_argument(
        "--memo",
        default='running Li model with data using adv training ',
        type=str,
        required=False)
    parser.add_argument("--model_name",
                        type=str,
                        default='bert_Li',
                        required=False)
    parser.add_argument("--data_dir",
                        type=str,
                        default="./data/data_set",
                        required=False)
    parser.add_argument("--out_dir",
                        type=str,
                        default="./Limodel_roberta_adv",
                        required=False)
    parser.add_argument("--pretrained_model_path",
                        type=str,
                        default="./chinese_roberta_small",
                        required=False)
    parser.add_argument("--to_resume_model",
                        type=bool,
                        default=False,
                        required=False)
    parser.add_argument("--resume_model_path",
                        type=str,
                        default="./Limodel_roberta_adv/pytorch_model.bin",
                        required=False)
    parser.add_argument("--num_labels", type=int, default=3, required=False)
    ##other parameters
    parser.add_argument("--output_hidden_states",
                        type=bool,
                        default=True,
                        required=False)
    parser.add_argument("--do_kfold",
                        action="store_true",
                        default=False,
                        required=False)
    parser.add_argument("--do_ensemble",
                        action="store_true",
                        default=False,
                        required=False)
    parser.add_argument("--do_train",
                        action="store_true",
                        default=True,
                        required=False)
    parser.add_argument("--do_eval",
                        action="store_true",
                        default=True,
                        required=False)
    parser.add_argument("--test",
                        action="store_true",
                        default=True,
                        required=False)
    parser.add_argument("--folds", type=int, default=5, required=False)
    parser.add_argument("--epochs", type=int, default=5, required=False)
    parser.add_argument("--weight",
                        default=[1.0, 1.0, 1.0],
                        type=list,
                        required=False,
                        help='the weight for crossentropy')
    parser.add_argument(
        "--ensemble_models",
        default=["bert_SPRNN", "bert_RCNN", "bert_RNN", "bert_CNN", "bert_Li"],
        required=False,
        help='the ensemble model names')
    parser.add_argument("--weight_list",
                        default=[[1.0, 1.0, 1.0], [2.0, 1.0, 1.0],
                                 [2.0, 2.0, 1.0], [4.0, 2.0, 1.0],
                                 [1.0, 2.0, 1.0]],
                        type=list,
                        required=False,
                        help='weight list used in the ensemble mode')
    parser.add_argument("--batch_size", type=int, default=32, required=False)
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        required=False)
    parser.add_argument("--max_seq_len", type=int, default=512, required=False)
    parser.add_argument("--title_seq_len",
                        type=int,
                        default=32,
                        required=False)
    parser.add_argument("--content_seq_len",
                        type=int,
                        default=512,
                        required=False)
    parser.add_argument("--no_cuda",
                        default=False,
                        action="store_true",
                        required=False)
    parser.add_argument("--log_dir", default=None, type=str, required=False)
    parser.add_argument("--dev_loss", default=0, type=float, required=False)
    parser.add_argument("--seed", default=42, type=int, required=False)
    parser.add_argument("--do_lower_case",
                        action="store_true",
                        default=False,
                        required=False)
    parser.add_argument("--optimize_steps",
                        type=int,
                        default=20000,
                        required=False)
    parser.add_argument("--learning_rate",
                        default=5e-6,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--weight_decay",
                        default=1e-3,
                        type=float,
                        help="L2 regularization.")
    # parser.add_argument("--max_grad_norm", default=1.0, type=float,
    #                     help="Max gradient norm.")
    # parser.add_argument("--num_train_epochs", default=12.0, type=float,
    #                     help="Total number of training epochs to perform.")
    # parser.add_argument("--max_steps", default=-1, type=int,
    #                     help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps",
                        default=200,
                        type=int,
                        required=False,
                        help="")
    parser.add_argument("--lstm_hidden_size", default=512, type=int, help="")
    parser.add_argument("--lstm_layers", default=1, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")
    parser.add_argument("--linear_hidden_size",
                        default=1024,
                        type=float,
                        help="")
    parser.add_argument("--kernel_sizes",
                        default=[2, 3, 4, 5],
                        type=list,
                        help="set the kernel sizes for cnn model")
    parser.add_argument("--out_channels",
                        default=256,
                        type=int,
                        help="set the out channel for cnn model")
    parser.add_argument("--kmax",
                        default=2,
                        type=float,
                        help="set the features from kmax")
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list:"
    )

    # parser.add_argument("--report_steps", default=-1, type=int,
    #                     help="")
    # parser.add_argument("--warmup_steps", default=0, type=int,
    #                     help="Linear warmup over warmup_steps.")
    # parser.add_argument("--split_num", default=3, type=int,
    #                     help="text split")
    # parser.add_argument('--logging_steps', type=int, default=50,
    #                     help="Log every X updates steps.")
    args = parser.parse_args()
    args.n_gpus = torch.cuda.device_count()
    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)  #prepare output directory
    if args.do_ensemble:
        stacker = LogisticRegression(random_state=0,
                                     solver='lbfgs',
                                     multi_class='multinomial',
                                     class_weight={
                                         0: 5.0,
                                         1: 1.0,
                                         2: 1.0
                                     })
        stacking_models(args, stacker)
        return
    if args.do_kfold:
        kfold_train(args)
        return
    if args.do_train:
        train(args)
        args.do_train = False  ###
    if args.do_eval:
        predict(args, is_eval=True)  ##output the dev set result
    if args.test:
        predict(args)
previsores[:, 8] = labelencoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelencoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelencoder_previsores.fit_transform(previsores[:, 13])

onehotencoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],   # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                                         # Leave the rest of the columns untouched
)#OneHotEncoder(categorical_features = [1,3,5,6,7,8,9,13])
previsores = onehotencoder.fit_transform(previsores).toarray()

labelencoder_classe = LabelEncoder()
classe = labelencoder_classe.fit_transform(classe)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(previsores, classe, test_size=0.15, random_state=0)

from sklearn.linear_model import LogisticRegression
classificador = LogisticRegression()
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)

import collections
collections.Counter(classe_teste)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

target = [1 if i < 12500 else 0 for i in range(25000)]
# split the train set into training data and validation data
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75
)

accuracy = []
regularization = [0.01, 0.05, 0.25, 0.5, 1]
for c in regularization:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    accuracy.append(accuracy_score(y_val, lr.predict(X_val)))
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))


# train final model with highest c
highestC = [regularization[i] for i in range(len(regularization)) if accuracy[i] == max(accuracy)][0]
final_model = LogisticRegression(C = highestC)
final_model.fit(X, target)
y_pred = final_model.predict(X_test)
print("Final Accuracy: %s" % accuracy_score(target, y_pred))
print(str(highestC))
accuracy.append(accuracy_score(target, final_model.predict(X_test)))
regularization.append('testError C:' + str(highestC))
Example #26
def return_model(mode, **kwargs):
    
    
    if inspect.isclass(mode):
        assert getattr(mode, 'fit', None) is not None, 'Custom model family should have a fit() method'
        model = mode(**kwargs)
    elif mode=='logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver, n_jobs=n_jobs, 
                                 max_iter=max_iter, random_state=666)
    elif mode=='Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode=='RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode=='LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode=='GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode=='KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode=='NB':
        model = MultinomialNB()
    elif mode=='linear':
        model = LinearRegression()  # LinearRegression has no random_state parameter
    elif mode=='ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping= kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode=='conv':
            model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter,
                          learning_rate=learning_rate, 
                          weight_decay=weight_decay, validation_fraction=validation_fraction,
                          early_stopping=early_stopping,
                         optimizer=optimizer, warm_start=warm_start, address=address,
                          hidden_units=hidden_units,
                          strides=strides, global_averaging=global_averaging,
                         kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
        elif mode=='conv_reg':
            model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter,
                          learning_rate=learning_rate, 
                          weight_decay=weight_decay, validation_fraction=validation_fraction,
                          early_stopping=early_stopping,
                         optimizer=optimizer, warm_start=warm_start, address=address,
                          hidden_units=hidden_units,
                          strides=strides, global_averaging=global_averaging,
                         kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping= kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode=='NN':
            model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                activation=activation, learning_rate_init=learning_rate_init,
                                warm_start = warm_start, max_iter=max_iter,
                                early_stopping=early_stopping)
        if mode=='NN_reg':
            model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                activation=activation, learning_rate_init=learning_rate_init,
                                warm_start = warm_start, max_iter=max_iter, early_stopping=early_stopping)
    else:
        raise ValueError("Invalid mode!")
    return model
Example #27
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score

# Classifiers

# 1 DecisionTreeClassifier
clf_tree = tree.DecisionTreeClassifier()

# 2 KNeighborsClassifier
clf_neigh = KNeighborsClassifier()

# 3 LogisticRegression
clf_logReg = LogisticRegression()

# 4 NaiveBayes
clf_gnb = GaussianNB()

# 5 SupportVectorMachine (SVM)
clf_svm = svm.SVC()

# Data set [height, weight, shoe size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]

Y = [
    'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
    'female', 'male', 'male'
]
X = dataset.iloc[:,[2,3]]
# now we need the array of dependent variables
Y = dataset.iloc[:,4]
# split the dataset into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test =train_test_split(X,Y,test_size=0.25,random_state=0)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train=sc_X.fit_transform(X_train)
X_test=sc_X.transform(X_test)

# logistic regression on the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train,Y_train)

# predict the test-set results
y_pred = classifier.predict(X_test)

# build the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)

# visualize the results on the test set
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, Y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
X, Y = data[list(range(4))], data[4]
# Encode the text labels, e.g. a, b, c become 0, 1, 2; the original value for each
# code can be recovered via pd.Categorical(Y).categories
Y = pd.Categorical(Y).codes
X = data[[0,1]]  # keep only the first and second columns
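# As the comment above notes, the integer codes can be mapped back to the original
# label strings; a quick illustrative check (variable names as in this snippet):
categories = pd.Categorical(data[4]).categories   # original label values, in code order
print(dict(enumerate(categories)))                # e.g. {0: 'Iris-setosa', ...}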


# 3. Split the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=0)


# 4. Build and train the models
# 4.1: SVM classifier
svm = SVC(C=1, kernel='linear')
svm.fit(X_train, Y_train)
# 4.2: LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, Y_train)
# 4.3: RidgeClassifier
rc = RidgeClassifier()
rc.fit(X_train, Y_train)
# 4.4:KNN
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)


# 5. Model evaluation
svm_score1 = accuracy_score(Y_train, svm.predict(X_train))
svm_score2 = accuracy_score(Y_test, svm.predict(X_test))

lr_score1 = accuracy_score(Y_train, lr.predict(X_train))
lr_score2 = accuracy_score(Y_test, lr.predict(X_test))
Example #30
text.apply(lambda x: len(x.split(' '))).sum()
#####
x_train, x_test, y_train, y_test = train_test_split(text, labels, test_size=0.2, random_state = 42)
x_train.todense()
####
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(x_train, y_train)

y_pred = logreg.predict(x_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=["not CADR", "CADR"]))

# accuracy 0.7647058823529411

#              precision    recall  f1-score   support

#    not CADR       0.66      0.81      0.72        26
#        CADR       0.86      0.74      0.79        42