Code Example #1
    # data = np.loadtxt(path, dtype=float, delimiter=',',
    #                   converters={4: iris_type})

    data = pd.read_csv(path, header=None)
    data[4] = pd.Categorical(data[4]).codes
    # iris_types = data[4].unique()
    # print iris_types
    # for i, type in enumerate(iris_types):
    #     data.set_value(data[4] == type, 4, i)
    x, y = np.split(data.values, (4, ), axis=1)
    # print 'x = \n', x
    # print 'y = \n', y
    # use only the first two feature columns
    x = x[:, :2]
    lr = Pipeline([('sc', StandardScaler()),
                   ('poly', PolynomialFeatures(degree=3)),
                   ('clf', LogisticRegression())])
    lr.fit(x, y.ravel())
    y_hat = lr.predict(x)
    y_hat_prob = lr.predict_proba(x)
    np.set_printoptions(suppress=True)
    print('y_hat = \n', y_hat)
    print('y_hat_prob = \n', y_hat_prob)
    print('Accuracy: %.2f%%' % (100 * np.mean(y_hat == y.ravel())))
    # plot
    N, M = 500, 500  # number of sample points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)  # generate grid sample points
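
The snippet stops right after building the sampling grid. Below is a minimal, self-contained sketch (not part of the original example) of how such a grid is typically pushed through the fitted pipeline and rendered as decision regions; it loads the iris data from scikit-learn instead of the CSV path used above, so the data loading is an assumption.

# Hedged sketch: evaluate the pipeline on every grid point and plot decision regions.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression

x, y = load_iris(return_X_y=True)
x = x[:, :2]                      # keep the first two features, as above
lr = Pipeline([('sc', StandardScaler()),
               ('poly', PolynomialFeatures(degree=3)),
               ('clf', LogisticRegression())])
lr.fit(x, y)

N, M = 500, 500
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
x1, x2 = np.meshgrid(np.linspace(x1_min, x1_max, N),
                     np.linspace(x2_min, x2_max, M))
grid = np.stack((x1.flat, x2.flat), axis=1)    # every grid point as a sample
y_grid = lr.predict(grid).reshape(x1.shape)

plt.pcolormesh(x1, x2, y_grid, cmap='Pastel2', shading='auto')    # predicted regions
plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', cmap='Dark2')  # training points
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.show()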
Code Example #2
File: linear_reg.py  Project: HeewonChae/PoseProject
import machine_learning.utility as utility
import enums
import time
from datetime import datetime


# Load data
any_football_data = load_data(enums.SportsType.Football, enums.CampType.Any)

# Training dataset
train_X = any_football_data.drop(label_columns, axis=1)
train_Y = any_football_data['score']  # match result (win=0, draw=1, loss=2)

# Feature processing
number_pipeline_home = Pipeline([
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", StandardScaler()),
    ])

feature_pipeline = ColumnTransformer([
        ("num", number_pipeline_home, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs_with_camp),
    ])

# Model training
prepared_train = feature_pipeline.fit_transform(train_X)
print(f'prepared_train data_set shape: {prepared_train.shape}')

start_prepared = time.time()

lin_reg = LinearRegression()
lin_reg.fit(prepared_train, train_Y)  # train
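
As a follow-up, here is a hedged, self-contained sketch of the same ColumnTransformer + LinearRegression pattern on a tiny made-up DataFrame; the column names (goals_for, goals_against, camp) are hypothetical stand-ins for the project's real num_attribs and cat_attribs_with_camp, and the RMSE check is an addition, not part of the original script.

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'goals_for': rng.integers(0, 5, 200),       # hypothetical numeric feature
    'goals_against': rng.integers(0, 5, 200),   # hypothetical numeric feature
    'camp': rng.choice(['home', 'away'], 200),  # hypothetical categorical feature
})
y = df['goals_for'] - df['goals_against'] + rng.normal(0, 0.1, 200)

num_attribs = ['goals_for', 'goals_against']
cat_attribs = ['camp']
number_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
])
feature_pipeline = ColumnTransformer([
    ('num', number_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

prepared = feature_pipeline.fit_transform(df)
lin_reg = LinearRegression().fit(prepared, y)
pred = lin_reg.predict(prepared)
print('train RMSE:', np.sqrt(mean_squared_error(y, pred)))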
Code Example #3
scalar = MinMaxScaler()
pca = PCA(svd_solver='randomized', random_state=42)
cv = StratifiedShuffleSplit(y, 100, random_state=42)
print "done"

# GaussianNB
from sklearn.naive_bayes import GaussianNB

naive_clf = GaussianNB()
parameters = {}
naive_clf_grid = GridSearchCV(naive_clf, parameters, cv=cv, scoring='f1')
naive_clf_grid.fit(X, y)
print "before pca f1: ", naive_clf_grid.best_score_

pca_naive_clf = Pipeline([('pca', pca), ('svc', naive_clf)])
parameters = {'pca__n_components': range(1, 5)}
pca_naive_clf_grid = GridSearchCV(pca_naive_clf,
                                  parameters,
                                  cv=cv,
                                  scoring='f1')
pca_naive_clf_grid.fit(X, y)
print "after pca f1: ", pca_naive_clf_grid.best_score_

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
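
The snippet above relies on the pre-0.18 cross_validation-style StratifiedShuffleSplit(y, 100, ...). Below is a hedged sketch of the same before/after-PCA comparison against the current sklearn.model_selection API; the data is synthetic, since nothing is assumed about the project's features.

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=300, n_features=8, random_state=42)
# modern API: y is no longer passed to the constructor
cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=42)

naive_clf = GaussianNB()
naive_grid = GridSearchCV(naive_clf, {}, cv=cv, scoring='f1')
naive_grid.fit(X, y)
print('before pca f1:', naive_grid.best_score_)

pca_naive = Pipeline([('pca', PCA(svd_solver='randomized', random_state=42)),
                      ('nb', naive_clf)])
pca_grid = GridSearchCV(pca_naive, {'pca__n_components': range(1, 5)},
                        cv=cv, scoring='f1')
pca_grid.fit(X, y)
print('after pca f1:', pca_grid.best_score_)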
Code Example #4
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values


# list of attributes for the DataFrameSelector (pandas to numpy)
room_attrib = [
    attr for attr in list(room_data) if not re.search(r'date|Occupancy', attr)
]
print(room_attrib)
pipeline = Pipeline([
    ('selector', DataFrameSelector(room_attrib)),
    ('std_scaler', StandardScaler()),
])

# axis=1 implies column
room_prepared = pipeline.fit_transform(room_data)
print(room_prepared)  # %load train.py
import os as os
import numpy as np
import scipy.io
import scipy.optimize as optimization

N_EPOCH = 1

n_mis = []
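
The DataFrameSelector above is the hand-rolled pandas-to-NumPy selector from older tutorials. Below is a hedged sketch of the equivalent selection-plus-scaling step with ColumnTransformer; the room_data frame and its column names are made up for illustration and are not the real dataset.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

room_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=5, freq='D'),  # hypothetical columns
    'Temperature': [21.0, 21.5, 22.0, 22.3, 21.8],
    'Humidity': [31.0, 30.5, 30.0, 29.8, 30.2],
    'Occupancy': [0, 0, 1, 1, 0],
})
room_attrib = [c for c in room_data.columns if c not in ('date', 'Occupancy')]

preprocess = ColumnTransformer(
    [('std_scaler', StandardScaler(), room_attrib)],
    remainder='drop',   # drops 'date' and 'Occupancy', like the selector above
)
room_prepared = preprocess.fit_transform(room_data)
print(room_prepared)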

Code Example #5
    prediction = clf.predict(predict_me)
    if prediction == y[i]:
        correct += 1

print (float(correct)/float(len(X)))
'''

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import decomposition
from sklearn.pipeline import Pipeline

logistic = LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

X = np.array(df.drop('survived', axis=1))
X = preprocessing.scale(X)
print(X.shape)
y = np.array(df['survived'])
print(y.shape)
clf = pca.fit_transform(X, y)
plt.figure(1, figsize=(5, 5))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
n_components = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
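
The trailing n_components list suggests the usual PCA + LogisticRegression grid search. Below is a self-contained sketch of how that search is typically wired up; the digits data is a stand-in for the Titanic frame used above, so treat it as an illustration only.

from sklearn import decomposition
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = load_digits(return_X_y=True)
pipe = Pipeline(steps=[('pca', decomposition.PCA()),
                       ('logistic', LogisticRegression(max_iter=1000))])
n_components = [5, 10, 20, 30, 40, 50]
estimator = GridSearchCV(pipe, {'pca__n_components': n_components}, cv=3)
estimator.fit(X, y)
print('best n_components:',
      estimator.best_estimator_.named_steps['pca'].n_components)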
Code Example #6
y = digits.target
# Throw away data, to be in the curse of dimension settings
y = y[:200]
X = digits.data[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    # Compute cross-validation score using all CPUs
    this_scores = cross_validation.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
Code Example #7
File: example1.py  Project: ladin157/ML-Learning
# load data
url = os.path.join(os.getcwd(), 'pima-indians-diabetes.data.csv')
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names, sep=',')
# print(dataframe)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# create a feature union
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)

# create a pipeline
estimators = []
# estimators.append(('standardize', StandardScaler()))
# estimators.append(('lda', LinearDiscriminantAnalysis()))
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression()))

model = Pipeline(estimators)

# evaluate pipeline
seed = 7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)  # random_state requires shuffle=True
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
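
For reference, here is a small self-contained check (not in the original) of what the FeatureUnion above produces: the 3 PCA components and the 6 selected columns concatenated side by side. Synthetic data is used because the CSV is not bundled here.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion

X, Y = make_classification(n_samples=100, n_features=8, random_state=7)
union = FeatureUnion([('pca', PCA(n_components=3)),
                      ('select_best', SelectKBest(k=6))])
combined = union.fit_transform(X, Y)
print(combined.shape)   # (100, 9): 3 PCA components + 6 selected columns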
Code Example #8
import views_utils.dbutils as dbutils
sys.path.insert(0, "../../../osa")
from osa.wrapper_sm import SMLogit

import osa.utils as osa

uname = "VIEWSADMIN"
prefix = "postgresql"
db = "views"
port = "5432"
hostname = "VIEWSHOST"
connectstring = dbutils.make_connectstring(prefix, db, uname, hostname, port)

rf_500 = RandomForestClassifier(n_estimators=500, n_jobs=10)
scaler = StandardScaler()
pipe_rf_500 = Pipeline([('scaler', scaler), ('rf', rf_500)])

output_schema = "landed_test"
output_table = "osa_pgm_acled_histonly_fcast_calib_sb"

models = [{
    "dir_pickles":
    "$SNIC_TMP/osa/pickles/osa_pgm_acled_histonly_fcast_calib_sb/pgm_acled_histonly_fcast_calib_logit_fullsample_sb",
    "estimator":
    SMLogit(),
    "features": [
        "l2_ged_dummy_sb", "l3_ged_dummy_sb", "l4_ged_dummy_sb",
        "l5_ged_dummy_sb", "l6_ged_dummy_sb", "l7_ged_dummy_sb",
        "l8_ged_dummy_sb", "l9_ged_dummy_sb", "l10_ged_dummy_sb",
        "l11_ged_dummy_sb", "l12_ged_dummy_sb", "q_1_1_l2_ged_dummy_sb",
        "q_1_1_l3_ged_dummy_sb", "l1_ged_dummy_sb", "l1_ged_dummy_ns",
Code Example #9
4.4 Learning curve
------------------------------------------------------------------------------------------------------------------------
'''
print('---------------------------------------------------------------------------------------------------------------\n'
      '                                         4.4 Learning curve                                                    \n'
      '---------------------------------------------------------------------------------------------------------------\n')
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

for style, width, degree in (("g-", 1, 300), ("b--", 2, 2), ("r-+", 2, 1)):
    polybig_features = PolynomialFeatures(degree=degree, include_bias=False)
    std_scaler = StandardScaler()
    lin_reg = LinearRegression()
    polynomial_regression = Pipeline([
            ("poly_features", polybig_features),
            ("std_scaler", std_scaler),
            ("lin_reg", lin_reg),
        ])
    polynomial_regression.fit(X, y)
    y_newbig = polynomial_regression.predict(X_new)
    plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)

plt.plot(X, y, "b.", linewidth=3)
plt.legend(loc="upper left")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])
save_fig("high_degree_polynomials_plot")
plt.show()
print()
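
The heading says 4.4 Learning curve, but the code shown only plots polynomial fits of different degrees. Below is a hedged sketch of an actual learning curve for the degree-2 pipeline via sklearn.model_selection.learning_curve; X and y are regenerated here as the usual noisy quadratic, since the snippet's data is not shown.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

np.random.seed(42)
X = 6 * np.random.rand(100, 1) - 3
y = 0.5 * X[:, 0] ** 2 + X[:, 0] + 2 + np.random.randn(100)

polynomial_regression = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('std_scaler', StandardScaler()),
    ('lin_reg', LinearRegression()),
])
sizes, train_scores, valid_scores = learning_curve(
    polynomial_regression, X, y, cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='neg_root_mean_squared_error')

plt.plot(sizes, -train_scores.mean(axis=1), 'r-+', label='train RMSE')
plt.plot(sizes, -valid_scores.mean(axis=1), 'b-', label='validation RMSE')
plt.xlabel('training set size')
plt.legend(loc='upper right')
plt.show()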
Code Example #10
 def make():
     return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])
Code Example #11
data_set = DataSet()
data, label, class_names = data_set.get_train_data_set()

indexs = random.sample(range(len(data)), 50000)
data = data[indexs]
label = label[indexs]
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    label,
                                                    test_size=0.33,
                                                    random_state=42)

est = [('count_vect', CountVectorizer()),
       ('tr', TruncatedSVD(n_components=10, n_iter=100, random_state=42)),
       ('clf_DT', DecisionTreeClassifier())]

pipeline_DT = Pipeline(est)

pipeline_DT = pipeline_DT.fit(X_train, y_train)
y_pred = pipeline_DT.predict(X_test)
print("F1 score - DT:",
      f1_score(y_test, pipeline_DT.predict(X_test), average='micro'))
print("Accuracy Score - DT:",
      accuracy_score(y_test, pipeline_DT.predict(X_test)))
cnf_matrix = confusion_matrix(y_test, y_pred)
plt.figure()
plt = plot_confusion_matrix(cnf_matrix,
                            classes=class_names,
                            normalize=True,
                            title='Normalized confusion matrix DT')
plt.show()
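
A possible next step, not in the original, is a small grid search over the SVD dimensionality and the tree depth. The sketch below uses a toy corpus because the project's DataSet class is not available here; the parameter values are illustrative.

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# toy corpus standing in for the project's data
docs = ['the rocket launched into orbit', 'engine and wheels need repair',
        'astronauts aboard the station', 'the car dealer sold a sedan'] * 10
labels = [0, 1, 0, 1] * 10

pipeline_DT = Pipeline([('count_vect', CountVectorizer()),
                        ('tr', TruncatedSVD(random_state=42)),
                        ('clf_DT', DecisionTreeClassifier(random_state=42))])
param_grid = {'tr__n_components': [2, 5],
              'clf_DT__max_depth': [3, None]}
search = GridSearchCV(pipeline_DT, param_grid, cv=3, scoring='f1_micro', n_jobs=-1)
search.fit(docs, labels)
print(search.best_params_, search.best_score_)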
Code Example #12
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_dict_equal(
        pipeline.get_params(deep=True), {
            'steps': pipeline.steps,
            'm2': mult2,
            'm3': None,
            'last': mult5,
            'memory': None,
            'm2__mult': 2,
            'last__mult': 5,
        })

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = [
        'predict_proba', 'predict_log_proba', 'decision_function', 'transform',
        'score'
    ]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError,
                         "'NoneType' object has no attribute 'predict'",
                         getattr, pipeline, 'predict')

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
Code Example #13
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises_regex(
        TypeError, 'Last step of Pipeline should implement fit. '
        '.*NoFit.*', Pipeline, [('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    assert_equal(
        pipe.get_params(deep=True),
        dict(svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    assert_raises_regex(
        TypeError, 'All intermediate steps should be transformers'
        '.*\\bNoTrans\\b.*', Pipeline, [('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that where copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
Code Example #14
plot.title('confusion matrix')
plot.colorbar()
plot.ylabel('expected label')
plot.xlabel('predicted label')
print(classification_report(emails['label'], all_predictions))

#Dividing data set
msg_train, msg_test, label_train, label_test = \
    train_test_split(emails['message'], emails['label'], test_size=0.2)

print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

pipeline = Pipeline([
    ('bow',
     CountVectorizer(analyzer=lemmatize)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier',
     MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

scores = cross_val_score(
    pipeline,  # steps to convert raw emails into models
    msg_train,  # training data
    label_train,  # training labels
    cv=10,  # split data randomly into 10 parts: 9 for training, 1 for scoring
    scoring='accuracy',  # which scoring metric?
    n_jobs=-1,  # -1 = use all cores = faster
)
print(scores)
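
A tuning step commonly follows the cross-validation above. Here is a hedged sketch of such a grid search; a toy corpus and the default tokenizer are used because the original lemmatize analyzer and emails frame are defined elsewhere in the script.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# toy data standing in for msg_train / label_train
msg_train = ['win cash now', 'meeting at noon', 'free prize claim', 'lunch tomorrow?'] * 10
label_train = ['spam', 'ham', 'spam', 'ham'] * 10

pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])
params = {
    'tfidf__use_idf': (True, False),
    'classifier__alpha': (0.1, 1.0),
}
grid = GridSearchCV(pipeline, params, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(msg_train, label_train)
print(grid.best_params_, grid.best_score_)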

Code Example #15
                                           categorical_transformer,
                                           select_features_cat)

# Train model
# TODO try semi supervised learning

# Setting random state forces the classifier to produce the same result in each run
n_cv = 5  # cv=5 is default
scorer = "accuracy"

# model = RandomForestClassifier(random_state=random_state)
model = xgb.XGBClassifier()

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', scaler),
    # ('pca', pca),
    ('model', model)
])

param_grid = {
    # 'preprocessor__num__imputer__strategy': ['mean', 'median'],

    # 'pca__n_components': [5, 15, 30, 45, 64],
    'model__n_estimators': [10, 50, 75, 100, 125, 200, 300],

    # usually max_depth is 6,7,8
    'model__max_depth': list(range(2, 10)),

    # learning rate is around 0.05, but small changes may make big diff
    'model__learning_rate': [0.03, 0.05, 0.07, 0.09, 0.1],
    # 'model__subsample':  list(map(lambda x: x * 0.1, range(1, 10))),
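
The parameter grid above is cut off. Below is a hedged, self-contained sketch of how such a grid is usually handed to GridSearchCV; the preprocessing steps are replaced by a plain StandardScaler because preprocessor and scaler are defined elsewhere, the data is synthetic, and xgboost must be installed for it to run.

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

pipe = Pipeline(steps=[('scaler', StandardScaler()),
                       ('model', xgb.XGBClassifier())])
param_grid = {
    'model__n_estimators': [10, 50, 100],
    'model__max_depth': [2, 4, 6],
    'model__learning_rate': [0.05, 0.1],
}
search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)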
Code Example #16
    def test_pipeline_column_transformer(self):

        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1"
                                              if x > 0.5 else "cat2")
        X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3"
                                               if x > 0.5 else "cat4")
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1, max_iter=10, solver="lbfgs", tol=1e-3)

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])

        categorical_transformer = Pipeline(steps=[
            (
                "onehot",
                OneHotEncoder(sparse=True, handle_unknown="ignore"),
            ),
            (
                "tsvd",
                TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
            ),
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("preprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)
        initial_type = [
            ("numfeat", FloatTensorType([None, 3])),
            ("strfeat", StringTensorType([None, 2])),
        ]

        X_train = X_train[:11]
        model_onnx = convert_sklearn(model, initial_types=initial_type,
                                     target_opset=TARGET_OPSET)

        dump_data_and_model(
            X_train, model, model_onnx,
            basename="SklearnPipelineColumnTransformerPipeliner")

        if __name__ == "__main__":
            from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer

            pydot_graph = GetPydotGraph(
                model_onnx.graph,
                name=model_onnx.graph.name,
                rankdir="TP",
                node_producer=GetOpNodeProducer("docstring"))
            pydot_graph.write_dot("graph.dot")

            import os

            os.system("dot -O -G=300 -Tpng graph.dot")
Code Example #17
print('sizes_neg = ' + str(b))
c = np.median(np.array(sizes_neu))
print('sizes_neu = ' + str(c))
'''

print('#### Preprocessing done')

# build models
print('#### Building models started')

# this is not the best solution, but to simplify model averaging we will
# use CountVectorizer and TfidfTransformer three times on the same data
pipeline_nb = Pipeline([('vect',
                         CountVectorizer(lowercase=False,
                                         max_df=0.8,
                                         max_features=50000,
                                         ngram_range=(1, 3))),
                        ('tfidf', TfidfTransformer()),
                        ('clf', MultinomialNB())])

pipeline_sgd = Pipeline([('vect',
                          CountVectorizer(lowercase=False,
                                          max_df=0.8,
                                          max_features=50000,
                                          ngram_range=(1, 3))),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier())])

pipeline_lr = Pipeline([('vect',
                         CountVectorizer(lowercase=False,
                                         max_df=0.8,
Code Example #18
    def test_pipeline_column_transformer_titanic(self):

        # fit
        try:
            titanic_url = (
                "https://raw.githubusercontent.com/amueller/"
                "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv")
            data = pandas.read_csv(titanic_url)
        except url_error.URLError:
            # Do not fail the test if the data cannot be fetched.
            warnings.warn("Unable to fetch titanic data.")
            return
        X = data.drop("survived", axis=1)
        y = data["survived"]

        # SimpleImputer is not available for string columns in the
        # ONNX-ML specification, so we impute them beforehand.
        for cat in ["embarked", "sex", "pclass"]:
            X[cat].fillna("missing", inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2)

        numeric_features = ["age", "fare"]
        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])

        categorical_features = ["embarked", "sex", "pclass"]
        categorical_transformer = Pipeline(steps=[
            # --- SimpleImputer on string is not available
            # for string in ONNX-ML specifications.
            # ('imputer',
            #  SimpleImputer(strategy='constant', fill_value='missing')),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        clf = Pipeline(steps=[
            ("preprocessor", preprocessor),
            # ("classifier", LogisticRegression(solver="lbfgs")),
        ])

        # inputs

        def convert_dataframe_schema(df, drop=None):
            inputs = []
            for k, v in zip(df.columns, df.dtypes):
                if drop is not None and k in drop:
                    continue
                if v == 'int64':
                    t = Int64TensorType([None, 1])
                elif v == "float64":
                    t = FloatTensorType([None, 1])
                else:
                    t = StringTensorType([None, 1])
                inputs.append((k, t))
            return inputs

        to_drop = {
            "parch",
            "sibsp",
            "cabin",
            "ticket",
            "name",
            "body",
            "home.dest",
            "boat",
        }

        X_train = X_train.copy()
        X_test = X_test.copy()
        X_train['pclass'] = X_train['pclass'].astype(numpy.int64)
        X_test['pclass'] = X_test['pclass'].astype(numpy.int64)
        X_train = X_train.drop(to_drop, axis=1)
        X_test = X_test.drop(to_drop, axis=1)

        # Step 1: without classifier
        clf.fit(X_train, y_train)
        initial_inputs = convert_dataframe_schema(X_train, to_drop)
        model_onnx = convert_sklearn(clf, "pipeline_titanic", initial_inputs,
                                     target_opset=TARGET_OPSET)

        data = X_test
        pred = clf.transform(data)
        data_types = {
            'pclass': numpy.int64,
            'age': numpy.float32,
            'sex': numpy.str_,
            'fare': numpy.float32,
            'embarked': numpy.str_,
        }
        inputs = {k: data[k].values.astype(data_types[k]).reshape(-1, 1)
                  for k in data.columns}
        sess = InferenceSession(model_onnx.SerializeToString())
        run = sess.run(None, inputs)
        got = run[-1]
        assert_almost_equal(pred, got, decimal=5)

        # Step 2: with classifier
        clf = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("classifier", LogisticRegression(solver="lbfgs")),
        ]).fit(X_train, y_train)
        pred = clf.predict_proba(data)
        model_onnx = convert_sklearn(clf, "pipeline_titanic", initial_inputs,
                                     target_opset=TARGET_OPSET,
                                     options={id(clf): {'zipmap': False}})
        sess = InferenceSession(model_onnx.SerializeToString())
        run = sess.run(None, inputs)
        got = run[-1]
        assert_almost_equal(pred, got, decimal=5)
Code Example #19
def parse_newsdata():
	# Parse news json files
	X_buzz, y_buzz, X_poli, y_poli = [], [], [], []
	i = 0
	for dataset in datasets:
		dataset_dir = os.path.join(data_dir, dataset)
		fakenews_dir = os.path.join(dataset_dir, 'FakeNewsContent')
		realnews_dir = os.path.join(dataset_dir, 'RealNewsContent')
		no_realnews = 0
		no_fakenews = 0
		no_articles = 0
		doc_ind = []
		Realnews =  sorted(os.listdir(realnews_dir), key=lambda x:int(x.split('-')[0].split('_')[2]))
		Fakenews =  sorted(os.listdir(fakenews_dir), key=lambda x:int(x.split('-')[0].split('_')[2]))
		print(Realnews)
		print(Fakenews)
		for realnews in Realnews:
			if realnews.split('.')[1] != 'py':
				#with open(os.path.join(fakenews_dir, fakenews), 'r').read() as fd:
				f = open(os.path.join(realnews_dir, realnews), 'r').read()
				doc_ind.append(int(realnews.split('-')[0].split('_')[2])-1)
				if len(f) == 0:
					dummy_text = "no title" + " No text"
					if i == 0:
						X_buzz.append(dummy_text)
						y_buzz.append(0)
					else:
						X_poli.append(dummy_text)
						y_poli.append(0)
					no_realnews += 1
					continue
				data = json.loads(f)
				if i == 0:
					X_buzz.append(data['title'] + data['text'])
					y_buzz.append(0)
				else:
					X_poli.append(data['title'] + data['text'])
					y_poli.append(0)
				no_realnews += 1
		for fakenews in Fakenews:
			if fakenews.split('.')[1] != 'py':
				#with open(os.path.join(fakenews_dir, fakenews), 'r').read() as fd:
				f = open(os.path.join(fakenews_dir, fakenews), 'r').read()
				doc_ind.append(int(fakenews.split('-')[0].split('_')[2])-1)
				if len(f) == 0:
					dummy_text = "no title" + " No text"
					if i == 0:
						X_buzz.append(dummy_text)
						y_buzz.append(1)
					else:
						X_poli.append(dummy_text)
						y_poli.append(1)
					no_fakenews += 1
					continue
				data = json.loads(f)
				if i == 0:
					X_buzz.append(data['title'] + data['text'])
					y_buzz.append(1)
				else:
					X_poli.append(data['title'] + data['text'])
					y_poli.append(1)
				no_fakenews += 1
		no_articles = no_realnews + no_fakenews
		count_vec = CountVectorizer(ngram_range=(1,2), max_features=10000)
		#count_vec = TfidfVectorizer(use_idf=True, smooth_idf=True, ngram_range=(1,2), max_features=10000)
		#svd_model = TruncatedSVD(n_components=25)
		svd_model = NMF(n_components=45, random_state=42)
		svd_transformer = Pipeline([('tfidf', count_vec), ('svd', svd_model)])
		#svd_transformer = Pipeline([('tfidf', count_vec)])
		if i == 0:
			#X_buzz_lsi = np.array(count_vec.fit_transform(X_buzz).todense())
			X_buzz_lsi = svd_transformer.fit_transform(X_buzz)
			print(X_buzz_lsi.shape)
			#pdb.set_trace()
			#X_buzz_lsi = svd_transformer.fit_transform(X_buzz)
			print(X_buzz_lsi)
			#X_buzz_lsi = X_buzz_lsi.todense()
			f = open('buzz_lsi.npy', 'wb')  # np.save needs a binary-mode file
			print(X_buzz_lsi.shape)
			#X1 = np.zeros_like(X_buzz_lsi)
			print(no_articles)
			#pdb.set_trace()
			#for j in xrange(no_articles):
			#	X1[j, :] = X_buzz_lsi[doc_ind[j], :]
			#X_buzz_lsi = X1	
			np.save(f, X_buzz_lsi)
		else:
			#X_poli_lsi = np.array(count_vec.fit_transform(X_poli).todense())
			#print type(X_poli_lsi)
			X_poli_lsi = svd_transformer.fit_transform(X_poli)
			print(X_poli_lsi.shape)
			#X1 = np.zeros_like(X_poli_lsi)
			f = open('poli_lsi.npy', 'wb')  # np.save needs a binary-mode file
			#for j in xrange(no_articles):
			#	X1[j, :] = X_poli_lsi[doc_ind[j], :]	
			#print X1.shape
			#print no_articles
			#X_poli_lsi = X1
			np.save(f, X_poli_lsi)

		i += 1
	#y_buzz = np.array(y_buzz)
	#y_poli = np.array(y_poli)
	return X_buzz_lsi, y_buzz, X_poli_lsi, y_poli
Code Example #20
File: f5.py  Project: xxr225510/feature-selection
    def demo(self):
        import math
        import random
        import numpy as np
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import chi2
        from sklearn.feature_selection import VarianceThreshold
        from numpy import array
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import LinearSVC
        from sklearn.feature_selection import SelectFromModel
        from sklearn.ensemble import ExtraTreesClassifier
        import re
        from sklearn.pipeline import Pipeline
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.preprocessing import PolynomialFeatures
        from sklearn.linear_model import LinearRegression
        from sklearn.linear_model import Perceptron
        import matplotlib
        matplotlib.use('TkAgg')
        import matplotlib.pyplot as plt
        import Tkinter
        import threading
        import matplotlib
        import matplotlib.backends.backend_tkagg

        with open('wpbc1.data', 'r') as open_file:
            wpbc = open_file.read()
            wpbc = wpbc.strip()
            wpbc = re.split('[\n,]', wpbc)
            for i in range(len(wpbc)):
                if wpbc[i] == 'N':
                    wpbc[i] = '0'
                elif wpbc[i] == 'R':
                    wpbc[i] = '1'
                elif wpbc[i] == '?':
                    wpbc[i] = '0'
            wpbc = [wpbc[i:i + 35] for i in range(0, len(wpbc), 35)]
            wpbc = np.array(wpbc, dtype=float)
            X = np.delete(wpbc, [0], axis=1)
            y = wpbc.T[0]

            # feature selection
            # VarianceThreshold
            sel = VarianceThreshold(threshold=1)
            sel.fit(X, y)
            scores1 = sel.variances_
            index1 = np.argsort(scores1)
            n = index1[:-6]
            X_new_1 = np.delete(X, [n], axis=1)

            # SelectKBest
            skb = SelectKBest(chi2, k=3)
            skb.fit(X, y)
            scores2 = skb.scores_
            index2 = np.argsort(scores2)
            n = index2[:-6]
            X_new_2 = np.delete(X, [n], axis=1)

            # L1
            lsvc = LinearSVC(C=0.008, penalty="l1", dual=False)
            lsvc.fit(X, y)
            model = SelectFromModel(lsvc, prefit=True)
            X_new_3 = lsvc.transform(X)
            scores3 = lsvc.coef_
            np.abs(scores3)
            index3 = np.argsort(scores3)

            # tree
            clf = ExtraTreesClassifier()
            clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)
            scores4 = clf.feature_importances_
            index4 = np.argsort(scores4)
            n = index4[:-6]
            X_new_4 = np.delete(X, [n], axis=1)

            # pipline
            clf = Pipeline([
                ('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
                ('classification', RandomForestClassifier())
            ])
            clf.fit(X, y)

            X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_1).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_1, y)
            clf.predict(X_new_1)
            score1 = clf.score(X_new_1, y)
            X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_2).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_2, y)
            clf.predict(X_new_2)
            score2 = clf.score(X_new_2, y)
            X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_3).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_3, y)
            clf.predict(X_new_3)
            score3 = clf.score(X_new_3, y)
            X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_4).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_4, y)
            clf.predict(X_new_4)
            score4 = clf.score(X_new_4, y)
            print(score1, score2, score3, score4)
            # 0.00505050505051 0.00505050505051 0.00505050505051 0.00505050505051



            # plot (remaining plotting code truncated in the original listing)
Code Example #21
    kFolder = KFold(n_splits=N_FOLDS)
    fold_count = 0

    most_frequent_terms = []

    for train_index, test_index in kFolder.split(data):
        print("Memproses Tweet ke {} - {}".format(
            test_index[0] + 1,
            test_index[-1] + 1
        ))

        data_train, target_train = data[train_index], target[train_index]

        bow_pipeline = Pipeline([
            ('count_vectorizer', CountVectorizer(min_df=5, max_df=0.7, )),
            ('tf_idf_transformer', TfidfTransformer())
        ]).fit(data_train)

        pandas.DataFrame(
            bow_pipeline['count_vectorizer'].stop_words_
        ).sort_values(
            [0],
            ignore_index=True
        ).to_excel(
            "./report-extras/effective_stop_words_{}.xlsx".format(
                fold_count
            )
        )

        most_frequent_terms_in_this_fold = pandas.DataFrame(
                bow_pipeline['count_vectorizer'].transform(
Code Example #22
from sklearn.preprocessing import PolynomialFeatures

from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator

# instantiate sklearn regressor objects and a couple of polynomial pipelines
models = {
    "svr": SVR(),
    "kr": KernelRidge(),
    "rf": RandomForestRegressor(),
    "gb": GradientBoostingRegressor(),

    "lr": Pipeline([
        ("poly", PolynomialFeatures(2)),
        ("regressor", LinearRegression()),
        ]),
    "hr": Pipeline([
        ("poly", PolynomialFeatures(2)),
        ("regressor", HuberRegressor())
    ]),
    "ran": Pipeline([
        ("poly", PolynomialFeatures(2)),
        ("regressor", RANSACRegressor())
    ]),

    "gpr": GaussianProcessRegressor(),

    "wei": WeightedCurver(maxfev=100000),
    "sum": SummedCurver(maxfev=2000, method="dogbox"),
}
Code Example #23
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig("MNB_cf"+str(normalize) +  ".png")


pipeline1 = Pipeline([
    ('vect', CountVectorizer(min_df=2, stop_words=text.ENGLISH_STOP_WORDS)),
    ('tfidf', TfidfTransformer()),
])

train_lsi, test_lsi = fetchLSIRepresentation(pipeline1, twenty_train,  twenty_test)


mnb_clf = MultinomialNB()
mnb_clf.fit(train_lsi, train_target_group)
mnb_predicted = mnb_clf.predict(test_lsi)
nmb_predicted_probs = mnb_clf.predict_proba(test_lsi)
print_statistics(test_target_group, mnb_predicted)
fpr, tpr, _ = roc_curve(test_target_group, nmb_predicted_probs[:,1])
plot_roc(fpr, tpr)
cnf_matrix = smet.confusion_matrix(test_target_group, mnb_predicted)
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix without normalization')
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Confusion matrix with normalization' )
Code Example #24
X, y = load_digits(return_X_y=True)
# Throw away data, to be in the curse of dimension settings
X = X[:200]
y = y[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

# #############################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator

transform = SelectPercentile(chi2)

clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])

# #############################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    # Compute cross-validation score using 1 CPU
    this_scores = cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

plt.errorbar(percentiles, score_means, np.array(score_stds))
Code Example #25
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search has been removed

with open('dataset.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    ip = []
    target = []
    count = 1
    for row in reader:
        target.append(row['Sentiment'])
        ip.append(row['SentimentText'])
        count += 1
        if (count == 10000):
            break

pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('logReg', LogisticRegression())])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    #'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet')
    #'clf__n_iter': (10, 50, 80),
    'logReg__max_iter': (10, 50, 100),
    'logReg__class_weight': ('auto', 'balanced')
}
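
The snippet ends after defining the parameter dictionary. Below is a hedged sketch of the step that normally follows; a tiny made-up corpus stands in for the SentimentText rows read from dataset.csv, and the deprecated 'auto' class-weight option is left out.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# toy data standing in for ip / target
ip = ['love this movie', 'worst film ever', 'really great acting', 'utterly boring'] * 10
target = ['1', '0', '1', '0'] * 10

pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('logReg', LogisticRegression())])
parameters = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True, False),
    'logReg__max_iter': (50, 100),
}
grid_search = GridSearchCV(pipeline, parameters, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(ip, target)
print('best score:', grid_search.best_score_)
print('best params:', grid_search.best_params_)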
Code Example #26
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def clean(s):
    translator = str.maketrans("", "", string.punctuation)
    return s.translate(translator)


s = session()
rows = s.query(News).filter(News.label != None).all()
X = [clean(row.title).lower() for row in rows]
y = [row.label for row in rows]

limit = int(len(rows) * 0.7)
X_train, y_train, X_test, y_test = X[:limit], y[:limit], X[limit:], y[limit:]

print('Testing my model...')
my_model = NaiveBayesClassifier(alpha=0.05)
my_model.fit(X_train, y_train)
print(my_model.score(X_test, y_test))

print('Testing sklearn...')
sk_model = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB(alpha=0.05)),
])
sk_model.fit(X_train, y_train)
print(sk_model.score(X_test, y_test))
Code Example #27
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    mpl.rcParams['font.sans-serif'] = u'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark)
    plt.grid(b=True, ls=':')
    plt.xlabel('Component 1', fontsize=14)
    plt.ylabel('Component 2', fontsize=14)
    plt.title('PCA projection of the iris data', fontsize=18)
    # plt.savefig('1.png')
    plt.show()

    x, x_test, y, y_test = train_test_split(x, y, train_size=0.7)
    model = Pipeline([
        ('poly', PolynomialFeatures(degree=2, include_bias=True)),
        ('lr', LogisticRegressionCV(Cs=np.logspace(-3, 4, 8), cv=5, fit_intercept=False))
    ])
    model.fit(x, y)
    print('Optimal parameters:', model.get_params('lr')['lr'].C_)
    y_hat = model.predict(x)
    print('Training set accuracy:', metrics.accuracy_score(y, y_hat))
    y_test_hat = model.predict(x_test)
    print('Test set accuracy:', metrics.accuracy_score(y_test, y_test_hat))

    N, M = 500, 500     # number of sample points along each axis
    x1_min, x1_max = extend(x[:, 0].min(), x[:, 0].max())   # range of column 0
    x2_min, x2_max = extend(x[:, 1].min(), x[:, 1].max())   # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                    # generate grid sample points
    x_show = np.stack((x1.flat, x2.flat), axis=1)   # test points
Code Example #28
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

tree_classification_pipeline = Pipeline([
    ('tree', DecisionTreeClassifier()),

    # Forest instead of Trees
    # ('forest', RandomForestClassifier())
])

ridge_regression_pipeline = Pipeline([
    # Apply scaling to Ridge Regression
    # ('scale', StandardScaler()),
    ('ridge', Ridge())
])

lasso_regression_pipeline = Pipeline([
    # Apply scaling to Lasso Regression
    # ('scale', StandardScaler()),
    ('lasso', Lasso())
])
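
None of the pipelines above are actually run in this snippet. Here is a short, self-contained sketch of exercising one classifier and one regressor pipeline with cross-validation on built-in datasets; the pipelines are re-declared so the block runs on its own.

from sklearn.datasets import load_iris, load_diabetes
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

tree_classification_pipeline = Pipeline([('tree', DecisionTreeClassifier())])
ridge_regression_pipeline = Pipeline([('scale', StandardScaler()), ('ridge', Ridge())])

X_cls, y_cls = load_iris(return_X_y=True)
print('tree accuracy:',
      cross_val_score(tree_classification_pipeline, X_cls, y_cls, cv=5).mean())

X_reg, y_reg = load_diabetes(return_X_y=True)
print('ridge R^2:',
      cross_val_score(ridge_regression_pipeline, X_reg, y_reg, cv=5).mean())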
Code Example #29
X = df.drop('MEDV', axis=1)
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'R^2: {model.score(X_test, y_test)*100:.3}%')  # score() returns R^2 for regressors
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

steps = [('Gradient Boosting Regressor',
          GradientBoostingRegressor(n_estimators=500, max_depth=6))]
model = Pipeline(steps)
model.fit(X_train, y_train)
print('R^2: {:.0f}%'.format(model.score(X_test, y_test) * 100))

dump(model, 'sklearn_model.pkl')
"""# **Keras Models**"""

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
from sklearn.metrics import r2_score

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                    y,
Code Example #30
            'lr__penalty': ['none'],
        }
    ]

    data['param_grid'] = param_grid

    # Define pipeline and cross-validation setup

    pipeline = Pipeline(
        [
            ('pt', SubsetPeaksTransformer(n_peaks=0)),
            ('bv', BinningVectorizer(n_bins=3600, min_bin=2000,
                                     max_bin=20000)),
            ('std', StandardScaler()),
            (
                'lr',
                LogisticRegression(
                    class_weight='balanced',
                    solver='saga'  # supports L_1 and L_2 penalties
                ))
        ],
        memory=os.getenv('TMPDIR', default=None),
    )

    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring='average_precision',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
    )