Example #1
    def test_classification_workflow(self):
        X, y = sklearn.datasets.fetch_openml(data_id=24, as_frame=False, return_X_y=True)
        print(type(X))

        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                     train_size=0.5,
                                                     test_size=0.5)

        X_train = scipy.sparse.csc_matrix(X_train)
        X_test = scipy.sparse.csc_matrix(X_test)

        pipeline = sklearn.pipeline.Pipeline([
            ('shift', CategoryShift()),
            ('imput', SimpleImputer(strategy='constant', fill_value=2)),
            ('ohe', SparseOneHotEncoder()),
            ('tree', DecisionTreeClassifier(random_state=1)),
        ])

        pipeline.fit(X_train, y_train)
        pred_train = pipeline.predict(X_train)
        self.assertTrue((pred_train == y_train).all())
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        pred_test = pipeline.predict(X_test)
        self.assertTrue((pred_test == y_test).all())
Example #2
    def test_classification_workflow(self):
        task = openml.tasks.get_task(254)
        X, y = task.get_X_and_y()

        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                     train_size=0.5,
                                                     test_size=0.5)

        X_train = scipy.sparse.csc_matrix(X_train)
        X_test = scipy.sparse.csc_matrix(X_test)

        pipeline = sklearn.pipeline.Pipeline([
            ('shift', CategoryShift()),
            ('imput', SimpleImputer(strategy='constant', fill_value=2)),
            ('ohe', SparseOneHotEncoder()),
            ('tree', DecisionTreeClassifier(random_state=1)),
        ])

        pipeline.fit(X_train, y_train)
        pred_train = pipeline.predict(X_train)
        self.assertTrue((pred_train == y_train).all())
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        pred_test = pipeline.predict(X_test)
        self.assertTrue((pred_test == y_test).all())
Example #3
def test_pipeline():
    pipeline = dl.Pipeline([("scale", StandardScaler()), ("fdr", SelectFdr()),
                            ("svm", LinearSVC())])

    pipeline = pipeline.fit(X, y)
    y2 = pipeline.predict(X)
    score = pipeline.score(X, y)

    assert isinstance(y2, di.Value)
    assert isinstance(score, di.Value)

    assert isinstance(score.compute(), float)

    assert pipeline.score(X, y).key == pipeline.score(X, y).key
    assert score.compute() == score.compute()

    y22 = y2.compute()
    assert y22.shape == y.shape
    assert y22.dtype == y.dtype
    skpipeline = sklearn.pipeline.Pipeline([("scale", StandardScaler()),
                                            ("fdr", SelectFdr()),
                                            ("svm", LinearSVC())])

    skpipeline.fit(X, y)
    sk_y2 = skpipeline.predict(X)
    sk_score = skpipeline.score(X, y)
    assert sk_score == score.compute()
Example #4
def main(args):
    # Use the MNIST dataset.
    dataset = MNIST(data_size=5000)

    # Split the dataset into a train set and a test set.
    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        dataset.data, dataset.target, test_size=args.test_size, random_state=args.seed)

    features = []
    if args.original:
        features.append(("original", sklearn.preprocessing.FunctionTransformer()))
    if args.rff:
        features.append(("rff", RFFsTransformer(args.rff, args.gamma, args.seed)))
    if args.nystroem:
        features.append(("nystroem", NystroemTransformer(args.nystroem, args.gamma, args.seed)))

    if args.svm:
        classifier = sklearn.svm.SVC()
    else:
        # NOTE: penalty="none" was removed in scikit-learn 1.4; use penalty=None there
        classifier = sklearn.linear_model.LogisticRegression(
            solver="saga", penalty="none", max_iter=args.max_iter, random_state=args.seed)

    pipeline = sklearn.pipeline.Pipeline([
        ("scaling", sklearn.preprocessing.MinMaxScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ])
    pipeline.fit(train_data, train_target)

    test_accuracy = sklearn.metrics.accuracy_score(test_target, pipeline.predict(test_data))
    return test_accuracy
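main() expects an argparse-style namespace; a minimal sketch of the flags it reads (names inferred from the attribute accesses above; the defaults are hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--test_size", type=float, default=0.5)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--original", action="store_true")   # keep the raw features
parser.add_argument("--rff", type=int, default=0)         # random Fourier feature count
parser.add_argument("--nystroem", type=int, default=0)    # Nystroem feature count
parser.add_argument("--gamma", type=float, default=1.0)   # kernel width for both transformers
parser.add_argument("--svm", action="store_true")         # SVC instead of logistic regression
parser.add_argument("--max_iter", type=int, default=100)
args = parser.parse_args()
print(main(args))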
Example #6
def make_model():
    dataset_path = 'data_sneaker_vs_sandal'
    x_all_d = pd.read_csv(os.path.join(dataset_path, 'x_train.csv'))
    x_all = x_all_d.values
    A, F = x_all.shape

    x_train_NF = x_all[:9000]
    N = 9000
    x_valid_MF = x_all[9000:]
    M = 3000

    y_all_d = pd.read_csv(os.path.join(dataset_path, 'y_train.csv'))
    y_all = y_all_d.values.reshape((A, ))
    y_train_N = y_all[:9000]
    y_valid_M = y_all[9000:]

    print("loaded data")
    feature_tfmr = sklearn.pipeline.FeatureUnion(transformer_list=[
        ('orig',
         sklearn.preprocessing.PolynomialFeatures(degree=2, include_bias=False)
         ),
    ])
    classifier = sklearn.linear_model.LogisticRegression(C=1.0,
                                                         solver='lbfgs',
                                                         max_iter=1000)
    pipeline = sklearn.pipeline.Pipeline([('step1', feature_tfmr),
                                          ('step2', classifier)])
    print("made pipeline")
    pipeline.fit(x_train_NF, y_train_N)

    print("fit pipeline")
    # predict() already returns class labels, so no 0.5 threshold is needed
    err = sklearn.metrics.zero_one_loss(y_valid_M,
                                        pipeline.predict(x_valid_MF))
    print(err)
Example #7
    def test_sklearn(self, seed, experiment_run, strs):
        np = pytest.importorskip("numpy")
        sklearn = pytest.importorskip("sklearn")
        from sklearn import cluster, naive_bayes, pipeline, preprocessing

        np.random.seed(seed)
        key = strs[0]
        num_data_rows = 36
        X = np.random.random((num_data_rows, 2))
        y = np.random.randint(10, size=num_data_rows)

        pipeline = sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.StandardScaler(),
            sklearn.cluster.KMeans(),
            sklearn.naive_bayes.GaussianNB(),
        )
        pipeline.fit(X, y)

        experiment_run.log_model(pipeline)
        retrieved_pipeline = experiment_run.get_model()

        assert np.allclose(pipeline.predict(X), retrieved_pipeline.predict(X))

        assert len(pipeline.steps) == len(retrieved_pipeline.steps)
        for step, retrieved_step in zip(pipeline.steps, retrieved_pipeline.steps):
            assert step[0] == retrieved_step[0]  # step name
            assert step[1].get_params() == retrieved_step[1].get_params()  # step model
Example #8
    def test_classification_workflow(self):
        task = openml.tasks.get_task(254)
        X, y = task.get_X_and_y()

        # categorical_features was deprecated in scikit-learn 0.20 and removed
        # in 0.22; this snippet relies on the older API
        ohe = OneHotEncoder(categorical_features=[True] * 22)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
        pipeline = sklearn.pipeline.Pipeline([('ohe', ohe), ('tree', tree)])

        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                     train_size=0.5,
                                                     test_size=0.5)
        pipeline.fit(X_train, y_train)
        self.assertEqual(np.mean(y_train == pipeline.predict(X_train)), 1)
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        self.assertEqual(np.mean(y_test == pipeline.predict(X_test)), 1)
Example #10
    def test_multiple_estimators_predict_predict_proba(self):
        # lale-style combinators: >> pipes steps in sequence, & runs branches
        # in parallel, and ConcatFeatures merges the parallel outputs
        pipeline = (StandardScaler() >>
                    (LogisticRegression() & PCA()) >> ConcatFeatures() >>
                    (NoOp() & LinearSVC()) >> ConcatFeatures() >>
                    KNeighborsClassifier())
        pipeline.fit(self.X_train, self.y_train)
        _ = pipeline.predict_proba(self.X_test)
        _ = pipeline.predict(self.X_test)
Example #11
def train_model():
    select = SelectKBest(k=10)
    train = load_train_set()
    test = load_test_set()
    target = 'condition'
    hrv_features = list(train)
    hrv_features = [x for x in hrv_features if x not in [target]]

    classifiers = [
        #MultinomialNB(),
        #SVC(C=20, kernel='rbf'),
        RandomForestClassifier()  # a bare estimator, not a ('name', estimator) tuple
    ]
    for clf in classifiers:
        count_time = time.time()
        X_train = train[hrv_features]
        y_train = train[target]
        X_test = test[hrv_features]
        y_test = test[target]

        name = str(clf).split('(')[0]
        """if 'multinomialnb'==name.lower():
            scaler = MinMaxScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        else:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)"""
        print(name)
        """steps = [('feature_selection', select),
             ('model', clf)]"""
        steps = [('scaler', StandardScaler()), ('feature_selection', select),
                 ('model', clf)]
        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(X_train, y_train)
        y_prediction = pipeline.predict(X_test)
        print("----------------------------{0}---------------------------".
              format(name))
        print(sklearn.metrics.classification_report(y_test, y_prediction))
        count_time = time.time() - count_time
        print("time: ", count_time)
        print()
        print()
        with open('model_stress.pkl', 'wb') as f:
            pickle.dump(pipeline, f)
        #joblib.dump(pipeline, 'model_stress.pkl')
        print("done")
Example #12
def report(clf, features_train, features_test, labels_train, labels_test):
    """Fit the given classifier and return its classification report
    (accuracy, recall, precision and f1 score) on the test split."""
    steps = [('classifier', clf)]

    pipeline = sklearn.pipeline.Pipeline(steps)

    pipeline.fit(features_train, labels_train)

    y_prediction = pipeline.predict(features_test)

    report = sklearn.metrics.classification_report(labels_test, y_prediction)

    return report
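A minimal usage sketch for report(); the toy dataset and the GaussianNB choice are illustrative, not from the original project:

import sklearn.metrics
import sklearn.pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
print(report(GaussianNB(), X_tr, X_te, y_tr, y_te))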
Example #13
def estimate_simple(vectorizer, model, streamer):
    """
    Generate predictions for an estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex, prediction

    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)
    yield from zip(streamer.index, pipeline.predict(texts))
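A short usage sketch for estimate_simple(), assuming the three quantgov objects from the docstring have already been constructed (the variable names are placeholders):

# vectorizer, model, and streamer stand in for the objects the docstring
# describes; estimate_simple lazily yields (docindex, prediction) pairs
for docindex, prediction in estimate_simple(vectorizer, model, streamer):
    print(docindex, prediction)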
Example #14
    def im_displays():
        for patch_rows in patch_row_chunks:
            y = pipeline.predict(patch_rows)

            # Map to [0, 1) so that imshow scales across entire colormap spectrum
            y = y / n_clusters

            newshape = (im_height - d + 1, im_width - d + 1, )
            segmentation = np.reshape(y, newshape)

            # Apply color map and remove alpha channel
            cmap = plt.cm.Set1
            colored_segmentation = cmap(segmentation)[:, :, :3]
            colored_segmentation = (colored_segmentation * 255).astype(np.uint8)

            yield colored_segmentation
Example #15
def main(args):
    # make data for yourself
    X, y = sklearn.datasets.make_classification(n_samples=args.data_size)

    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed)

    features = []
    if args.original:
        # like identity transformer
        # when you don't feed any function
        # it doesn't do anything to features
        features.append(
            ("original", sklearn.preprocessing.FunctionTransformer()))
    if args.rff:
        features.append(("rff", RFFsTransformer(args.rff, args.gamma,
                                                args.seed)))
    if args.nystroem:
        features.append(("nystroem",
                         NystroemTransformer(args.nystroem, args.gamma,
                                             args.seed)))

    if args.svm:
        classifier = sklearn.svm.SVC()
    else:
        classifier = sklearn.linear_model.LogisticRegression(
            solver="saga",
            penalty="none",  # removed in scikit-learn 1.4; use penalty=None there
            max_iter=args.max_iter,
            random_state=args.seed)

    pipeline = sklearn.pipeline.Pipeline([
        ("scaling", sklearn.preprocessing.StandardScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ])

    pipeline.fit(train_data, train_target)

    test_accuracy = sklearn.metrics.accuracy_score(test_target,
                                                   pipeline.predict(test_data))
    return test_accuracy
Example #16
	def sklearn_pipeline(self, train_proportion=0.8, joke_limit=5000, debug=False):
		test_proportion = 1 - train_proportion

		### get random sample of jokes where joke["categories"] isn't empty
		jokes_to_use = random.sample(list(filter(lambda joke: joke["categories"], self._jokes)), joke_limit)

		### create CountVectorizer
		vectorizer = sklearn.feature_extraction.text.CountVectorizer(
			input="content",
			analyzer=u"word",
			token_pattern=r"\b\w+\b", # keep words of at least 1 character; the default r"(?u)\b\w\w+\b" requires 2+
			ngram_range=(1,1), # TODO: experiment with this
			binary=False,
		)

		### create data and target vectors
		X = vectorizer.fit_transform(joke["content"] for joke in jokes_to_use)
		y = np.fromiter((self._categoryIDs[joke["categories"][0]] for joke in jokes_to_use), np.int8)

		# cross_validation was renamed to model_selection in scikit-learn 0.18
		X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=test_proportion)

		### setting up pipeline. feel free to experiment here
		select = sklearn.feature_selection.SelectKBest(k=100)
		clf = sklearn.naive_bayes.MultinomialNB()
		steps = [("feature_selection", select),
		        ("naive_bayes", clf)]

		pipeline = sklearn.pipeline.Pipeline(steps)

		### fit your pipeline on X_train and y_train
		pipeline.fit(X_train, y_train)
		### call pipeline.predict() on your X_test data to make a set of test predictions
		y_prediction = pipeline.predict(X_test)
		### test your predictions using sklearn.metrics.classification_report()
		report = sklearn.metrics.classification_report(y_test, y_prediction)
		### and print the report
		print(report)
		print("overall accuracy: {:.2f}%".format(sklearn.metrics.accuracy_score(y_test, y_prediction) * 100))
		print()
		for index, category in enumerate(self._categories):
			print("{}: {} ({} jokes)".format(index, category, self._categories[category]))
Example #17
def simple_model_evaluation():
    select = SelectKBest(k=20)
    train = load_train_set()
    test = load_test_set()
    target = 'condition'
    hrv_features = list(train)
    hrv_features = [x for x in hrv_features if x not in [target]]
    X_train = train[hrv_features]
    y_train = train[target]
    X_test = test[hrv_features]
    y_test = test[target]
    classifiers = [
        RandomForestClassifier(n_estimators=100,
                               max_features='log2',
                               n_jobs=-1),
        SVC(C=20, kernel='rbf'),
    ]
    for clf in classifiers:
        name = str(clf).split('(')[0]
        if 'svc' == name.lower():
            # Normalize the attribute values to mean=0 and variance=1
            from sklearn.preprocessing import StandardScaler
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        # (the original re-created clf = RandomForestClassifier() here, which
        # silently replaced whichever classifier the loop was testing)
        steps = [('feature_selection', select), ('model', clf)]
        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(X_train, y_train)
        y_prediction = pipeline.predict(X_test)
        print("----------------------------{0}---------------------------".
              format(name))
        print(sklearn.metrics.classification_report(y_test, y_prediction))
        print()
        print()
Example #18
#Stores scores result from SVC
scores_SVC = []
#Building pipeline for each classifier
pipeline = build_pipeline()
pipelineLR = build_pipeline_LR()
pipelineSVC = build_pipeline_SVC()
#Looping through each set in X_train(data set) and storing results from each model
for train_index, test_index in kf.split(X_train):
    print("Train:", train_index, "Validation:", test_index)
    X_tr, X_tt = X_train.iloc[train_index], X_train.iloc[test_index]
    y_tr, y_tt = y_train.iloc[train_index], y_train.iloc[test_index]

    pipeline.fit(X_tr, y_tr)
    pipelineLR.fit(X_tr, y_tr)
    pipelineSVC.fit(X_tr, y_tr)
    predictions = pipeline.predict(X_tt)
    confusion += confusion_matrix(y_tt, predictions)
    scores.append(f1_score(y_tt, predictions))

    predictionsLR = pipelineLR.predict(X_tt)
    confusion_LR += confusion_matrix(y_tt, predictionsLR)
    scores_LR.append(f1_score(y_tt, predictionsLR))  # score the LR predictions, not the first model's

    predictionsSVC = pipelineSVC.predict(X_tt)
    confusion_SVC += confusion_matrix(y_tt, predictionsSVC)
    scores_SVC.append(f1_score(y_tt, predictionsSVC))

#clfLR=LogisticRegression().fit(X_train_counts,y_train)
#clfSVC = svm.SVC(gamma='auto')

#clfSVC.fit(X_train_counts, y_train)
Example #19
import os
mapper = DataFrameMapper([('feature1', None), ('feature2', None),
                          ('feature3', None)])

# ModelDesign.model_fn is passed as build_fn; calling it eagerly beforehand
# (as the original did) created a model that was immediately discarded
classifier = KerasClassifier(build_fn=ModelDesign.model_fn,
                             batch_size=64,
                             epochs=2)

pipeline = pipeline.Pipeline([("mapper", mapper), ('model', classifier)])
train = pd.DataFrame([{
    "feature1": 45,
    "feature2": 32,
    "feature3": 33
}, {
    "feature1": 45,
    "feature2": 32,
    "feature3": 36
}])
labels = pd.DataFrame([{"labels": 1}, {"labels": 2}])
test = pd.DataFrame([{"feature1": 20, "feature2": 12, "feature3": 31}])
pipeline.fit(train, labels)
pipeline.predict(test)
if not os.path.exists(r"E:\ml_resources\storage\dnn_keras"):
    os.makedirs(r"E:\ml_resources\storage\dnn_keras")
pipeline.named_steps['model'].model.save(
    r'E:\ml_resources\storage\dnn_keras\keras_model.h5')
pipeline.named_steps['model'].model = None
joblib.dump(pipeline, r'E:\ml_resources\storage\dnn_keras\keras_pipeline.pkl')
keras.backend.clear_session()
Example #20
messages_tfidf = tfidf_transformer.transform(messages_bow)

# %%
## MODEL FOR DETECTING SPAM OR HAM
# Splitting the data between train and test data
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'], messages['label'], test_size=0.3)

# Summarizing all the steps we just did into a pipeline, so that we do not
# have to repeat each step for different sets of data. We pass a list of
# (name, step) tuples and can then treat the pipeline as a normal estimator.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()), ('classifier', MultinomialNB())
    #   ('classifier',RandomForestClassifier())  # instead of MultinomialNB()
])

# Fitting our pipeline (our entire model)
#spam_detect_model = MultinomialNB().fit(messages_tfidf,messages['label'])
pipeline.fit(msg_train, label_train)

# %%
# Predicting messages
#all_pred = spam_detect_model.predict(messages_tfidf)
#all_pred
predictions = pipeline.predict(msg_test)

# %%
# Checking performance
print(classification_report(label_test, predictions))
Example #21
import onnxruntime as rt
import joblib
from numpy import load
import sklearn.pipeline

sess = rt.InferenceSession("output/model.onnx")
train_data = load("train_data.npy", allow_pickle=True)

# The comparison below assumes the fitted sklearn pipeline is in scope,
# e.g. restored with joblib (path hypothetical):
pipeline = joblib.load("output/pipeline.pkl")

print('---', train_data[0])
inputs = {'input': train_data[:1]}
pred_onx = sess.run(None, inputs)

print("onnx predict_proba")
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1])
print("skl predict_proba")
print("predict", pipeline.predict(train_data[:1]))
print("predict_proba", pipeline.predict_proba(train_data[:1]))
Example #22
data = pd.read_csv('features_and_class.csv', na_values=['--'])
features = list(data.columns)[1:-1]
obs_class = data['flooded']

pipeline = sklearn.pipeline.Pipeline([
    # preprocessing.Imputer was removed in scikit-learn 0.22;
    # sklearn.impute.SimpleImputer is the replacement
    ('Replace NaNs', impute.SimpleImputer(strategy='mean')),
    ('Scale data', preprocessing.StandardScaler()),
    ('Classification',
     ensemble.RandomForestClassifier(
         n_estimators=100,
         n_jobs=-1,
     )),
])

pipeline.fit(data[features].values, obs_class.values)

df = geopandas.read_file('prediction_features.geojson', driver='GeoJSON')
pred = pipeline.predict(df[features].values)

df['prediction'] = pred
df.to_file('prediction.geojson', driver='GeoJSON')

grid = pred.reshape(78, 69)

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.imshow(grid)
fig.savefig('prediction.png')
Example #23
    clf = MLPClassifier(solver='lbfgs',
                        learning_rate='constant',
                        activation='tanh')
    #Decomposition techniques
    kernel = KernelPCA()
    #Making pipeline using KernelPCA Decomposition
    pipeline: Pipeline = make_pipeline(kernel, clf)
    #Model fitting
    pipeline.fit(X_train, Y_Train)

    print("train score: ", RDF.score(X_train, Y_Train))
    print("test score:  ", RDF.score(X_test, Y_Test))
    from sklearn import metrics
    from sklearn.metrics import classification_report
    #Data Prediction
    MLP_pred = pipeline.predict(X_test)

    MLP_accuracy = accuracy_score(Y_Test, MLP_pred)
    MLP_precision_score = precision_score(Y_Test, MLP_pred)
    MLP_recall_score = recall_score(Y_Test, MLP_pred)
    MLP_f1 = f1_score(Y_Test, MLP_pred)
    print("precision score = ", MLP_precision_score)
    print("recall score = ", MLP_recall_score)
    print("f1 score = ", MLP_f1)
    print("accuracy score of ANN Algorithm = ", MLP_accuracy)

    #Plotting seaborn comparative Accuracies Graph
    scores = [DT_Score, KNN_Score, SVC_Score, RDF_Score, NB_Score, MLP_accuracy]
    algorithms = [
        "Decision Tree", "KNeighbors", "SVM ", "RandomForest", "Naive Bayes",
        "ANN"
Example #24
def load_test(pipeline, hrv_features, scaler):
    # the fitted scaler from training must be passed in; it was an undefined
    # free name in the original snippet
    test = load_test_set()
    X_test = test[hrv_features]
    X_test = scaler.transform(X_test)
    y_prediction = pipeline.predict(X_test)
    return y_prediction[-1]
Example #25
selector = feature_selection.SelectKBest(k=100)
#flat priors over the 14 classes
classifier = naive_bayes.MultinomialNB(class_prior=np.full(14, 1.0 / 14.0))

steps = [('feature_selection', selector), ('multinomial_nb', classifier)]

pipeline = pipeline.Pipeline(steps)
 
t0 = time()
# cross_validation was renamed to model_selection in scikit-learn 0.18
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_matrix, label_dum, test_size=0.33, random_state=30)
print("X_train dimensions: " + str(X_train.shape))
print("y_train dimensions: " + str(y_train.shape))

### fit your pipeline on X_train and y_train
pipeline.fit(X_train, y_train)
### call pipeline.predict() on your X_test data to make a set of test predictions
y_prediction = pipeline.predict(X_test)
### test your predictions using sklearn.metrics.classification_report()
report = metrics.classification_report(y_test, y_prediction)
### and print the report
print("Classifying unlabeled data done in: %fs" % (time()-t0))
print(report)

kfeatures = np.asarray(selector.get_support(indices=True))
print(np.asarray(vectorizer.get_feature_names())[kfeatures])

#################################################################
###### 3. Use classifier on unlabelled data

pred_unlab = pipeline.predict(X_matrix_unlab).tolist()

directory = 'results'
Example #26
#list to collect the cross-validation scores
cv_scores = []

#ANOVA F-value between label/feature for classification tasks.
select = SelectKBest(score_func=f_classif, k=20)

for k in k_list:

    #p=1 uses Manhattan distance, p=2 Euclidean distance
    knn = KNeighborsClassifier(n_neighbors=k, p=2)
    #feature selection
    steps = [('feature_selection', select), ('model', knn)]
    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(x_train, y_train)
    y_predKNN = pipeline.predict(x_test)

    #knn.fit(x_train, y_train)
    #y_predKNN = knn.predict(x_test)

    accuracy = accuracy_score(y_test, y_predKNN)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print(classification_report(y_test, y_predKNN))

    print("Time:", "%s seconds" % (time.time() - start_time))
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
print("==========================================")
#build the cross-validation error curve
MSE = [1 - x for x in cv_scores]
Example #27
                        y_resampled = np.array(y_train)
                        X_resampled, y_resampled = os_object.fit_transform(X_resampled,y_resampled)

                    else:
                        X_resampled = X_train
                        y_resampled = y_train

                    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
                    pipeline.fit(X_resampled, y_resampled)
                    time_to_fit = (time.perf_counter() - t0)
                    print("done fitting in {}".format(time_to_fit))

                    '''
                    Predictions
                    '''
                    predicted = pipeline.predict(X_test)

                    try:
                        predicted_prob = pipeline.predict_proba(X_test)
                        predicted_prob = predicted_prob[:, 1]  # probability that label is 1

                    except AttributeError:
                        print("Model has no predict_proba method")

                    '''
                    Evaluation Statistics
                    '''
                    print()
                    print("Evaluation Statistics")
                    if model_name=='KNN':
                        print("Getting feature support")
Example #28
steps = [('feature_selection', select),
         ('random_forest', clf)]

# using a pipeline to tighten up the steps code
pipeline = sklearn.pipeline.Pipeline(steps)


################## sampling #######################
# cross_validation was renamed to model_selection in scikit-learn 0.18
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.33, random_state=42)

################## MODEL FITTING & PREDICTION REPORT #######################
### fit your pipeline on X_train and y_train
pipeline.fit(X_train, y_train)
### call pipeline.predict() on your X_test data to make a set of test predictions
y_prediction = pipeline.predict(X_test)
### test your predictions using sklearn.metrics.classification_report()
report = sklearn.metrics.classification_report(y_test, y_prediction)
### and print the report
print(report)

######## GRID SEARCH CV ##########################
# importing the grid search tools (sklearn.grid_search was folded into model_selection)
import sklearn.model_selection

# defining the feature selection parameters and random forest estimators along with sample split
parameters = dict(feature_selection__k=[100, 200],
                  random_forest__n_estimators=[50, 100, 200],
                  random_forest__min_samples_split=[2, 3, 4, 5, 10])

# using gridsearchcv and pipeline built above, we pass parameters defined in previous command
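The excerpt ends before the search itself; a minimal completion sketch of the step the comment above describes (the cv value is an assumption):

grid = sklearn.model_selection.GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)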
Example #29
train, test = load_data()

# create pipeline
clf = LogisticRegression(C=1.55,
                         penalty='elasticnet',
                         max_iter=256,
                         solver='saga',
                         n_jobs=-1,
                         verbose=4,
                         multi_class='ovr',
                         l1_ratio=0.5,
                         tol=5e-4)
pipeline = pipeline.make_pipeline(TfidfVectorizer(), MaxAbsScaler(), clf)

print('Fitting Pipeline...')
# fit pipeline
pipeline.fit(train.review, train.label)

# save model
filename = 'lr_pipeline.sav'
joblib.dump(pipeline, filename)
print(f'Model saved as \'{filename}\'')

# make predictions
print('Predicting...')
predicted = pipeline.predict(test.review)
print(f'{filename}\nAccuracy: {np.mean(predicted == test.label)}')
print(
    metrics.classification_report(test.label,
                                  predicted,
                                  target_names=['negative', 'positive']))
Example #30
    X_train, X_test, Y_Train, Y_Test = train_test_split(X_resampled, y_resampled, test_size=0.25)

    #Feature scaling
    from sklearn.preprocessing import StandardScaler
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)

    #Using Pipeline
    import sklearn.pipeline
    from sklearn.neural_network import MLPClassifier
    from sklearn.decomposition import KernelPCA
    from imblearn.pipeline import make_pipeline
    
    #select = sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif)
    clf = MLPClassifier(solver='lbfgs', learning_rate='constant', activation='tanh')
    kernel = KernelPCA()
    
    pipeline = make_pipeline(kernel, clf)
    pipeline.fit(X_train, Y_Train)

    #User-input
    v = []
    for i in column_names[:]:
        v.append(input(i+": "))
    answer = np.array(v, dtype=float)  # cast the typed answers to numbers before scaling
    answer = answer.reshape(1,-1)
    answer = sc_X.transform(answer)
    print ("Predicts:"+ str(pipeline.predict(answer)))
    #print ("("Predicts: " + str(pipeline.predict(answer))")