def test_classification_workflow(self):
    """Check a sparse one-hot / decision-tree pipeline on OpenML data set 24.

    The data are split 50/50, both halves are converted to CSC sparse
    matrices, and the pipeline CategoryShift -> SimpleImputer ->
    SparseOneHotEncoder -> DecisionTreeClassifier is fitted on the first
    half. Every prediction on both halves must equal its label.
    """
    features, labels = sklearn.datasets.fetch_openml(
        data_id=24, as_frame=False, return_X_y=True)
    print(type(features))

    split = sklearn.model_selection.train_test_split(
        features, labels, random_state=3, train_size=0.5, test_size=0.5)
    train_part, test_part, y_train, y_test = split
    X_train = scipy.sparse.csc_matrix(train_part)
    X_test = scipy.sparse.csc_matrix(test_part)

    stages = (
        ('shift', CategoryShift()),
        ('imput', SimpleImputer(strategy='constant', fill_value=2)),
        ('ohe', SparseOneHotEncoder()),
        ('tree', DecisionTreeClassifier(random_state=1)),
    )
    pipeline = sklearn.pipeline.Pipeline(stages)
    pipeline.fit(X_train, y_train)

    train_hits = pipeline.predict(X_train) == y_train
    self.assertTrue(train_hits.all())

    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    test_hits = pipeline.predict(X_test) == y_test
    self.assertTrue(test_hits.all())
def test_classification_workflow(self):
    """Check a sparse one-hot / decision-tree pipeline on OpenML task 254.

    The task's X and y are split 50/50, converted to CSC sparse matrices,
    and run through CategoryShift -> SimpleImputer -> SparseOneHotEncoder ->
    DecisionTreeClassifier. Predictions must be perfect on both halves.
    """
    X, y = openml.tasks.get_task(254).get_X_and_y()

    dense_train, dense_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(
            X, y, random_state=3, train_size=0.5, test_size=0.5)
    X_train, X_test = (scipy.sparse.csc_matrix(part)
                       for part in (dense_train, dense_test))

    pipeline = sklearn.pipeline.Pipeline((
        ('shift', CategoryShift()),
        ('imput', SimpleImputer(strategy='constant', fill_value=2)),
        ('ohe', SparseOneHotEncoder()),
        ('tree', DecisionTreeClassifier(random_state=1)),
    ))
    pipeline.fit(X_train, y_train)

    fitted_on_train = pipeline.predict(X_train)
    self.assertTrue((fitted_on_train == y_train).all())

    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    held_out = pipeline.predict(X_test)
    self.assertTrue((held_out == y_test).all())
def test_pipeline():
    """Compare the lazy ``dl.Pipeline`` with ``sklearn.pipeline.Pipeline``.

    Both pipelines use the same scale -> fdr -> svm steps on the
    module-level ``X`` and ``y``. The lazy pipeline must return ``di.Value``
    objects whose computed results have the shape and dtype of ``y`` and
    whose score equals the score of the plain scikit-learn pipeline.
    """
    def fresh_steps():
        # Each pipeline receives its own, unfitted estimator instances.
        return [("scale", StandardScaler()),
                ("fdr", SelectFdr()),
                ("svm", LinearSVC())]

    pipeline = dl.Pipeline(fresh_steps())
    pipeline = pipeline.fit(X, y)
    lazy_pred = pipeline.predict(X)
    lazy_score = pipeline.score(X, y)

    assert isinstance(lazy_pred, di.Value)
    assert isinstance(lazy_score, di.Value)
    assert isinstance(lazy_score.compute(), float)
    # Scoring twice must yield the same key and the same computed value.
    assert pipeline.score(X, y).key == pipeline.score(X, y).key
    assert lazy_score.compute() == lazy_score.compute()

    computed_pred = lazy_pred.compute()
    assert computed_pred.shape == y.shape
    assert computed_pred.dtype == y.dtype

    skpipeline = sklearn.pipeline.Pipeline(fresh_steps())
    skpipeline.fit(X, y)
    skpipeline.predict(X)
    assert skpipeline.score(X, y) == lazy_score.compute()
def main(args):
    """Train a scaled feature-union classifier and return its test accuracy.

    Reads ``test_size``, ``seed``, ``original``, ``rff``, ``nystroem``,
    ``gamma``, ``svm`` and ``max_iter`` from ``args``. The data come from
    ``MNIST(data_size=5000)``; features are min-max scaled, expanded by the
    selected transformers and classified by an SVC or by an unpenalised
    logistic regression.
    """
    # Use the digits dataset.
    dataset = MNIST(data_size=5000)

    # Split the dataset into a train set and a test set.
    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        dataset.data, dataset.target, test_size=args.test_size, random_state=args.seed)

    # (flag, step name, lazy constructor): a transformer is only built when
    # its flag is truthy.
    candidates = (
        (args.original, "original",
         lambda: sklearn.preprocessing.FunctionTransformer()),
        (args.rff, "rff",
         lambda: RFFsTransformer(args.rff, args.gamma, args.seed)),
        (args.nystroem, "nystroem",
         lambda: NystroemTransformer(args.nystroem, args.gamma, args.seed)),
    )
    features = [(name, build()) for flag, name, build in candidates if flag]

    classifier = (
        sklearn.svm.SVC()
        if args.svm
        else sklearn.linear_model.LogisticRegression(
            solver="saga", penalty="none", max_iter=args.max_iter,
            random_state=args.seed)
    )

    pipeline = sklearn.pipeline.Pipeline([
        ("scaling", sklearn.preprocessing.MinMaxScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ])
    pipeline.fit(train_data, train_target)

    predicted = pipeline.predict(test_data)
    return sklearn.metrics.accuracy_score(test_target, predicted)
def make_model():
    """Fit a degree-2 polynomial logistic regression and print its error.

    Loads ``x_train.csv`` and ``y_train.csv`` from ``data_sneaker_vs_sandal``,
    trains on the first 9000 rows, validates on the remaining rows and
    prints the zero-one loss on the validation rows.
    """
    dataset_path = 'data_sneaker_vs_sandal'
    n_train = 9000  # rows before this index train, the rest validate

    x_all = pd.read_csv(os.path.join(dataset_path, 'x_train.csv')).values
    n_rows, _n_features = x_all.shape
    y_all = pd.read_csv(
        os.path.join(dataset_path, 'y_train.csv')).values.reshape((n_rows, ))

    x_train_NF, x_valid_MF = x_all[:n_train], x_all[n_train:]
    y_train_N, y_valid_M = y_all[:n_train], y_all[n_train:]
    print("loaded data")

    poly = sklearn.preprocessing.PolynomialFeatures(
        degree=2, include_bias=False)
    feature_tfmr = sklearn.pipeline.FeatureUnion(
        transformer_list=[('orig', poly)])
    classifier = sklearn.linear_model.LogisticRegression(
        C=1.0, solver='lbfgs', max_iter=1000)
    pipeline = sklearn.pipeline.Pipeline(
        [('step1', feature_tfmr), ('step2', classifier)])
    print("made pipeline")

    pipeline.fit(x_train_NF, y_train_N)
    print("fit pipeline")

    # Predictions are thresholded at 0.5 before being compared to labels.
    yhat_valid_M = pipeline.predict(x_valid_MF) >= 0.5
    err = sklearn.metrics.zero_one_loss(y_valid_M, yhat_valid_M)
    print(err)
def test_sklearn(self, seed, experiment_run, strs):
    """Round-trip a fitted scikit-learn pipeline through ``experiment_run``.

    A StandardScaler -> KMeans -> GaussianNB pipeline is fitted on seeded
    random data, logged with ``log_model`` and fetched with ``get_model``.
    The retrieved pipeline must predict the same values and have the same
    step names and parameters. Skipped when numpy or sklearn is missing.
    """
    np = pytest.importorskip("numpy")
    sklearn = pytest.importorskip("sklearn")
    # Importing the submodules makes them reachable as sklearn.<name>.
    from sklearn import cluster, naive_bayes, pipeline, preprocessing

    np.random.seed(seed)
    key = strs[0]
    num_data_rows = 36
    X = np.random.random((num_data_rows, 2))
    y = np.random.randint(10, size=num_data_rows)

    pipeline = sklearn.pipeline.make_pipeline(
        sklearn.preprocessing.StandardScaler(),
        sklearn.cluster.KMeans(),
        sklearn.naive_bayes.GaussianNB(),
    )
    pipeline.fit(X, y)

    experiment_run.log_model(pipeline)
    retrieved_pipeline = experiment_run.get_model()

    expected = pipeline.predict(X)
    actual = retrieved_pipeline.predict(X)
    assert np.allclose(expected, actual)

    assert len(pipeline.steps) == len(retrieved_pipeline.steps)
    paired_steps = zip(pipeline.steps, retrieved_pipeline.steps)
    for (name, model), (got_name, got_model) in paired_steps:
        assert name == got_name  # step name
        assert model.get_params() == got_model.get_params()  # step model
def test_classification_workflow(self):
    """Check a one-hot / decision-tree pipeline on OpenML task 254.

    All 22 features are flagged as categorical for the encoder. After a
    50/50 split the fitted pipeline must reach a mean accuracy of exactly 1
    on both the training half and the held-out half.
    """
    X, y = openml.tasks.get_task(254).get_X_and_y()

    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(
            X, y, random_state=3, train_size=0.5, test_size=0.5)

    steps = (
        ('ohe', OneHotEncoder(categorical_features=[True] * 22)),
        ('tree', sklearn.tree.DecisionTreeClassifier(random_state=1)),
    )
    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(X_train, y_train)

    train_accuracy = np.mean(y_train == pipeline.predict(X_train))
    self.assertEqual(train_accuracy, 1)

    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    test_accuracy = np.mean(y_test == pipeline.predict(X_test))
    self.assertEqual(test_accuracy, 1)
def test_classification_workflow(self):
    """Check a one-hot / decision-tree pipeline on OpenML task 254.

    The encoder treats all 22 features as categorical. The pipeline is
    fitted on half of the data and must classify both halves without a
    single error.
    """
    task = openml.tasks.get_task(254)
    X, y = task.get_X_and_y()

    encoder = OneHotEncoder(categorical_features=[True]*22)
    classifier = sklearn.tree.DecisionTreeClassifier(random_state=1)
    pipeline = sklearn.pipeline.Pipeline(
        (('ohe', encoder), ('tree', classifier)))

    halves = sklearn.model_selection.train_test_split(
        X, y, random_state=3, train_size=0.5, test_size=0.5)
    X_train, X_test, y_train, y_test = halves
    pipeline.fit(X_train, y_train)

    def accuracy(data, target):
        # Fraction of rows whose prediction equals the label.
        return np.mean(target == pipeline.predict(data))

    self.assertEqual(accuracy(X_train, y_train), 1)
    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    self.assertEqual(accuracy(X_test, y_test), 1)
def test_multiple_estimators_predict_predict_proba(self):
    """Smoke-test ``predict_proba`` and ``predict`` on a two-branch pipeline.

    The pipeline is composed with the ``>>`` and ``&`` operators, fitted on
    ``self.X_train`` / ``self.y_train`` and then asked for probabilities and
    predictions on ``self.X_test``. Only the absence of errors is checked.
    """
    first_branch = (StandardScaler()
                    >> (LogisticRegression() & PCA())
                    >> ConcatFeatures())
    second_branch = (first_branch
                     >> (NoOp() & LinearSVC())
                     >> ConcatFeatures())
    pipeline = second_branch >> KNeighborsClassifier()

    pipeline.fit(self.X_train, self.y_train)
    pipeline.predict_proba(self.X_test)
    pipeline.predict(self.X_test)
def train_model():
    """Train the stress classifier pipeline(s), report them and pickle one.

    Every column of the training frame except ``'condition'`` is used as a
    feature. Each configured classifier is wrapped in a
    StandardScaler -> SelectKBest(k=10) -> model pipeline, fitted on the
    training set and evaluated on the test set with a classification
    report and the elapsed time. The pipeline from the last iteration is
    written to ``model_stress.pkl``.
    """
    select = SelectKBest(k=10)
    train = load_train_set()
    test = load_test_set()
    target = 'condition'
    hrv_features = [x for x in list(train) if x not in [target]]

    # The train/test views do not depend on the classifier, so build them
    # once instead of once per iteration.
    X_train = train[hrv_features]
    y_train = train[target]
    X_test = test[hrv_features]
    y_test = test[target]

    # (label, estimator) pairs.
    classifiers = [
        ('rdf', RandomForestClassifier()),
    ]

    # Unpack each pair: previously the whole tuple was used as the model
    # step, which is not an estimator and cannot be fitted.
    for _label, clf in classifiers:
        count_time = time.time()
        name = type(clf).__name__
        print(name)

        steps = [('scaler', StandardScaler()),
                 ('feature_selection', select),
                 ('model', clf)]
        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(X_train, y_train)
        y_prediction = pipeline.predict(X_test)

        print("----------------------------{0}---------------------------".
              format(name))
        print(sklearn.metrics.classification_report(y_test, y_prediction))
        count_time = time.time() - count_time
        print("time: ", count_time)
        print()
        print()

    # Close the file deterministically instead of leaking the handle.
    with open('model_stress.pkl', 'wb') as model_file:
        pickle.dump(pipeline, model_file)
    print("done")
def report(clf, features_train, features_test, labels_train, labels_test):
    """Fit ``clf`` and return its classification report on the test split.

    Args:
        clf: the classifier to evaluate.
        features_train: training features.
        features_test: test features.
        labels_train: training labels.
        labels_test: test labels.

    Returns:
        The value of ``sklearn.metrics.classification_report`` for the test
        labels against the predictions of the fitted classifier.
    """
    model = sklearn.pipeline.Pipeline([('classifier', clf)])
    model.fit(features_train, labels_train)
    predicted = model.predict(features_test)
    return sklearn.metrics.classification_report(labels_test, predicted)
def estimate_simple(vectorizer, model, streamer):
    """
    Generate predictions for an estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex, prediction
    """
    pipeline = get_pipeline(vectorizer, model)
    # Look up the index before predicting, in the same order as before.
    doc_indices = streamer.index
    predictions = pipeline.predict(doc.text for doc in streamer)
    for docindex, prediction in zip(doc_indices, predictions):
        yield docindex, prediction
def im_displays():
    """Yield one colour-mapped segmentation image per chunk of patch rows.

    Uses ``patch_row_chunks``, ``pipeline``, ``n_clusters``, ``im_height``,
    ``im_width`` and ``d`` from the enclosing scope. Each yielded array is
    uint8 with three colour channels.
    """
    for chunk in patch_row_chunks:
        # Map to [0, 1) so that imshow scales across entire colormap spectrum
        scaled_labels = pipeline.predict(chunk) / n_clusters

        out_height = im_height - d + 1
        out_width = im_width - d + 1
        segmentation = np.reshape(scaled_labels, (out_height, out_width))

        # Apply color map and remove alpha channel
        rgb = plt.cm.Set1(segmentation)[:, :, :3]
        yield (rgb * 255).astype(np.uint8)
def main(args):
    """Train a scaled feature-union classifier on synthetic data.

    Reads ``data_size``, ``test_size``, ``seed``, ``original``, ``rff``,
    ``nystroem``, ``gamma``, ``svm`` and ``max_iter`` from ``args``.

    Returns:
        The accuracy of the fitted pipeline on the held-out test split.
    """
    # Generate a synthetic classification problem.
    X, y = sklearn.datasets.make_classification(n_samples=args.data_size)
    split = sklearn.model_selection.train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed)
    train_data, test_data, train_target, test_target = split

    features = []
    if args.original:
        # A FunctionTransformer without a function acts as an identity
        # transformer: the features pass through unchanged.
        identity = sklearn.preprocessing.FunctionTransformer()
        features.append(("original", identity))
    if args.rff:
        rff = RFFsTransformer(args.rff, args.gamma, args.seed)
        features.append(("rff", rff))
    if args.nystroem:
        nystroem = NystroemTransformer(args.nystroem, args.gamma, args.seed)
        features.append(("nystroem", nystroem))

    if not args.svm:
        classifier = sklearn.linear_model.LogisticRegression(
            solver="saga", penalty="none", max_iter=args.max_iter,
            random_state=args.seed)
    else:
        classifier = sklearn.svm.SVC()

    steps = [
        ("scaling", sklearn.preprocessing.StandardScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ]
    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(train_data, train_target)

    test_predictions = pipeline.predict(test_data)
    return sklearn.metrics.accuracy_score(test_target, test_predictions)
def sklearn_pipeline(self, train_proportion=0.8, joke_limit=5000, debug=False):
    """Train and report a naive-Bayes joke category classifier.

    A random sample of jokes that have at least one category is vectorised
    with a word-level CountVectorizer and labelled with the ID of each
    joke's first category. A SelectKBest(k=100) -> MultinomialNB pipeline is
    fitted on the training split; the classification report, the overall
    accuracy and the category table are printed.

    Args:
        train_proportion: fraction of the sampled jokes used for training.
        joke_limit: maximum number of jokes to sample. If fewer categorized
            jokes exist, all of them are used.
        debug: accepted for interface compatibility; not read here.
    """
    test_proportion = 1 - train_proportion

    ### get random sample of jokes where joke["categories"] isn't empty
    categorized = [joke for joke in self._jokes if joke["categories"]]
    # random.sample raises ValueError when asked for more items than are
    # available, so cap the sample size at the number of candidates.
    sample_size = min(joke_limit, len(categorized))
    jokes_to_use = random.sample(categorized, sample_size)

    ### create CountVectorizer
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        input="content",
        analyzer=u"word",
        # tokenize string by extracting words of at least 1 letter
        token_pattern=r"\b\w+\b",
        ngram_range=(1, 1),  # TODO: experiment with this
        binary=False,
    )

    ### create data and target vectors
    X = vectorizer.fit_transform(joke["content"] for joke in jokes_to_use)
    y = np.fromiter(
        (self._categoryIDs[joke["categories"][0]] for joke in jokes_to_use),
        np.int8)
    # NOTE(review): sklearn.cross_validation is absent from recent
    # scikit-learn releases (train_test_split lives in
    # sklearn.model_selection) - confirm the target version.
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
        X, y, test_size=test_proportion)

    ### setting up pipeline. feel free to experiment here
    select = sklearn.feature_selection.SelectKBest(k=100)
    clf = sklearn.naive_bayes.MultinomialNB()
    steps = [("feature_selection", select), ("naive_bayes", clf)]
    pipeline = sklearn.pipeline.Pipeline(steps)

    ### fit the pipeline and predict the held-out split
    pipeline.fit(X_train, y_train)
    y_prediction = pipeline.predict(X_test)

    ### report the results
    report = sklearn.metrics.classification_report(y_test, y_prediction)
    print(report)
    print("overall accuracy: {:.2f}%".format(
        sklearn.metrics.accuracy_score(y_test, y_prediction) * 100))
    print()
    for index, category in enumerate(self._categories):
        print("{}: {} ({} jokes)".format(
            index, category, self._categories[category]))
def simple_model_evaluation():
    """Evaluate each configured classifier on the train/test sets.

    Every column of the training frame except ``'condition'`` is a feature.
    Each classifier is placed behind SelectKBest(k=20), fitted on the
    training set, and a classification report for the test set is printed
    under the classifier's name. The SVC additionally gets a StandardScaler
    step in front.

    The previous version replaced ``clf`` with a default
    ``RandomForestClassifier()`` before building the pipeline, so the
    configured estimators were never evaluated under their own names. It
    also rescaled ``X_train`` / ``X_test`` in place, which would have leaked
    into later iterations.
    """
    from sklearn.preprocessing import StandardScaler

    select = SelectKBest(k=20)
    train = load_train_set()
    test = load_test_set()
    target = 'condition'
    hrv_features = [x for x in list(train) if x not in [target]]

    X_train = train[hrv_features]
    y_train = train[target]
    X_test = test[hrv_features]
    y_test = test[target]

    classifiers = [
        RandomForestClassifier(n_estimators=100, max_features='log2',
                               n_jobs=-1),
        SVC(C=20, kernel='rbf'),
    ]

    for clf in classifiers:
        name = str(clf).split('(')[0]

        steps = []
        if 'svc' == name.lower():
            # Normalize the attribute values to mean=0 and variance=1.
            # Scaling inside the pipeline leaves the shared frames untouched.
            steps.append(('scaler', StandardScaler()))
        steps.extend([('feature_selection', select), ('model', clf)])

        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(X_train, y_train)
        y_prediction = pipeline.predict(X_test)

        print("----------------------------{0}---------------------------".
              format(name))
        print(sklearn.metrics.classification_report(y_test, y_prediction))
        print()
        print()
# Stores the F1 scores of the SVC pipeline, one per fold
scores_SVC = []

# Build one pipeline per classifier
pipeline = build_pipeline()
pipelineLR = build_pipeline_LR()
pipelineSVC = build_pipeline_SVC()

# For every fold of X_train: fit all three pipelines on the training part
# and accumulate each pipeline's confusion matrix and F1 score on the
# validation part.
for train_index, test_index in kf.split(X_train):
    print("Train:", train_index, "Validation:", test_index)
    X_tr, X_tt = X_train.iloc[train_index], X_train.iloc[test_index]
    y_tr, y_tt = y_train.iloc[train_index], y_train.iloc[test_index]

    pipeline.fit(X_tr, y_tr)
    pipelineLR.fit(X_tr, y_tr)
    pipelineSVC.fit(X_tr, y_tr)

    predictions = pipeline.predict(X_tt)
    confusion += confusion_matrix(y_tt, predictions)
    scores.append(f1_score(y_tt, predictions))

    predictionsLR = pipelineLR.predict(X_tt)
    confusion_LR += confusion_matrix(y_tt, predictionsLR)
    # Score the LR pipeline on its own predictions; this previously reused
    # `predictions` from the first pipeline.
    scores_LR.append(f1_score(y_tt, predictionsLR))

    predictionsSVC = pipelineSVC.predict(X_tt)
    confusion_SVC += confusion_matrix(y_tt, predictionsSVC)
    scores_SVC.append(f1_score(y_tt, predictionsSVC))

#clfLR=LogisticRegression().fit(X_train_counts,y_train)
#clfSVC = svm.SVC(gamma='auto')
#clfSVC.fit(X_train_counts, y_train)
import os

# Pass the three feature columns through unchanged.
mapper = DataFrameMapper([('feature1', None), ('feature2', None),
                          ('feature3', None)])
# NOTE(review): this first assignment is immediately overwritten below; it
# is kept because calling model_fn() may have side effects - confirm.
classifier = ModelDesign.model_fn()
classifier = KerasClassifier(build_fn=ModelDesign.model_fn, batch_size=64,
                             epochs=2)
pipeline = pipeline.Pipeline([("mapper", mapper), ('model', classifier)])

train = pd.DataFrame([{
    "feature1": 45,
    "feature2": 32,
    "feature3": 33
}, {
    "feature1": 45,
    "feature2": 32,
    "feature3": 36
}])
labels = pd.DataFrame([{"labels": 1}, {"labels": 2}])
test = pd.DataFrame([{"feature1": 20, "feature2": 12, "feature3": 31}])

pipeline.fit(train, labels)
pipeline.predict(test)

if not os.path.exists(r"E:\ml_resources\storage\dnn_keras"):
    os.makedirs(r"E:\ml_resources\storage\dnn_keras")

# Raw strings: the backslashes in these Windows paths previously formed
# invalid escape sequences (\m, \s, \d, \k) in ordinary string literals.
# The resulting path values are unchanged.
# The Keras model is saved separately and detached from the step before
# the remaining pipeline is dumped with joblib.
pipeline.named_steps['model'].model.save(
    r'E:\ml_resources\storage\dnn_keras\keras_model.h5')
pipeline.named_steps['model'].model = None
joblib.dump(pipeline, r'E:\ml_resources\storage\dnn_keras\keras_pipeline.pkl')
keras.backend.clear_session()
messages_tfidf = tfidf_transformer.transform(messages_bow)

# %%
## MODEL FOR DETECTING SPAM OR HAM

# Hold out 30% of the messages for testing.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'], messages['label'], test_size=0.3)

# Bundle the steps performed so far into one pipeline so they do not have
# to be repeated for every new data set. Each step is a (name, estimator)
# tuple and the pipeline as a whole is used like a normal estimator.
spam_steps = [
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    # RandomForestClassifier() could be used here instead of MultinomialNB()
    ('classifier', MultinomialNB()),
]
pipeline = Pipeline(spam_steps)

# Fit the whole model on the training messages.
pipeline.fit(msg_train, label_train)

# %%
# Predict the labels of the held-out messages.
predictions = pipeline.predict(msg_test)

# %%
# Check the performance.
print(classification_report(label_test, predictions))
import onnxruntime as rt
import joblib
from numpy import load
import sklearn.pipeline

# Run the exported ONNX model and the scikit-learn pipeline on the first
# training row and print both sets of outputs for comparison.
sess = rt.InferenceSession("output/model.onnx")
train_data = load("train_data.npy", allow_pickle=True)
print('---', train_data[0])

inputs = {'input': train_data[:1]}
pred_onx = sess.run(None, inputs)
onnx_lines = (
    ("onnx predict_proba",),
    ("predict", pred_onx[0]),
    ("predict_proba", pred_onx[1]),
)
for line in onnx_lines:
    print(*line)

# NOTE(review): `pipeline` is not defined in this snippet; presumably it is
# loaded (e.g. with joblib) elsewhere - confirm.
print("skl predict_proba")
print("predict", pipeline.predict(train_data[:1]))
print("predict_proba", pipeline.predict_proba(train_data[:1]))
# `preprocessing.Imputer` is no longer available in current scikit-learn;
# SimpleImputer is its replacement and defaults to NaN as the missing value.
from sklearn.impute import SimpleImputer

# Train a random forest on the labelled features ('--' marks missing
# values) and predict the 'flooded' class for the prediction features.
data = pd.read_csv('features_and_class.csv', na_values=['--'])
# Every column except the first and the last is a feature.
features = list(data.columns)[1:-1]
obs_class = data['flooded']

pipeline = sklearn.pipeline.Pipeline([
    ('Replace NaNs', SimpleImputer(strategy='mean')),
    ('Scale data', preprocessing.StandardScaler()),
    ('Classification', ensemble.RandomForestClassifier(
        n_estimators=100,
        n_jobs=-1,
    )),
])
pipeline.fit(data[features].values, obs_class.values)

df = geopandas.read_file('prediction_features.geojson', driver='GeoJSON')
pred = pipeline.predict(df[features].values)
df['prediction'] = pred
df.to_file('prediction.geojson', driver='GeoJSON')

# The predictions are laid out as a 78 x 69 grid for plotting.
grid = pred.reshape(78, 69)

import matplotlib
matplotlib.use('agg')  # select the backend before pyplot is imported
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.imshow(grid)
fig.savefig('prediction.png')
clf = MLPClassifier(solver='lbfgs', learning_rate='constant', activation='tanh') #Decomposition techniques kernel = KernelPCA() #Making pipeline using KernelPCA Decomposition pipeline: Pipeline = make_pipeline(kernel, clf) #Model fitting pipeline.fit(X_train, Y_Train) print("train score: ", RDF.score(X_train, Y_Train)) print("test score: ", RDF.score(X_test, Y_Test)) from sklearn import metrics from sklearn.metrics import classification_report #Data Prediction MLP_pred = pipeline.predict(X_test) MLPscore = accuracy_score(Y_Test, MLP_pred) MLP_precision_score = precision_score(Y_Test, MLP_pred) MLP_recall_score = recall_score(Y_Test, MLP_pred) MLP_score = f1_score(Y_Test, MLP_pred) print("precision score = ", MLP_precision_score) print("recall score = ", MLP_recall_score) print("f1 score = ", MLPscore) print("accuracy score of ANN Algorithm = ", MLP_score) #Plotting seaborn comapritive Accuracies Graph scores = [DT_Score, KNN_Score, SVC_Score, RDF_Score, NB_Score, MLP_score] algorithms = [ "Decision Tree", "KNeighbors", "SVM ", "RandomForest", "Naive Bayes", "ANN"
def load_test(pipeline, hrv_features):
    """Return the pipeline's prediction for the last row of the test set.

    The test set is loaded with ``load_test_set``, restricted to
    ``hrv_features`` and transformed with the module-level ``scaler``
    before being passed to ``pipeline.predict``.
    """
    scaled_features = scaler.transform(load_test_set()[hrv_features])
    return pipeline.predict(scaled_features)[-1]
# Select the 100 best features, then classify with multinomial naive Bayes
# using a flat prior over the 14 classes.
N_CLASSES = 14
flat_prior = np.full(N_CLASSES, 1.0 / 14.0)  # flat priors

selector = feature_selection.SelectKBest(k=100)
classifier = naive_bayes.MultinomialNB(class_prior=flat_prior)
steps = [('feature_selection', selector), ('multinomial_nb', classifier)]
pipeline = pipeline.Pipeline(steps)

t0 = time()
# NOTE(review): the `cross_validation` module is absent from recent
# scikit-learn releases (see sklearn.model_selection) - confirm the version.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_matrix, label_dum, test_size=0.33, random_state=30)
for label, part in (("X_train", X_train), ("y_train", y_train)):
    print(label + " dimensions: " + str(part.shape))

### fit the pipeline on the training split
pipeline.fit(X_train, y_train)

### predict the held-out split
y_prediction = pipeline.predict(X_test)

### build and print the classification report
report = metrics.classification_report(y_test, y_prediction)
print("Classifying unlabeled data done in: %fs" % (time()-t0))
print(report)

# Names of the features kept by the selector.
kfeatures = np.asarray(selector.get_support(indices=True))
print(np.asarray(vectorizer.get_feature_names())[kfeatures])

#################################################################
###### 3. Use classifier on unlabelled data
pred_unlab = pipeline.predict(X_matrix_unlab).tolist()
directory = 'results'
# Holds the mean cross-validation accuracy for every k
cv_scores = []

# ANOVA F-value between label/feature for classification tasks.
select = SelectKBest(score_func=f_classif, k=20)

for k in k_list:
    # p=1 is the Manhattan distance, p=2 the Euclidean distance
    knn = KNeighborsClassifier(n_neighbors=k, p=2)

    # Feature selection followed by the KNN model
    pipeline = sklearn.pipeline.Pipeline(
        [('feature_selection', select), ('model', knn)])
    pipeline.fit(x_train, y_train)
    y_predKNN = pipeline.predict(x_test)

    accuracy = accuracy_score(y_test, y_predKNN)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print(classification_report(y_test, y_predKNN))
    print("Time:", "%s seconds" % (time.time() - start_time))

    # 10-fold cross-validation of the bare KNN classifier
    scores = cross_val_score(knn, x_train, y_train, cv=10,
                             scoring='accuracy')
    cv_scores.append(scores.mean())
    print("==========================================")

# Misclassification rate per k, used for the cross-validation plot
MSE = [1 - score for score in cv_scores]
y_resampled = np.array(y_train) X_resampled, y_resampled = os_object.fit_transform(X_resampled,y_resampled) else: X_resampled = X_train y_resampled = y_train t0 = time.clock() pipeline.fit(X_resampled, y_resampled) time_to_fit = (time.clock() - t0) print("done fitting in {}".format(time_to_fit)) ''' Predictions ''' predicted = pipeline.predict(X_test) try: predicted_prob = pipeline.predict_proba(X_test) predicted_prob = predicted_prob[:, 1] # probability that label is 1 except: print("Model has no predict_proba method") ''' Evaluation Statistics ''' print() print("Evaluation Statistics") if model_name=='KNN': print("Getting feature support")
# Chain feature selection and the random forest into one pipeline
steps = [('feature_selection', select), ('random_forest', clf)]
pipeline = sklearn.pipeline.Pipeline(steps)

################## sampling #######################
# NOTE(review): sklearn.cross_validation and sklearn.grid_search are absent
# from recent scikit-learn releases (see sklearn.model_selection) - confirm
# the target version.
split = sklearn.cross_validation.train_test_split(
    X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split

################## MODEL FITTING & PREDICTION REPORT #######################
### fit the pipeline on the training split
pipeline.fit(X_train, y_train)

### predict the held-out split
y_prediction = pipeline.predict(X_test)

### build and print the classification report
report = sklearn.metrics.classification_report(y_test, y_prediction)
print(report)

######## GRID SEARCH CV ##########################
# importing the grid search package
import sklearn.grid_search

# Parameter grid: number of selected features, number of trees and the
# minimum number of samples required to split a node
parameters = {
    'feature_selection__k': [100, 200],
    'random_forest__n_estimators': [50, 100, 200],
    'random_forest__min_samples_split': [2, 3, 4, 5, 10],
}
# using gridsearchcv and pipeline built above, we pass parameters defined in previous command
train, test = load_data()

# Build the TF-IDF -> MaxAbsScaler -> elastic-net logistic regression pipeline
lr_settings = dict(
    C=1.55,
    penalty='elasticnet',
    l1_ratio=0.5,
    solver='saga',
    multi_class='ovr',
    max_iter=256,
    tol=5e-4,
    n_jobs=-1,
    verbose=4,
)
clf = LogisticRegression(**lr_settings)
pipeline = pipeline.make_pipeline(TfidfVectorizer(), MaxAbsScaler(), clf)

# Fit on the training reviews
print('Fitting Pipeline...')
pipeline.fit(train.review, train.label)

# Persist the fitted pipeline
filename = 'lr_pipeline.sav'
joblib.dump(pipeline, filename)
print(f"Model saved as '{filename}'")

# Predict the test reviews and report accuracy and per-class metrics
print('Predicting...')
predicted = pipeline.predict(test.review)
accuracy = np.mean(predicted == test.label)
print(f'{filename}\nAccuracy: {accuracy}')
print(
    metrics.classification_report(test.label,
                                  predicted,
                                  target_names=['negative', 'positive']))
X_train, X_test, Y_Train, Y_Test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.25)

# Feature scaling: fit on the training split only, then apply to both.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# KernelPCA followed by an MLP classifier, chained in one pipeline.
import sklearn.pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import KernelPCA
from imblearn.pipeline import make_pipeline
clf = MLPClassifier(solver='lbfgs', learning_rate='constant',
                    activation='tanh')
kernel = KernelPCA()
pipeline = make_pipeline(kernel, clf)
pipeline.fit(X_train, Y_Train)

# User input: prompt once per column.
v = [input(column + ": ") for column in column_names[:]]
# input() returns strings; convert them to floats explicitly so the scaler
# receives a numeric array rather than an array of strings.
answer = np.array(v, dtype=float).reshape(1, -1)
answer = sc_X.transform(answer)
print("Predicts:" + str(pipeline.predict(answer)))