def test_classification_workflow(self):
    """Fit a sparse preprocessing + decision-tree pipeline on OpenML task 254.

    The data is split 50/50 and converted to CSC matrices; the fitted
    pipeline is required to reproduce every label of both halves.
    """
    X, y = openml.tasks.get_task(254).get_X_and_y()
    split = sklearn.model_selection.train_test_split(
        X, y, random_state=3, train_size=0.5, test_size=0.5)
    X_train, X_test, y_train, y_test = split
    X_train = scipy.sparse.csc_matrix(X_train)
    X_test = scipy.sparse.csc_matrix(X_test)

    steps = (
        ('shift', CategoryShift()),
        ('imput', SimpleImputer(strategy='constant', fill_value=2)),
        ('ohe', SparseOneHotEncoder()),
        ('tree', DecisionTreeClassifier(random_state=1)),
    )
    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(X_train, y_train)

    train_hits = pipeline.predict(X_train) == y_train
    self.assertTrue(train_hits.all())
    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    test_hits = pipeline.predict(X_test) == y_test
    self.assertTrue(test_hits.all())
def runClassifications(self, estimatorDict):
    """Train, evaluate and apply every estimator in ``estimatorDict``.

    For each ``name -> estimator`` entry, a pipeline made of
    ``SelectKBest(k=40000)`` followed by the estimator is fitted on
    ``self.prcs.X_train`` / ``self.prcs.y_train``.  A classification report
    and the accuracy on ``self.prcs.X_test`` are printed, and the
    predictions for ``self.prcs.Pred_x`` are handed to
    ``self.prcs.Decode_Labels`` together with ``<name>_PredictedOut.csv``.

    Args:
        estimatorDict: Mapping from a display name to an estimator object.
    """
    for estimatorName in estimatorDict:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(estimatorName + "in Progress...")
        print("Selecting K - Best features")
        select = sklearn.feature_selection.SelectKBest(k=40000)
        clf = estimatorDict[estimatorName]
        steps = [('feature_selection', select), ('random_forest', clf)]
        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(self.prcs.X_train, self.prcs.y_train)

        # Predict through the pipeline: the estimator was fitted on the
        # features kept by SelectKBest, so feeding it the raw matrix
        # directly would bypass the selection step.
        Pred_y = pipeline.predict(self.prcs.X_test)
        Accuracy = np.mean(Pred_y == self.prcs.y_test)
        report = sklearn.metrics.classification_report(
            self.prcs.y_test, Pred_y)
        print(report)
        print("Model's Accuracy: %f" % Accuracy)

        print('Prediction in progress...')
        y_fp = pipeline.predict(self.prcs.Pred_x)
        # Decoding the labels
        self.prcs.Decode_Labels(y_fp, estimatorName + '_PredictedOut' + '.csv')
def main(args):
    """Train a classifier on optionally expanded MNIST features.

    The feature blocks selected by ``args.original``, ``args.rff`` and
    ``args.nystroem`` are concatenated after min-max scaling and fed to an
    SVC (``args.svm``) or a logistic regression.

    Returns:
        The accuracy of the fitted pipeline on the held-out test split.
    """
    # Use the digits dataset.
    dataset = MNIST(data_size=5000)

    # Split the dataset into a train set and a test set.
    split = sklearn.model_selection.train_test_split(
        dataset.data, dataset.target,
        test_size=args.test_size, random_state=args.seed)
    train_data, test_data, train_target, test_target = split

    # (enabled, step name, lazy constructor) for every optional feature block.
    candidates = (
        (args.original, "original",
         lambda: sklearn.preprocessing.FunctionTransformer()),
        (args.rff, "rff",
         lambda: RFFsTransformer(args.rff, args.gamma, args.seed)),
        (args.nystroem, "nystroem",
         lambda: NystroemTransformer(args.nystroem, args.gamma, args.seed)),
    )
    features = [(name, build()) for enabled, name, build in candidates if enabled]

    classifier = (
        sklearn.svm.SVC()
        if args.svm
        else sklearn.linear_model.LogisticRegression(
            solver="saga", penalty="none",
            max_iter=args.max_iter, random_state=args.seed)
    )

    pipeline = sklearn.pipeline.Pipeline([
        ("scaling", sklearn.preprocessing.MinMaxScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ])
    pipeline.fit(train_data, train_target)

    predictions = pipeline.predict(test_data)
    return sklearn.metrics.accuracy_score(test_target, predictions)
def test_two_estimators_predict_proba1(self):
    """Fit a pipeline containing two PassiveAggressiveClassifier steps and call predict_proba."""
    scaler = StandardScaler()
    parallel_branch = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        scaler
        >> parallel_branch
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    pipeline.fit(self.X_train, self.y_train)
    pipeline.predict_proba(self.X_test)
def score_solution(model, save=0):
    """Fit the solution pipeline and return its ROC AUC on the test subset.

    Args:
        model: Passed unchanged to ``solution.get_pipeline``.
        save: When equal to 1, the fitted pipeline is dumped to
            ``Best_Estimator.sav`` with joblib before returning.

    Returns:
        The result of ``sklearn.metrics.roc_auc_score`` for the test labels
        and the predicted probabilities.

    Raises:
        AssertionError: If ``solution.get_pipeline`` does not return an
            ``sklearn.pipeline.Pipeline`` or the predicted probabilities
            have an unexpected shape.
    """
    # Ask the solution for the model pipeline.
    import solution
    pipeline = solution.get_pipeline(model)
    # Raised explicitly (instead of `assert`) so the checks survive `-O`
    # while callers still see the same exception type.
    if not isinstance(pipeline, sklearn.pipeline.Pipeline):
        raise AssertionError(
            'Your `solution.get_pipeline` implementation should '
            'return an `sklearn.pipeline.Pipeline`.')

    # Train the model on the training DataFrame.
    X_train, y_train = get_data(subset='train')
    pipeline.fit(X_train, y_train)

    # Apply the model to the test DataFrame.
    X_test, y_test = get_data(subset='test')
    y_pred = pipeline.predict_proba(X_test)

    # Check that the predicted probabilities have an sklearn-compatible shape.
    one_dimensional = y_pred.ndim == 1
    two_columns = y_pred.ndim == 2 and y_pred.shape[1] == 2
    if not (one_dimensional or two_columns):
        raise AssertionError(
            "The predicted probabilities should match sklearn's "
            "`predict_proba` output shape.")
    if not one_dimensional:
        # Keep only the second column of the two-column output.
        y_pred = y_pred[:, 1]

    if save == 1:
        joblib.dump(pipeline, 'Best_Estimator.sav')

    # Evaluate the predictions with the AUC of the ROC curve.
    return sklearn.metrics.roc_auc_score(y_test, y_pred)
def make_model():
    """Fit a degree-2 polynomial logistic regression and print its validation error.

    Reads ``x_train.csv`` and ``y_train.csv`` from ``data_sneaker_vs_sandal``,
    trains on the first 9000 rows and prints the zero-one loss measured on
    the remaining rows.
    """
    dataset_path = 'data_sneaker_vs_sandal'
    n_train = 9000

    x_all = pd.read_csv(os.path.join(dataset_path, 'x_train.csv')).values
    A, _ = x_all.shape
    x_train_NF, x_valid_MF = x_all[:n_train], x_all[n_train:]

    y_frame = pd.read_csv(os.path.join(dataset_path, 'y_train.csv'))
    y_all = y_frame.values.reshape((A, ))
    y_train_N, y_valid_M = y_all[:n_train], y_all[n_train:]
    print("loaded data")

    polynomial = sklearn.preprocessing.PolynomialFeatures(
        degree=2, include_bias=False)
    feature_tfmr = sklearn.pipeline.FeatureUnion(
        transformer_list=[('orig', polynomial)])
    classifier = sklearn.linear_model.LogisticRegression(
        C=1.0, solver='lbfgs', max_iter=1000)
    pipeline = sklearn.pipeline.Pipeline(
        [('step1', feature_tfmr), ('step2', classifier)])
    print("made pipeline")

    pipeline.fit(x_train_NF, y_train_N)
    print("fit pipeline")

    predicted = pipeline.predict(x_valid_MF) >= 0.5
    err = sklearn.metrics.zero_one_loss(y_valid_M, predicted)
    print(err)
def test_classification_workflow(self):
    """Fit a sparse preprocessing + decision-tree pipeline on OpenML dataset 24.

    The data is split 50/50 and converted to CSC matrices; the fitted
    pipeline is required to reproduce every label of both halves.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=24, as_frame=False, return_X_y=True)
    print(type(X))

    split = sklearn.model_selection.train_test_split(
        X, y, random_state=3, train_size=0.5, test_size=0.5)
    X_train, X_test, y_train, y_test = split
    X_train = scipy.sparse.csc_matrix(X_train)
    X_test = scipy.sparse.csc_matrix(X_test)

    steps = (
        ('shift', CategoryShift()),
        ('imput', SimpleImputer(strategy='constant', fill_value=2)),
        ('ohe', SparseOneHotEncoder()),
        ('tree', DecisionTreeClassifier(random_state=1)),
    )
    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(X_train, y_train)

    train_matches = pipeline.predict(X_train) == y_train
    self.assertTrue(train_matches.all())
    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    test_matches = pipeline.predict(X_test) == y_test
    self.assertTrue(test_matches.all())
def test_sklearn(self, seed, experiment_run, strs):
    """Log a fitted sklearn pipeline and check the retrieved copy matches it.

    The retrieved model must give the same predictions and consist of the
    same named steps with the same parameters.
    """
    np = pytest.importorskip("numpy")
    sklearn = pytest.importorskip("sklearn")
    from sklearn import cluster, naive_bayes, pipeline, preprocessing

    np.random.seed(seed)
    key = strs[0]
    num_data_rows = 36
    X = np.random.random((num_data_rows, 2))
    y = np.random.randint(10, size=num_data_rows)

    model = sklearn.pipeline.make_pipeline(
        sklearn.preprocessing.StandardScaler(),
        sklearn.cluster.KMeans(),
        sklearn.naive_bayes.GaussianNB(),
    )
    model.fit(X, y)

    experiment_run.log_model(model)
    retrieved = experiment_run.get_model()

    assert np.allclose(model.predict(X), retrieved.predict(X))
    assert len(model.steps) == len(retrieved.steps)
    for original_step, retrieved_step in zip(model.steps, retrieved.steps):
        name, estimator = original_step[0], original_step[1]
        assert name == retrieved_step[0]  # step name
        assert estimator.get_params() == retrieved_step[1].get_params()  # step model
def feature_importance_on_openml_task(task: openml.tasks.OpenMLSupervisedTask,
                                      n_trees: int,
                                      random_state: int) -> pd.DataFrame:
    """Compute random-forest feature importances for an OpenML task.

    A median imputer followed by a random forest is fitted on the task's
    full data, and the forest's ``feature_importances_`` are tabulated.

    Args:
        task: Task providing ``get_X_and_y()`` and ``get_dataset()``.
        n_trees: Number of trees of the random forest.
        random_state: Seed forwarded to the random forest.

    Returns:
        A DataFrame indexed by ``idx`` with the columns ``importance``,
        ``name`` and ``data_type``.

    Raises:
        ValueError: If the number of importances differs from the number
            of columns of ``X``.
    """
    X, y = task.get_X_and_y()
    pipeline = sklearn.pipeline.make_pipeline(
        sklearn.impute.SimpleImputer(strategy='median'),
        sklearn.ensemble.RandomForestClassifier(n_estimators=n_trees,
                                                random_state=random_state))
    pipeline.fit(X, y)

    # The last pipeline step is the fitted forest.
    importances = pipeline.steps[-1][-1].feature_importances_
    if len(importances) != X.shape[1]:
        raise ValueError(
            'Did not obtain feature importance for all attributes, '
            'probably due to constant missing val')

    features = task.get_dataset().features
    results = [
        {
            'idx': idx,
            'importance': importance,
            'name': features[idx].name,
            'data_type': features[idx].data_type,
        }
        for idx, importance in enumerate(importances)
    ]
    return pd.DataFrame(results).set_index('idx')
def test_multiple_estimators_predict_predict_proba(self):
    """Fit a pipeline with estimators in intermediate steps, then call predict_proba and predict."""
    first_stage = LogisticRegression() & PCA()
    second_stage = NoOp() & LinearSVC()
    pipeline = (
        StandardScaler()
        >> first_stage
        >> ConcatFeatures()
        >> second_stage
        >> ConcatFeatures()
        >> KNeighborsClassifier()
    )
    pipeline.fit(self.X_train, self.y_train)
    # Only the calls themselves are under test; the outputs are discarded.
    pipeline.predict_proba(self.X_test)
    pipeline.predict(self.X_test)
def train_knn(train_X, train_y):
    """Fit and return a column-selecting 11-nearest-neighbours pipeline.

    ``train_X`` must expose ``columns.get_loc`` for the selected names.
    """
    # Features selected with forward selection:
    columns = ['education_num', 'marital_status_Married-civ-spouse',
               'net_capital']
    # Same arrangement as train_naive_bayes:
    idxs = [train_X.columns.get_loc(name) for name in columns]

    def pick_columns(x):
        return x[:, idxs]

    selector = sklearn.preprocessing.FunctionTransformer(pick_columns)
    neighbours = sklearn.neighbors.KNeighborsClassifier(
        11, weights="distance", n_jobs=-1)
    pipeline = sklearn.pipeline.make_pipeline(selector, neighbours)
    pipeline.fit(train_X, train_y)
    return pipeline
def gen_transformer(self, stransformer_path):
    """Fit a scaler + RBF feature pipeline on sampled observations and pickle it.

    10000 observations are sampled from the observation space of the
    ``ENV_NAME`` environment, a ``StandardScaler`` followed by a union of
    four ``RBFSampler`` blocks is fitted on them, and the fitted pipeline
    is pickled to ``stransformer_path``.

    Args:
        stransformer_path: Path of the pickle file to write.
    """
    scaler = sklearn.preprocessing.StandardScaler()
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100)),
    ])
    pipeline = Pipeline([('scaler', scaler), ('feat', featurizer)])

    env = gym.envs.make(ENV_NAME)
    obs = np.array([env.observation_space.sample() for _ in range(10000)])
    pipeline.fit(obs)

    # The context manager guarantees the file is flushed and closed even
    # if pickling fails.
    with open(stransformer_path, 'wb+') as out_file:
        pickle.dump(pipeline, out_file)
def train_model():
    """Train the stress classifiers, print their reports and pickle the model.

    Every column of the training set except ``condition`` is used as a
    feature.  Each classifier is wrapped in a
    ``StandardScaler -> SelectKBest(k=10) -> model`` pipeline, fitted on the
    training set and evaluated on the test set.  The last fitted pipeline
    is pickled to ``model_stress.pkl``.
    """
    select = SelectKBest(k=10)
    train = load_train_set()
    test = load_test_set()
    target = 'condition'
    hrv_features = [column for column in train if column != target]

    # (label, estimator) pairs; only the estimator goes into the pipeline.
    classifiers = [
        ('rdf', RandomForestClassifier()),
    ]

    X_train, y_train = train[hrv_features], train[target]
    X_test, y_test = test[hrv_features], test[target]

    pipeline = None
    for _label, clf in classifiers:
        count_time = time.time()
        # Entries are tuples, so the name must come from the estimator
        # itself rather than from the string form of the whole entry.
        name = type(clf).__name__
        print(name)

        steps = [('scaler', StandardScaler()),
                 ('feature_selection', select),
                 ('model', clf)]
        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(X_train, y_train)
        y_prediction = pipeline.predict(X_test)

        print("----------------------------{0}---------------------------".
              format(name))
        print(sklearn.metrics.classification_report(y_test, y_prediction))
        count_time = time.time() - count_time
        print("time: ", count_time)
        print()
        print()

    if pipeline is not None:
        with open('model_stress.pkl', 'wb') as model_file:
            pickle.dump(pipeline, model_file)
    print("done")
def report(clf, features_train, features_test, labels_train, labels_test):
    """Fit ``clf`` and return the classification report for the test data.

    Args:
        clf: Classifier to train; it is wrapped in a single-step pipeline.
        features_train, labels_train: Data the classifier is fitted on.
        features_test, labels_test: Data the report is computed for.

    Returns:
        The output of ``sklearn.metrics.classification_report``.
    """
    model = sklearn.pipeline.Pipeline([('classifier', clf)])
    model.fit(features_train, labels_train)
    predicted = model.predict(features_test)
    return sklearn.metrics.classification_report(labels_test, predicted)
def test_subsampler_tica():
    """tICA with a lag time must agree with Subsampler followed by default tICA."""
    n_traj, n_samples, n_features = 1, 500, 4
    lag_time = 2
    X_all_0 = [
        random.normal(size=(n_samples, n_features)) for _ in range(n_traj)
    ]

    tica_0 = mixtape.tica.tICA(lag_time=lag_time)
    tica_0.fit(X_all_0)

    subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
    tica_1 = mixtape.tica.tICA()
    steps = [("subsampler", subsampler), ('tica', tica_1)]
    sklearn.pipeline.Pipeline(steps).fit(X_all_0)

    # n_features is obviously equal.  The eigenvalues should be the same,
    # NOT the timescales, as tica_1 has timescales calculated in a
    # different time unit.
    for attribute in ("n_features", "n_observations_", "eigenvalues_"):
        eq(getattr(tica_0, attribute), getattr(tica_1, attribute))
def test_pipeline_create(self):
    """Build a lale Pipeline from named steps, fit it and score its predictions."""
    from lale.operators import Pipeline

    steps = [("pca1", PCA()), ("lr1", LogisticRegression())]
    trained = Pipeline(steps).fit(self.X_train, self.y_train)
    # The score itself is not checked; the call only has to succeed.
    accuracy_score(self.y_test, trained.predict(self.X_test))
def test_pipeline():
    """Check that ``dl.Pipeline`` yields lazy values that agree with sklearn's Pipeline."""

    def build_steps():
        # Fresh estimators for every pipeline so nothing is shared.
        return [("scale", StandardScaler()),
                ("fdr", SelectFdr()),
                ("svm", LinearSVC())]

    pipeline = dl.Pipeline(build_steps()).fit(X, y)
    y2 = pipeline.predict(X)
    score = pipeline.score(X, y)

    # Predictions and scores are lazy values ...
    assert isinstance(y2, di.Value)
    assert isinstance(score, di.Value)
    assert isinstance(score.compute(), float)
    # ... with identical keys and results across repeated calls.
    assert pipeline.score(X, y).key == pipeline.score(X, y).key
    assert score.compute() == score.compute()

    y22 = y2.compute()
    assert y22.shape == y.shape
    assert y22.dtype == y.dtype

    skpipeline = sklearn.pipeline.Pipeline(build_steps())
    skpipeline.fit(X, y)
    skpipeline.predict(X)
    assert skpipeline.score(X, y) == score.compute()
def test_autoai_libs_tam_4(self):
    """Import a TAM(PCA) + LogisticRegression sklearn pipeline into lale.

    The imported pipeline must be a ``TrainableOperator``, its pretty-printed
    form is passed to ``self._roundtrip`` together with the expected text,
    and it is then fitted and used for prediction on random data.
    """
    import autoai_libs.cognito.transforms.transform_utils
    import numpy as np
    import sklearn.decomposition
    import sklearn.linear_model
    import sklearn.pipeline
    import lale.helpers
    import lale.operators
    import lale.pretty_print
    sklearn_pipeline = sklearn.pipeline.make_pipeline(
        autoai_libs.cognito.transforms.transform_utils.TAM(
            tans_class=sklearn.decomposition.PCA(),
            name="pca",
            col_names=["a", "b", "c"],
            col_dtypes=[
                np.dtype("float32"),
                np.dtype("float32"),
                np.dtype("float32"),
            ],
        ),
        sklearn.linear_model.LogisticRegression(solver="liblinear", multi_class="ovr"),
    )
    pipeline = lale.helpers.import_from_sklearn_pipeline(sklearn_pipeline, fitted=False)
    assert isinstance(pipeline, lale.operators.TrainableOperator)
    # NOTE(review): this literal looks like pretty-printed multi-line code
    # whose line breaks were lost; it is kept exactly as found -- confirm
    # against the output of lale.pretty_print.to_string.
    expected = """from autoai_libs.cognito.transforms.transform_utils import TAM from sklearn.decomposition import PCA import numpy as np from sklearn.linear_model import LogisticRegression import lale lale.wrap_imported_operators() tam = TAM( tans_class=PCA(), name="pca", col_names=["a", "b", "c"], col_dtypes=[ np.dtype("float32"), np.dtype("float32"), np.dtype("float32"), ], ) logistic_regression = LogisticRegression( multi_class="ovr", solver="liblinear" ) pipeline = tam >> logistic_regression"""
    self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
    import numpy as np
    import pandas as pd
    # 15 rows of random values in three float32 columns named like the TAM columns.
    test = pd.DataFrame(
        np.random.randint(0, 100, size=(15, 3)),
        columns=["a", "b", "c"],
        dtype=np.dtype("float32"),
    )
    trained = pipeline.fit(test.to_numpy(), [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1])
    trained.predict(test.to_numpy())
def create(x_train, y_train, **_):
    '''Create zero-hypothesis predictor.

    The predictor ignores its input and always answers with the k-hot
    encoding of the label ``/m/04rlf``.

    Args:
        x_train: Training inputs; only their length is used.
        y_train: Training targets handed to the dummy classifier.
        **_: Ignored extra options.

    Returns:
        An ``audiolabel.util.Predictor`` wrapping the fitted pipeline's
        ``predict`` method.
    '''
    # Even though DummyClassifier does not use the incoming data, sklearn
    # will explode if the data provided to the classifier is not flat, so
    # every input is replaced by a single zero column.
    transform = sklearn.preprocessing.FunctionTransformer(
        lambda X: np.zeros((len(X), 1)),
        validate=False,
    )
    # `strategy` is passed by keyword: newer scikit-learn releases accept
    # estimator parameters as keywords only.
    classifier = sklearn.dummy.DummyClassifier(
        strategy='constant',
        constant=audiolabel.preprocess.k_hot_encode(['/m/04rlf']),
    )
    pipeline = sklearn.pipeline.Pipeline([
        ('transform', transform),
        ('classifier', classifier),
    ])
    predictor = pipeline.fit(x_train, y_train)
    return audiolabel.util.Predictor(predictor.predict)
def test_autoai_libs_tam_4(self):
    """Import a TAM(PCA) + LogisticRegression sklearn pipeline into lale.

    The pretty-printed form of the imported pipeline is passed to
    ``self._roundtrip`` together with the expected text, and the pipeline
    is then fitted and used for prediction on random data.
    """
    import autoai_libs.cognito.transforms.transform_utils
    import lale.helpers
    import numpy as np
    # NOTE(review): sklearn.decomposition and lale.pretty_print are used
    # below but not imported here, while sklearn.cluster.hierarchical is
    # imported and never referenced -- confirm the intended imports.
    import sklearn.cluster.hierarchical
    import sklearn.linear_model
    import sklearn.pipeline
    sklearn_pipeline = sklearn.pipeline.make_pipeline(
        autoai_libs.cognito.transforms.transform_utils.TAM(tans_class=sklearn.decomposition.PCA(), name='pca', col_names=['a', 'b', 'c'], col_dtypes=[np.dtype('float32'), np.dtype('float32'), np.dtype('float32')]),
        sklearn.linear_model.LogisticRegression(solver='liblinear', multi_class='ovr'))
    pipeline = lale.helpers.import_from_sklearn_pipeline(sklearn_pipeline, fitted=False)
    # NOTE(review): this literal looks like pretty-printed multi-line code
    # whose line breaks were lost; it is kept exactly as found -- confirm
    # against the output of lale.pretty_print.to_string.
    expected = \
"""from lale.lib.autoai_libs import TAM from lale.lib.sklearn import PCA import numpy as np from lale.lib.sklearn import LogisticRegression import lale lale.wrap_imported_operators() tam = TAM(tans_class=PCA(), name='pca', col_names=['a', 'b', 'c'], col_dtypes=[np.dtype('float32'), np.dtype('float32'), np.dtype('float32')]) pipeline = tam >> LogisticRegression()"""
    self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
    import pandas as pd
    import numpy as np
    # 15 rows of random values in three float32 columns named like the TAM columns.
    test = pd.DataFrame(np.random.randint(0,100,size=(15, 3)), columns=['a','b','c'], dtype=np.dtype('float32'))
    trained = pipeline.fit(test.to_numpy(), [0,1,1,0,0,0,1,1,1,1,0,0,1,0,1])
    trained.predict(test.to_numpy())
def knn(X, y, K, test_ratio=0.2):
    """Train and evaluate a standardised K-nearest-neighbours classifier.

    The data is split with ``random_state=0``; a ``StandardScaler`` +
    ``KNeighborsClassifier`` pipeline is fitted on the training part and
    evaluated on the test part.  The confusion matrix (also drawn as a
    seaborn heatmap), the classification report and the accuracy are
    printed.

    Args:
        X: Feature matrix.
        y: Target labels.
        K: Number of neighbours.
        test_ratio: Fraction of the data held out for testing.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=test_ratio)

    # The pipeline owns the scaling: the scaler is fitted on the training
    # split only and the same statistics are applied to the test split.
    # Fitting a second scaler on the test data would put the two splits on
    # different scales.
    pipeline = make_pipeline(StandardScaler(),
                             KNeighborsClassifier(n_neighbors=K))
    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Result of KNN WITH PREPROCESSING using K={} is: \n".format(K))
    labels = np.unique(y)
    # `labels` must be given by keyword on current scikit-learn releases.
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print("Confusion Matrix: \n", cm)
    sns.heatmap(cm, annot=True, fmt='d',
                xticklabels=labels, yticklabels=labels)
    print("Classification report: \n", classification_report(y_test, y_pred))
    # The score is computed on the held-out split, so label it as such.
    print("Test accuracy: ", metrics.accuracy_score(y_test, y_pred))
def test_sklearn_random_forest_newsgroups():
    """Explain one newsgroups document with an interventional TreeExplainer."""
    # note: this test used to fail in native TreeExplainer code due to memory corruption
    newsgroups_train, newsgroups_test, _ = create_binary_newsgroups_data()
    pipeline = create_random_forest_vectorizer()
    pipeline.fit(newsgroups_train.data, newsgroups_train.target)

    named = pipeline.named_steps
    rf, vectorizer, densifier = named['rf'], named['vectorizer'], named['to_dense']

    def to_dense(documents):
        # Vectorize the raw documents, then densify the result.
        return densifier.transform(vectorizer.transform(documents))

    dense_bg = to_dense(newsgroups_test.data[0:20])
    test_row = newsgroups_test.data[83:84]
    explainer = shap.TreeExplainer(
        rf, dense_bg, feature_perturbation="interventional")
    explainer.shap_values(to_dense(test_row))
def test_classification_workflow(self):
    """One-hot encoding + decision tree must fit OpenML task 254 perfectly.

    Both halves of a 50/50 split are required to reach an accuracy of 1.
    """
    X, y = openml.tasks.get_task(254).get_X_and_y()

    ohe = OneHotEncoder(categorical_features=[True]*22)
    tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
    pipeline = sklearn.pipeline.Pipeline((('ohe', ohe), ('tree', tree)))

    split = sklearn.model_selection.train_test_split(
        X, y, random_state=3, train_size=0.5, test_size=0.5)
    X_train, X_test, y_train, y_test = split
    pipeline.fit(X_train, y_train)

    train_accuracy = np.mean(y_train == pipeline.predict(X_train))
    self.assertEqual(train_accuracy, 1)
    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    test_accuracy = np.mean(y_test == pipeline.predict(X_test))
    self.assertEqual(test_accuracy, 1)
def test_classification_workflow(self):
    """Check that one-hot encoding keeps a decision tree perfectly accurate.

    The pipeline is fitted on half of OpenML task 254 and must classify
    both the training half and the held-out half without error.
    """
    X, y = openml.tasks.get_task(254).get_X_and_y()

    encoder = OneHotEncoder(categorical_features=[True] * 22)
    classifier = sklearn.tree.DecisionTreeClassifier(random_state=1)
    pipeline = sklearn.pipeline.Pipeline(
        (('ohe', encoder), ('tree', classifier)))

    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(
            X, y, random_state=3, train_size=0.5, test_size=0.5)
    pipeline.fit(X_train, y_train)

    predicted_train = pipeline.predict(X_train)
    self.assertEqual(np.mean(y_train == predicted_train), 1)
    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    predicted_test = pipeline.predict(X_test)
    self.assertEqual(np.mean(y_test == predicted_test), 1)
def run_pipeline(df, pipeline, pipeline_name=''):
    """Cross-validate ``pipeline`` on the text/author data and print its scores.

    The ``text`` column is the input and the label-encoded ``author``
    column the target.  For each of five stratified folds the pipeline is
    refitted, and the log loss and accuracy of its predicted probabilities
    are printed, followed by the mean log loss over all folds.

    Args:
        df: DataFrame with a ``text`` and an ``author`` column.
        pipeline: Estimator providing ``fit`` and ``predict_proba``.
        pipeline_name: Label used in the summary line.
    """
    X = pd.Series(df["text"])
    y = preprocessing.LabelEncoder().fit_transform(df.author.values)

    # No shuffling is requested, so no random_state is passed: current
    # scikit-learn raises when a seed is combined with shuffle=False, and
    # the seed never influenced the unshuffled folds.
    rskf = StratifiedKFold(n_splits=5)
    losses = []
    for train_index, test_index in rskf.split(X, y):
        # split() yields positions, hence positional .iloc indexing; plain
        # X[...] would look the numbers up as index labels.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict_proba(X_test)

        log_loss = metrics.log_loss(y_test, predictions)
        losses.append(log_loss)
        print(" Log loss: " + str(log_loss))
        print(" Accuracy : %0.3f " % calculate_accuracy(y_test, predictions))

    # Plain arithmetic instead of `pd.np.mean`: the `pd.np` alias is no
    # longer available in current pandas.
    mean_loss = sum(losses) / len(losses)
    print(f'{pipeline_name} mean log loss: {round(mean_loss, 3)}')
def main(args):
    """Build one-hot / standardised, degree-2 polynomial features for a dataset.

    Integer-only columns are one-hot encoded and the remaining columns are
    standardised; polynomial features of order 2 (without bias) are then
    generated from the result.

    Returns:
        A ``(train_data, test_data)`` pair of transformed feature matrices.
    """
    loader = getattr(sklearn.datasets, "load_{}".format(args.dataset))
    dataset = loader()
    X = np.array(dataset.data)
    Y = np.array(dataset.target)

    # Split the dataset into a train set and a test set.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=args.test_size, random_state=args.seed)
    X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

    # A column holding only integer values is considered categorical (days
    # in a week, dog breed, ...); this is an assumption made for the sake
    # of the exercise.  Every other column is treated as real-valued.
    is_integer_column = np.all(X.astype(int) == X, axis=0)
    categ_colnames, non_categ_colnames = [], []
    for index, integer_only in enumerate(is_integer_column):
        target = categ_colnames if integer_only else non_categ_colnames
        target.append(index)

    # One-hot features come first (dense output, unknown test values are
    # ignored), followed by the features normalised to mean 0, variance 1.
    one_hot = sklearn.preprocessing.OneHotEncoder(
        sparse=False, handle_unknown='ignore')
    col_trans = sklearn.compose.ColumnTransformer([
        ('1hot', one_hot, categ_colnames),
        ('standard', sklearn.preprocessing.StandardScaler(), non_categ_colnames),
    ])

    # Polynomial features of order 2: for input [a, b, c, d] this adds
    # [a^2, ab, ac, ad, b^2, bc, bd, c^2, cd, d^2].
    poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)

    pipeline = sklearn.pipeline.Pipeline(
        [('col_trans', col_trans), ('poly', poly)])
    fitted = pipeline.fit(X_train)
    train_data = fitted.transform(X_train)
    test_data = fitted.transform(X_test)
    return train_data, test_data
def test_sample_1():
    """Samples drawn from a two-state MSM must average to the two cluster centers."""
    # Test that the code actually runs and gives something non-crazy
    # Make an ergodic dataset with two gaussian centers offset by 25 units.
    chunk = np.random.normal(size=(20000, 3))
    data = [np.vstack((chunk, chunk + 25)), np.vstack((chunk + 25, chunk))]
    expected_centers = np.array([[0., 0., 0.0], [25., 25., 25.]])

    clusterer = cluster.KMeans(n_clusters=2)
    msm = MarkovStateModel()
    steps = [("clusterer", clusterer), ("msm", msm)]
    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(data)
    trimmed_assignments = pipeline.transform(data)

    # Now let's make the output assignments start with
    # zero at the first position.
    if trimmed_assignments[0][0] == 1:
        for assignment in trimmed_assignments:
            assignment *= -1
            assignment += 1

    pairs = msm.draw_samples(trimmed_assignments, 2000)
    samples = map_drawn_samples(pairs, data)
    eq(np.mean(samples, axis=1), expected_centers, decimal=1)

    # We should make sure we can sample from Trajectory objects too...
    # Create a fake topology with 1 atom to match our input dataset
    atoms = pd.DataFrame({
        "serial": [0], "name": ["HN"], "element": ["H"], "resSeq": [1],
        "resName": "RES", "chainID": [0]
    })
    top = md.Topology.from_dataframe(
        atoms, bonds=np.zeros(shape=(0, 2), dtype='int'))

    # np.newaxis reshapes the data to have a 40000 frames, 1 atom, 3 xyz
    trajectories = [md.Trajectory(x[:, np.newaxis], top) for x in data]
    trj_samples = map_drawn_samples(pairs, trajectories)
    mu = np.array([t.xyz.mean(0)[0] for t in trj_samples])
    eq(mu, expected_centers, decimal=1)
def main():
    """Train a bag-of-words random forest and display its test results.

    The data is split into train and test parts, a
    ``CountVectorizer -> TfidfTransformer -> RandomForestClassifier``
    pipeline is fitted on the training part, and its predictions for the
    test part are handed to ``display_results``.
    """
    # NOTE(review): X and Y are not assigned in this function (the
    # `load_data()` call was commented out); they are presumably
    # module-level names -- confirm.
    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    # build pipeline
    pipeline = sklearn.pipeline.Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier())
    ])

    # train classifier
    pipeline.fit(X_train, y_train)

    # predict on test data
    y_pred = pipeline.predict(X_test)

    # display results
    display_results(y_test, y_pred)
def test_two_estimators_predict_proba(self):
    """Fit a pipeline containing two LogisticRegression steps and call predict_proba."""
    scaler = StandardScaler()
    parallel_branch = PCA() & Nystroem() & LogisticRegression()
    pipeline = (
        scaler >> parallel_branch >> ConcatFeatures() >> NoOp()
        >> LogisticRegression()
    )
    trained = pipeline.fit(self.X_train, self.y_train)
    trained.predict_proba(self.X_test)
def test_sample_1():
    """Check MSM sampling on arrays and trajectories against the known centers."""
    # Test that the code actually runs and gives something non-crazy
    # Make an ergodic dataset with two gaussian centers offset by 25 units.
    chunk = np.random.normal(size=(20000, 3))
    first = np.vstack((chunk, chunk + 25))
    second = np.vstack((chunk + 25, chunk))
    data = [first, second]

    clusterer = cluster.KMeans(n_clusters=2)
    msm = MarkovStateModel()
    pipeline = sklearn.pipeline.Pipeline(
        [("clusterer", clusterer), ("msm", msm)]
    )
    pipeline.fit(data)
    trimmed_assignments = pipeline.transform(data)

    # Now let's make the output assignments start with
    # zero at the first position.
    i0 = trimmed_assignments[0][0]
    if i0 == 1:
        for labels in trimmed_assignments:
            labels *= -1
            labels += 1

    def check_centers(mu):
        eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)

    pairs = msm.draw_samples(trimmed_assignments, 2000)
    samples = map_drawn_samples(pairs, data)
    check_centers(np.mean(samples, axis=1))

    # We should make sure we can sample from Trajectory objects too...
    # Create a fake topology with 1 atom to match our input dataset
    atom_table = pd.DataFrame({
        "serial": [0], "name": ["HN"], "element": ["H"], "resSeq": [1],
        "resName": "RES", "chainID": [0]
    })
    no_bonds = np.zeros(shape=(0, 2), dtype='int')
    top = md.Topology.from_dataframe(atom_table, bonds=no_bonds)

    # np.newaxis reshapes the data to have a 40000 frames, 1 atom, 3 xyz
    trajectories = [md.Trajectory(x[:, np.newaxis], top) for x in data]
    trj_samples = map_drawn_samples(pairs, trajectories)
    check_centers(np.array([t.xyz.mean(0)[0] for t in trj_samples]))
def train_naive_bayes(train_X, train_y):
    """Fit and return a column-selecting Gaussian naive Bayes pipeline.

    ``train_X`` must expose ``columns.get_loc`` for the selected names.
    """
    # GaussianNB (which I'm assuming is what 'naive_bayes' meant) has
    # seemingly no hyperparameters to speak of, so feature
    # selection/transformation is where I started.  The below features
    # were found via a couple runs of forward selection:
    columns = [
        'net_capital',
        'education_Prof-school',
        'education_Doctorate',
        'occupation_Transport-moving',
        'education_Masters',
        'marital_status_Never-married',
        'education_Bachelors',
        'relationship_Not-in-family',
        'occupation_Exec-managerial',
    ]
    # Turn columns to indices, as FunctionTransformer seems to receive
    # normal NumPy arrays (not dataframes):
    idxs = [train_X.columns.get_loc(name) for name in columns]

    def pick_columns(x):
        return x[:, idxs]

    selector = sklearn.preprocessing.FunctionTransformer(pick_columns)
    pipeline = sklearn.pipeline.make_pipeline(
        selector, sklearn.naive_bayes.GaussianNB())
    pipeline.fit(train_X, train_y)
    return pipeline
def main(args):
    """Train a classifier on optionally expanded synthetic features.

    The feature blocks selected by ``args.original``, ``args.rff`` and
    ``args.nystroem`` are concatenated after standard scaling and fed to an
    SVC (``args.svm``) or a logistic regression.

    Returns:
        The accuracy of the fitted pipeline on the held-out test split.
    """
    # make data for yourself
    X, y = sklearn.datasets.make_classification(n_samples=args.data_size)
    split = sklearn.model_selection.train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed)
    train_data, test_data, train_target, test_target = split

    # (enabled, step name, lazy constructor) for every optional feature
    # block.  A FunctionTransformer without a function acts like an
    # identity transformer: it doesn't do anything to the features.
    candidates = (
        (args.original, "original",
         lambda: sklearn.preprocessing.FunctionTransformer()),
        (args.rff, "rff",
         lambda: RFFsTransformer(args.rff, args.gamma, args.seed)),
        (args.nystroem, "nystroem",
         lambda: NystroemTransformer(args.nystroem, args.gamma, args.seed)),
    )
    features = [(name, build()) for enabled, name, build in candidates if enabled]

    classifier = (
        sklearn.svm.SVC()
        if args.svm
        else sklearn.linear_model.LogisticRegression(
            solver="saga", penalty="none",
            max_iter=args.max_iter, random_state=args.seed)
    )

    pipeline = sklearn.pipeline.Pipeline([
        ("scaling", sklearn.preprocessing.StandardScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ])
    pipeline.fit(train_data, train_target)

    predictions = pipeline.predict(test_data)
    return sklearn.metrics.accuracy_score(test_target, predictions)
def test_subsampler_tica():
    """tICA with a lag time must agree with Subsampler followed by default tICA."""
    n_traj, n_samples, n_features = 1, 500, 4
    lag_time = 2
    X_all_0 = []
    for _ in range(n_traj):
        X_all_0.append(random.normal(size=(n_samples, n_features)))

    tica_0 = tICA(lag_time=lag_time)
    tica_0.fit(X_all_0)

    subsampler = Subsampler(lag_time=lag_time)
    tica_1 = tICA()
    pipeline = sklearn.pipeline.Pipeline(
        [("subsampler", subsampler), ('tica', tica_1)])
    pipeline.fit(X_all_0)

    # n_features is obviously equal.  The eigenvalues should be the same,
    # NOT the timescales, as tica_1 has timescales calculated in a
    # different time unit.
    for attribute in ("n_features", "n_observations_", "eigenvalues_"):
        eq(getattr(tica_0, attribute), getattr(tica_1, attribute))
def sklearn_pipeline(self, train_proportion=0.8, joke_limit=5000, debug=False):
    """Train and report a naive Bayes joke-category classifier.

    A random sample of categorised jokes is vectorised into word counts,
    split into train and test parts, and a
    ``SelectKBest(k=100) -> MultinomialNB`` pipeline is fitted.  The
    classification report, the overall accuracy and the category list are
    printed.

    Args:
        train_proportion: Fraction of the sampled jokes used for training.
        joke_limit: Maximum number of jokes to sample.
        debug: Unused; kept for interface compatibility.
    """
    # `sklearn.cross_validation` no longer exists in scikit-learn; the
    # splitter lives in `sklearn.model_selection`.
    from sklearn.model_selection import train_test_split

    test_proportion = 1 - train_proportion

    ### get random sample of jokes where joke["categories"] isn't empty
    categorized = [joke for joke in self._jokes if joke["categories"]]
    # Clamp the sample size: random.sample raises when asked for more
    # items than are available.
    jokes_to_use = random.sample(categorized, min(joke_limit, len(categorized)))

    ### create CountVectorizer
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        input="content",
        analyzer=u"word",
        # tokenize string by extracting words of at least 1 letter.
        # I think default is r"\b\w{2,}\b"
        token_pattern=r"\b\w+\b",
        ngram_range=(1, 1),  # TODO: experiment with this
        binary=False,
    )

    ### create data and target vectors
    X = vectorizer.fit_transform(joke["content"] for joke in jokes_to_use)
    # The target is the ID of each joke's first category.
    y = np.fromiter(
        (self._categoryIDs[joke["categories"][0]] for joke in jokes_to_use),
        np.int8)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_proportion)

    ### setting up pipeline. feel free to experiment here
    select = sklearn.feature_selection.SelectKBest(k=100)
    clf = sklearn.naive_bayes.MultinomialNB()
    steps = [("feature_selection", select), ("naive_bayes", clf)]
    pipeline = sklearn.pipeline.Pipeline(steps)

    ### fit the pipeline and predict the test set
    pipeline.fit(X_train, y_train)
    y_prediction = pipeline.predict(X_test)

    ### report the quality of the predictions
    report = sklearn.metrics.classification_report(y_test, y_prediction)
    print(report)
    print("overall accuracy: {:.2f}%".format(
        sklearn.metrics.accuracy_score(y_test, y_prediction) * 100))
    print()
    for index, category in enumerate(self._categories):
        print("{}: {} ({} jokes)".format(
            index, category, self._categories[category]))
def test_pipeline_shares_structure():
    """Two differently parameterised fits must share part of their task graphs."""
    steps = [("scale", StandardScaler()),
             ("fdr", SelectFdr()),
             ("svm", LinearSVC())]
    pipeline = dl.Pipeline(steps)

    score1 = pipeline.fit(X, y).score(X, y)
    score2 = pipeline.set_params(svm__C=0.1).fit(X, y).score(X, y)

    # The merged graph must be at most 75% of the two graphs' combined size.
    merged_size = len(merge(score1.dask, score2.dask))
    combined_size = len(score1.dask) + len(score2.dask)
    assert merged_size <= combined_size * 0.75
    assert score1.key != score2.key
clf__min_samples_split=range(1,4,3), clf__min_samples_leaf=range(2,4,1), clf__min_weight_fraction_leaf=[0], ) #grid_search = sklearn.grid_search.GridSearchCV( # pipeline, n_jobs=1, param_grid=param_grid, verbose=100, # scoring=youdenJ,score_func=youdenJ, # cv=sklearn.cross_validation.PredefinedSplit(testidx)) #grid_search.fit(trainFact[:,rfecv.support_], labels) #results1=([sklearn.metrics.confusion_matrix(labels,grid_search.best_estimator_.predict(train))]) #grid_search_results1=(grid_search.grid_scores_) #kwargs=grid_search.best_params_ #pipeline.set_params(**kwargs) pipeline.fit(train[train.columns[rfecv.support_]],labels) predictions=(pipeline.predict_proba(test[train.columns[rfecv.support_]])[:,1]>=0.02)*1 predictionstrain=(pipeline.predict_proba(train[train.columns[rfecv.support_]])[:,1]>=0.02)*1 print Youdens_func(labels,predictionstrain) # create predictions and submission file sample['WnvPresent'] = predictions sample.to_csv('testpredicts5.csv', index=False) print sum(predictions)
except: print('Could not impute column:{}'.format(col)) continue # Resample minority class if over_sampler != "None": X_resampled = np.array(X_train) y_resampled = np.array(y_train) X_resampled, y_resampled = os_object.fit_transform(X_resampled,y_resampled) else: X_resampled = X_train y_resampled = y_train t0 = time.clock() pipeline.fit(X_resampled, y_resampled) time_to_fit = (time.clock() - t0) print("done fitting in {}".format(time_to_fit)) ''' Predictions ''' predicted = pipeline.predict(X_test) try: predicted_prob = pipeline.predict_proba(X_test) predicted_prob = predicted_prob[:, 1] # probability that label is 1 except: print("Model has no predict_proba method")
############ feature select, classify, test-set validate, report selector = feature_selection.SelectKBest(k=100) classifier = naive_bayes.MultinomialNB(class_prior = np.reshape(np.repeat(np.array([[1.0/14.0]]),14,axis=1), (14,))) #flat priors steps = [('feature_selection', selector), ('multinomial_nb', classifier)] pipeline = pipeline.Pipeline(steps) t0 = time() X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_matrix, label_dum, test_size=0.33, random_state=30) print("X_train dimensions: " + str(X_train.shape)) print("y_train dimensions: " + str(y_train.shape)) ### fit your pipeline on X_train and y_train pipeline.fit( X_train, y_train ) ### call pipeline.predict() on your X_test data to make a set of test predictions y_prediction = pipeline.predict( X_test ) ### test your predictions using sklearn.classification_report() report = metrics.classification_report( y_test, y_prediction ) ### and print the report print("Classifying unlabeled data done in: %fs" % (time()-t0)) print(report) kfeatures = np.asarray(selector.get_support(indices=True)) print(np.asarray(vectorizer.get_feature_names())[kfeatures]) ################################################################# ###### 3. Use classifier on unlabelled data pred_unlab = pipeline.predict(X_matrix_unlab).tolist()
import mixtape.featurizer
import mixtape.tica
import mixtape.cluster
import mixtape.markovstatemodel
import mixtape.ghmm
import numpy as np
import mdtraj as md
from parameters import load_trajectories, build_full_featurizer
# NOTE(review): `sklearn.externals.joblib` was removed from recent
# scikit-learn releases; this script presumably targets an older version.
import sklearn.pipeline
import sklearn.externals.joblib
import mixtape.utils

# Fixed settings for this scan; only the number of cluster states varies.
n_choose = 50
stride = 1
lag_time = 1
n_components = 2

trj0, trajectories, filenames = load_trajectories(stride=stride)

# Even-indexed trajectories form the training set, odd-indexed the test set.
train = trajectories[0::2]
test = trajectories[1::2]

# Load the featurizer saved under a name keyed by (n_components, n_choose).
featurizer = sklearn.externals.joblib.load(
    "./featurizer-%d-%d.job" % (n_components, n_choose))

for n_states in [10, 20, 30, 40, 50]:
    # Fresh estimators are built on every iteration so that no fitted state
    # carries over from the previous n_states value.
    tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
    msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=5)
    cluster = mixtape.cluster.KMeans(n_clusters=n_states)
    pipeline = sklearn.pipeline.Pipeline([
        ("features", featurizer),
        ('tica', tica),
        ("cluster", cluster),
        ("msm", msm),
    ])

    # Score on both halves after fitting on the training half only.
    pipeline.fit(train)
    print(pipeline.score(train), pipeline.score(test))

    # NOTE(review): the original indentation was lost; the refit on the full
    # data set and the timescale printout are assumed to belong to the loop
    # body -- confirm against the original script.
    pipeline.fit(trajectories)
    print(msm.timescales_)
def cluster_frames():
    """Learn a patch dictionary from sample video frames and segment the video.

    Command-line driven (arguments are read with ``argparse``):

    * ``input_filename`` -- video passed to ``util.grab_frame``.
    * ``data_proportion`` -- optional float (default 1.0); fraction of the
      extracted patch rows used for training.
    * ``--log`` -- name of a ``logging`` level (default ``INFO``).
    * ``-o/--output_filename`` -- when given, the segmented frames are passed
      to ``write_frames_to_disk``; otherwise to ``display_frames``.

    The function seeds NumPy's global RNG with 0, extracts ``d x d`` patches
    (``d = 6``) from six hard-coded frame indices, fits a
    CoatesScaler -> ZCA -> KMeans(100 clusters) pipeline on a random sample of
    the patch rows, then predicts a cluster index for every patch of every
    frame and renders the indices through the ``Set1`` colormap.

    Raises:
        ValueError: if ``--log`` does not name an integer attribute of the
            ``logging`` module.
    """
    seed = 0
    np.random.seed(seed)

    parser = argparse.ArgumentParser()
    parser.add_argument('input_filename')
    parser.add_argument("data_proportion", nargs='?', type=float, default=1.,
                        help="Proportion of full dataset to be used")
    parser.add_argument("--log", type=str, default='INFO',
                        help="Logging setting (e.g., INFO, DEBUG)")
    parser.add_argument('-o', '--output_filename',
                        help='Filename of video to be saved (default: does not save)')
    args = parser.parse_args()

    # Setting logging parameters
    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        # Report the value the user actually passed; the previous code
        # referenced an undefined name here and raised NameError instead.
        raise ValueError('Invalid log level: %s' % args.log)
    logging.basicConfig(level=numeric_level, format='%(asctime)s %(message)s')

    sample_inds = [212, 699, 988, 1105, 2190, 2318]
    logging.info('Loading %i images... ', len(sample_inds))

    # Load data
    d = 6  # size of patch
    all_frames = util.grab_frame(args.input_filename)
    im_originals = list(util.index(all_frames, sample_inds))
    im_height, im_width = im_originals[0].shape[:2]
    # One flattened row per window; presumably (d, d) is the window shape and
    # (1, 1) the step between windows -- confirm against util.yield_windows.
    all_patch_rows = np.array([
        patch.ravel()
        for im in im_originals
        for patch in util.yield_windows(im, (d, d), (1, 1))
    ])
    logging.info('Loaded %i examples from %i images',
                 len(all_patch_rows), len(im_originals))

    # Randomly sample a subset of the data. np.random.choice samples with
    # replacement by default, so a row may be selected more than once.
    sample_size = int(args.data_proportion * len(all_patch_rows))
    inds = np.random.choice(len(all_patch_rows), sample_size)
    X = all_patch_rows[inds]
    logging.info('Sampled %.1f%% of dataset = %i',
                 100 * args.data_proportion, sample_size)

    ############################# Define pipeline #############################
    # Each entry is (estimator class, constructor kwargs). Only the entries
    # listed in `steps` below are instantiated; the rest are unused here.
    std_scaler = (sklearn.preprocessing.StandardScaler, {})
    coates_scaler = (CoatesScaler.CoatesScaler, {})
    pca = (sklearn.decomposition.PCA, {'whiten': True, 'copy': True})
    zca = (ZCA.ZCA, {'regularization': .1})
    n_clusters = 100
    mbkmeans = (sklearn.cluster.MiniBatchKMeans, {
        'n_clusters': n_clusters,
        'batch_size': 3000,
    })
    skmeans = (SphericalKMeans.SphericalKMeans, {
        'n_clusters': n_clusters,
        'max_iter': 10,
    })
    kmeans = (sklearn.cluster.KMeans, {
        'n_clusters': n_clusters,
        #'random_state': np.random.RandomState,
        #'n_jobs': -1,
        #'n_init': 1,
        #'max_iter': 10,
    })

    # Define pipeline
    steps = [coates_scaler, zca, kmeans]
    pipeline = sklearn.pipeline.make_pipeline(
        *[fun(**kwargs) for fun, kwargs in steps])

    # Define pointers to certain steps for future processing
    whitener = pipeline.steps[1][1]  # second step
    dic = pipeline.steps[-1][1]  # last step

    # Re-derive (class, params) from the built pipeline so the printout
    # includes the full parameter set, not only the explicit kwargs.
    steps = [(obj.__class__, obj.get_params()) for _, obj in pipeline.steps]
    util.print_steps(steps)

    ######################### Train pipeline ##################################
    logging.info('Training model...')
    pipeline.fit(X)
    logging.info('done.')

    ######################### Display atoms of dictionary #####################
    frames = util.grab_frame(args.input_filename)
    # Lazy: patches for a frame are only extracted when that frame is consumed.
    patch_row_chunks = (
        np.array(list(
            patch.ravel()
            for patch in util.yield_windows(im, (d, d), (1, 1))))
        for im in frames)

    def im_displays():
        """Yield one uint8 RGB segmentation image per frame of the video."""
        for patch_rows in patch_row_chunks:
            y = pipeline.predict(patch_rows)

            # Map to [0, 1) so that imshow scales across entire colormap spectrum
            # NOTE(review): relies on true division; under Python 2 without
            # `from __future__ import division` this would truncate -- confirm.
            y = y / n_clusters

            newshape = (im_height - d + 1, im_width - d + 1, )
            segmentation = np.reshape(y, newshape)

            # Apply color map and remove alpha channel
            cmap = plt.cm.Set1
            colored_segmentation = cmap(segmentation)[:, :, :3]
            colored_segmentation = (colored_segmentation * 255).astype(np.uint8)

            yield colored_segmentation

    #frames = itertools.islice(im_displays(), 5)
    frames = im_displays()

    save_video = args.output_filename is not None
    if save_video:
        write_frames_to_disk(frames, args.output_filename)
    else:
        display_frames(frames)

    # NOTE(review): the original indentation was lost; this bare return is
    # assumed to sit at function level, which leaves everything below it
    # unreachable. Confirm before deleting or re-enabling the plotting code.
    return

    logging.info('Displaying atoms of dictionary')

    # Inverse whiten atoms of dictionary
    atom_rows = dic.cluster_centers_
    if hasattr(whitener, 'inverse_transform'):
        atom_rows = whitener.inverse_transform(atom_rows)

    plt.figure()
    for i, atom_row in enumerate(atom_rows):
        # NOTE(review): this flips the first axis, whereas the sample patches
        # below flip the last (channel) axis -- possibly unintended; confirm.
        patch = atom_row.reshape(d, d, -1)[::-1]
        plt.subplot(10, 10, i + 1)
        plt.imshow(patch, interpolation='nearest')
        plt.xticks(())
        plt.yticks(())

    plt.suptitle('Atoms of dictionary learnt from %i patches by %s' % \
                 (len(atom_rows), dic.__class__.__name__))

    plt.figure()
    displayed_patches = X[np.random.choice(len(X), 100)]
    for i, patch in enumerate(displayed_patches):
        plt.subplot(10, 10, i + 1)
        plt.imshow(patch.reshape([d, d, -1])[:, :, ::-1], interpolation='nearest')
        plt.xticks(())
        plt.yticks(())

    plt.show()
import sklearn.pipeline, sklearn.externals.joblib import mixtape.utils # Copy paste from optimize ala n_timescales = 4 n_states = 6 tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time) msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_timescales) cluster = mixtape.cluster.GMM(n_components=n_states, covariance_type='full') feature_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica)]) cluster_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster)]) pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster), ("msm", msm)]) pipeline.fit(train) print(pipeline.score(train), pipeline.score(test)) X_all = feature_pipeline.transform(trajectories) q = np.concatenate(X_all) covars_ = cluster.covars_ covars_ = cluster.covars_.diagonal(axis1=1, axis2=2) for i, j in [(0, 1)]: figure() title("%d" % n_states) hexbin(q[:,i], q[:, j], bins='log') errorbar(cluster.means_[:, i], cluster.means_[:, j], xerr=covars_[:,i] ** 0.5, yerr=covars_[:, j] ** 0.5, fmt='kx', linewidth=4) offset = np.ones(2) * 0.05 for state in range(n_states):