Code Example #1
    def test_classification_workflow(self):
        task = openml.tasks.get_task(254)
        X, y = task.get_X_and_y()

        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                     train_size=0.5,
                                                     test_size=0.5)

        X_train = scipy.sparse.csc_matrix(X_train)
        X_test = scipy.sparse.csc_matrix(X_test)

        pipeline = sklearn.pipeline.Pipeline((
            ('shift', CategoryShift()),
            ('imput', SimpleImputer(strategy='constant', fill_value=2)),
            ('ohe', SparseOneHotEncoder()),
            ('tree', DecisionTreeClassifier(random_state=1)),
        ))

        pipeline.fit(X_train, y_train)
        pred_train = pipeline.predict(X_train)
        self.assertTrue((pred_train == y_train).all())
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        pred_test = pipeline.predict(X_test)
        self.assertTrue((pred_test == y_test).all())
Code Example #2
    def runClassifications(self, estimatorDict):

        for estimatorName in estimatorDict:

            print(estimatorName + " in Progress...")
            print("Selecting K-best features")
            select = sklearn.feature_selection.SelectKBest(k=40000)

            clf = estimatorDict[estimatorName]

            steps = [('feature_selection', select), ('random_forest', clf)]

            #Implementing Pipeline for Report Generation
            pipeline = sklearn.pipeline.Pipeline(steps)
            pipeline.fit(self.prcs.X_train, self.prcs.y_train)
            # Predict through the pipeline so SelectKBest is applied to X_test
            Pred_y = pipeline.predict(self.prcs.X_test)

            Accuracy = np.mean(Pred_y == self.prcs.y_test)

            report = sklearn.metrics.classification_report(
                self.prcs.y_test, Pred_y)
            #Preparing the report using pipeline
            print(report)

            print("Model's Accuracy: %f" % Accuracy)

            print('Prediction in progress...')

            y_fp = pipeline.predict(self.prcs.Pred_x)
            #Decoding the labels
            self.prcs.Decode_Labels(y_fp,
                                    estimatorName + '_PredictedOut' + '.csv')
Code Example #3
def main(args):
    # Use the digits dataset.
    dataset = MNIST(data_size=5000)

    # Split the dataset into a train set and a test set.
    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        dataset.data, dataset.target, test_size=args.test_size, random_state=args.seed)

    features = []
    if args.original:
        features.append(("original", sklearn.preprocessing.FunctionTransformer()))
    if args.rff:
        features.append(("rff", RFFsTransformer(args.rff, args.gamma, args.seed)))
    if args.nystroem:
        features.append(("nystroem", NystroemTransformer(args.nystroem, args.gamma, args.seed)))

    if args.svm:
        classifier = sklearn.svm.SVC()
    else:
        classifier = sklearn.linear_model.LogisticRegression(solver="saga", penalty="none", max_iter=args.max_iter, random_state=args.seed)

    pipeline = sklearn.pipeline.Pipeline([
        ("scaling", sklearn.preprocessing.MinMaxScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ])
    pipeline.fit(train_data, train_target)

    test_accuracy = sklearn.metrics.accuracy_score(test_target, pipeline.predict(test_data))
    return test_accuracy
Code Example #4
 def test_two_estimators_predict_proba1(self):
     pipeline = (
         StandardScaler() >>
         (PCA() & Nystroem() & PassiveAggressiveClassifier()) >>
         ConcatFeatures() >> NoOp() >> PassiveAggressiveClassifier())
     pipeline.fit(self.X_train, self.y_train)
     pipeline.predict_proba(self.X_test)
Code Example #5
def score_solution(model, save=0):
    '''
    Added model and save parameters:
        model ~ the classification model to evaluate
        save ~ flag; when set, saves the fitted pipeline to file using joblib
    '''
    # Ask the solution for the model pipeline.
    import solution
    pipeline = solution.get_pipeline(model)
    error_message = 'Your `solution.get_pipeline` implementation should ' \
        'return an `sklearn.pipeline.Pipeline`.'
    assert isinstance(pipeline, sklearn.pipeline.Pipeline), error_message
    # Train the model on the training DataFrame.
    X_train, y_train = get_data(subset='train')
    pipeline.fit(X_train, y_train)
    # Apply the model to the test DataFrame.
    X_test, y_test = get_data(subset='test')
    y_pred = pipeline.predict_proba(X_test)
    # Check that the predicted probabilities have an sklearn-compatible shape.
    assert (y_pred.ndim == 1) or \
        (y_pred.ndim == 2 and y_pred.shape[1] == 2), \
        "The predicted probabilities should match sklearn's " \
        '`predict_proba` output shape.'
    y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, 1]
    # Evaluate the predictions with the AUC of the ROC curve.
    if (save == 1): joblib.dump(pipeline, 'Best_Estimator.sav')
    return sklearn.metrics.roc_auc_score(y_test, y_pred)
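The `save` flag above relies on joblib persistence. A minimal, self-contained sketch of saving and reloading a fitted pipeline (the toy data here is illustrative, not part of the original solution):

import joblib
import sklearn.datasets
import sklearn.linear_model
import sklearn.pipeline
import sklearn.preprocessing

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
fitted = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.linear_model.LogisticRegression(max_iter=1000),
).fit(X, y)

joblib.dump(fitted, 'Best_Estimator.sav')     # what save=1 triggers above
restored = joblib.load('Best_Estimator.sav')  # reload later for scoring
print(restored.predict_proba(X[:5])[:, 1])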
Code Example #6
File: feat_transform.py  Project: cjabswalsh/MLProj1
def make_model():
    dataset_path = 'data_sneaker_vs_sandal'
    x_all_d = pd.read_csv(os.path.join(dataset_path, 'x_train.csv'))
    x_all = x_all_d.values
    A, F = x_all.shape

    x_train_NF = x_all[:9000]
    N = 9000
    x_valid_MF = x_all[9000:]
    M = 3000

    y_all_d = pd.read_csv(os.path.join(dataset_path, 'y_train.csv'))
    y_all = y_all_d.values.reshape((A, ))
    y_train_N = y_all[:9000]
    y_valid_M = y_all[9000:]

    print("loaded data")
    feature_tfmr = sklearn.pipeline.FeatureUnion(transformer_list=[
        ('orig',
         sklearn.preprocessing.PolynomialFeatures(degree=2, include_bias=False)
         ),
    ])
    classifier = sklearn.linear_model.LogisticRegression(C=1.0,
                                                         solver='lbfgs',
                                                         max_iter=1000)
    pipeline = sklearn.pipeline.Pipeline([('step1', feature_tfmr),
                                          ('step2', classifier)])
    print("made pipeline")
    pipeline.fit(x_train_NF, y_train_N)

    print("fit pipeline")
    err = sklearn.metrics.zero_one_loss(y_valid_M,
                                        pipeline.predict(x_valid_MF) >= 0.5)
    print(err)
Code Example #7
    def test_classification_workflow(self):
        X, y = sklearn.datasets.fetch_openml(data_id=24, as_frame=False, return_X_y=True)
        print(type(X))

        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                     train_size=0.5,
                                                     test_size=0.5)

        X_train = scipy.sparse.csc_matrix(X_train)
        X_test = scipy.sparse.csc_matrix(X_test)

        pipeline = sklearn.pipeline.Pipeline((
            ('shift', CategoryShift()),
            ('imput', SimpleImputer(strategy='constant', fill_value=2)),
            ('ohe', SparseOneHotEncoder()),
            ('tree', DecisionTreeClassifier(random_state=1)),
            ))

        pipeline.fit(X_train, y_train)
        pred_train = pipeline.predict(X_train)
        self.assertTrue((pred_train == y_train).all())
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        pred_test = pipeline.predict(X_test)
        self.assertTrue((pred_test == y_test).all())
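CategoryShift and SparseOneHotEncoder are project-specific helpers; a rough sketch of the same workflow using only built-in scikit-learn components (toy data, no claim of reproducing the test above) might look like:

import numpy as np
import sklearn.impute
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.tree

# Small matrix of integer-coded categories with a missing value
X = np.array([[0, 1], [1, 0], [2, 1], [np.nan, 0]])
y = np.array([0, 1, 0, 1])

clf = sklearn.pipeline.Pipeline([
    ('imputer', sklearn.impute.SimpleImputer(strategy='constant', fill_value=-1)),
    ('ohe', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')),
    ('tree', sklearn.tree.DecisionTreeClassifier(random_state=1)),
])
clf.fit(X, y)
print(clf.predict(X))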
Code Example #8
    def test_sklearn(self, seed, experiment_run, strs):
        np = pytest.importorskip("numpy")
        sklearn = pytest.importorskip("sklearn")
        from sklearn import cluster, naive_bayes, pipeline, preprocessing

        np.random.seed(seed)
        key = strs[0]
        num_data_rows = 36
        X = np.random.random((num_data_rows, 2))
        y = np.random.randint(10, size=num_data_rows)

        pipeline = sklearn.pipeline.make_pipeline(
            sklearn.preprocessing.StandardScaler(),
            sklearn.cluster.KMeans(),
            sklearn.naive_bayes.GaussianNB(),
        )
        pipeline.fit(X, y)

        experiment_run.log_model(pipeline)
        retrieved_pipeline = experiment_run.get_model()

        assert np.allclose(pipeline.predict(X), retrieved_pipeline.predict(X))

        assert len(pipeline.steps) == len(retrieved_pipeline.steps)
        for step, retrieved_step in zip(pipeline.steps, retrieved_pipeline.steps):
            assert step[0] == retrieved_step[0]  # step name
            assert step[1].get_params() == retrieved_step[1].get_params()  # step model
Code Example #9
def feature_importance_on_openml_task(task: openml.tasks.OpenMLSupervisedTask,
                                      n_trees: int,
                                      random_state: int) -> pd.DataFrame:
    X, y = task.get_X_and_y()

    pipeline = sklearn.pipeline.make_pipeline(
        sklearn.impute.SimpleImputer(strategy='median'),
        sklearn.ensemble.RandomForestClassifier(n_estimators=n_trees,
                                                random_state=random_state))

    pipeline.fit(X, y)
    importances = pipeline.steps[-1][-1].feature_importances_
    if len(importances) != X.shape[1]:
        raise ValueError(
            'Did not obtain feature importance for all attributes, '
            'probably due to constant or missing values')

    features = task.get_dataset().features

    results = list()
    for idx, importance in enumerate(importances):
        results.append({
            'idx': idx,
            'importance': importance,
            'name': features[idx].name,
            'data_type': features[idx].data_type
        })
    df = pd.DataFrame(results)
    df = df.set_index('idx')
    return df
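The `pipeline.steps[-1][-1]` lookup above pulls the fitted estimator out of the pipeline; `named_steps` does the same by name. A small self-contained sketch on the iris data (step names here are illustrative):

import sklearn.datasets
import sklearn.ensemble
import sklearn.impute
import sklearn.pipeline

X, y = sklearn.datasets.load_iris(return_X_y=True)
pipe = sklearn.pipeline.Pipeline([
    ('impute', sklearn.impute.SimpleImputer(strategy='median')),
    ('forest', sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=0)),
])
pipe.fit(X, y)

# Both expressions return the same fitted RandomForestClassifier
assert pipe.steps[-1][-1] is pipe.named_steps['forest']
print(pipe.named_steps['forest'].feature_importances_)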
Code Example #10
File: test_core_pipeline.py  Project: IBM/lale
 def test_multiple_estimators_predict_predict_proba(self):
     pipeline = (StandardScaler() >>
                 (LogisticRegression() & PCA()) >> ConcatFeatures() >>
                 (NoOp() & LinearSVC()) >> ConcatFeatures() >>
                 KNeighborsClassifier())
     pipeline.fit(self.X_train, self.y_train)
     _ = pipeline.predict_proba(self.X_test)
     _ = pipeline.predict(self.X_test)
Code Example #11
def train_knn(train_X, train_y):
    # Features selected with forward selection:
    columns = ['education_num', 'marital_status_Married-civ-spouse',
               'net_capital']
    # Same arrangement as train_naive_bayes:
    idxs = [train_X.columns.get_loc(c) for c in columns]
    pipeline = sklearn.pipeline.make_pipeline(
        sklearn.preprocessing.FunctionTransformer(lambda x: x[:, idxs]),
        sklearn.neighbors.KNeighborsClassifier(11, weights="distance", n_jobs=-1),
    )
    pipeline.fit(train_X, train_y)
    return pipeline
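The lambda inside FunctionTransformer works but cannot be pickled; an alternative sketch that selects the same columns by name with ColumnTransformer (assuming the training data is a pandas DataFrame containing those columns):

import pandas as pd
import sklearn.compose
import sklearn.neighbors
import sklearn.pipeline

def train_knn_ct(train_X: pd.DataFrame, train_y):
    columns = ['education_num', 'marital_status_Married-civ-spouse',
               'net_capital']
    # 'passthrough' keeps the selected columns as-is and drops the rest
    selector = sklearn.compose.ColumnTransformer(
        [('keep', 'passthrough', columns)], remainder='drop')
    pipeline = sklearn.pipeline.make_pipeline(
        selector,
        sklearn.neighbors.KNeighborsClassifier(11, weights='distance', n_jobs=-1),
    )
    pipeline.fit(train_X, train_y)
    return pipeline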
Code Example #12
File: utils.py  Project: cthorey/rl_playground
    def gen_transformer(self, stransformer_path):
        scaler = sklearn.preprocessing.StandardScaler()
        featurizer = sklearn.pipeline.FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
        pipeline = Pipeline([('scaler', scaler), ('feat', featurizer)])

        env = gym.envs.make(ENV_NAME)
        obs = np.array([env.observation_space.sample() for x in range(10000)])
        pipeline.fit(obs)
        pickle.dump(pipeline, open(stransformer_path, 'wb+'))
Code Example #13
def train_model():
    select = SelectKBest(k=10)
    train = load_train_set()
    test = load_test_set()
    target = 'condition'
    hrv_features = list(train)
    hrv_features = [x for x in hrv_features if x not in [target]]

    classifiers = [
        #MultinomialNB(),
        #SVC(C=20, kernel='rbf'),
        RandomForestClassifier()
    ]
    for clf in classifiers:
        count_time = time.time()
        X_train = train[hrv_features]
        y_train = train[target]
        X_test = test[hrv_features]
        y_test = test[target]

        name = str(clf).split('(')[0]
        """if 'multinomialnb'==name.lower():
            scaler = MinMaxScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        else:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)"""
        print(name)
        """steps = [('feature_selection', select),
             ('model', clf)]"""
        steps = [('scaler', StandardScaler()), ('feature_selection', select),
                 ('model', clf)]
        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(X_train, y_train)
        y_prediction = pipeline.predict(X_test)
        print("----------------------------{0}---------------------------".
              format(name))
        print(sklearn.metrics.classification_report(y_test, y_prediction))
        count_time = time.time() - count_time
        print("time: ", count_time)
        print()
        print()
        pickle.dump(pipeline, open('model_stress.pkl', 'wb'))
        #joblib.dump(pipeline, 'model_stress.pkl')
        print("done")
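The loop above pickles the fitted pipeline to disk; a short sketch of loading it back for inference (file name taken from the code above):

import pickle

with open('model_stress.pkl', 'rb') as fh:
    model = pickle.load(fh)
# model is the full pipeline (scaler + SelectKBest + classifier),
# so raw feature rows can be passed straight to predict:
# predictions = model.predict(new_rows)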
Code Example #14
def report(clf, features_train, features_test, labels_train, labels_test):
    ## input:
    #   clf: the classifier to evaluate
    ## output: classification report with precision, recall, F1 score and accuracy
    steps = [('classifier', clf)]

    pipeline = sklearn.pipeline.Pipeline(steps)

    pipeline.fit(features_train, labels_train)

    y_prediction = pipeline.predict( features_test )

    report = sklearn.metrics.classification_report( labels_test, y_prediction )

    return report
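Assuming the `report` helper above is in scope (together with its sklearn.pipeline and sklearn.metrics imports), a short usage sketch on a toy split might look like:

import sklearn.datasets
import sklearn.model_selection
import sklearn.tree

X, y = sklearn.datasets.load_iris(return_X_y=True)
features_train, features_test, labels_train, labels_test = \
    sklearn.model_selection.train_test_split(X, y, random_state=0)

clf = sklearn.tree.DecisionTreeClassifier(random_state=0)
print(report(clf, features_train, features_test, labels_train, labels_test))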
Code Example #15
File: test_utils.py  Project: jchodera/mixtape
def test_subsampler_tica():
    n_traj, n_samples, n_features = 1, 500, 4
    lag_time = 2
    X_all_0 = [random.normal(size=(n_samples, n_features)) for i in range(n_traj)]
    tica_0 = mixtape.tica.tICA(lag_time=lag_time)
    tica_0.fit(X_all_0)

    subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
    tica_1 = mixtape.tica.tICA()
    pipeline = sklearn.pipeline.Pipeline([("subsampler", subsampler), ('tica', tica_1)])    
    pipeline.fit(X_all_0)

    eq(tica_0.n_features, tica_1.n_features)  # Obviously true
    eq(tica_0.n_observations_, tica_1.n_observations_)
    eq(tica_0.eigenvalues_, tica_1.eigenvalues_)  # The eigenvalues should be the same.  NOT the timescales, as tica_1 has timescales calculated in a different time unit
Code Example #16
File: test_core_pipeline.py  Project: IBM/lale
    def test_pipeline_create(self):
        from lale.operators import Pipeline

        pipeline = Pipeline([("pca1", PCA()), ("lr1", LogisticRegression())])
        trained = pipeline.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)
        accuracy_score(self.y_test, predictions)
Code Example #17
File: test_pipeline.py  Project: konggas/dasklearn
def test_pipeline():
    pipeline = dl.Pipeline([("scale", StandardScaler()),
                            ("fdr", SelectFdr()),
                            ("svm", LinearSVC())])

    pipeline = pipeline.fit(X, y)
    y2 = pipeline.predict(X)
    score = pipeline.score(X, y)

    assert isinstance(y2, di.Value)
    assert isinstance(score, di.Value)

    assert isinstance(score.compute(), float)

    assert pipeline.score(X, y).key == pipeline.score(X, y).key
    assert score.compute() == score.compute()

    y22 = y2.compute()
    assert y22.shape == y.shape
    assert y22.dtype == y.dtype
    skpipeline = sklearn.pipeline.Pipeline([("scale", StandardScaler()),
                                            ("fdr", SelectFdr()),
                                            ("svm", LinearSVC())])

    skpipeline.fit(X, y)
    sk_y2 = skpipeline.predict(X)
    sk_score = skpipeline.score(X, y)
    assert sk_score == score.compute()
Code Example #18
File: test_json_pretty_viz.py  Project: IBM/lale
    def test_autoai_libs_tam_4(self):
        import autoai_libs.cognito.transforms.transform_utils
        import numpy as np
        import sklearn.decomposition
        import sklearn.linear_model
        import sklearn.pipeline

        import lale.helpers
        import lale.operators
        import lale.pretty_print

        sklearn_pipeline = sklearn.pipeline.make_pipeline(
            autoai_libs.cognito.transforms.transform_utils.TAM(
                tans_class=sklearn.decomposition.PCA(),
                name="pca",
                col_names=["a", "b", "c"],
                col_dtypes=[
                    np.dtype("float32"),
                    np.dtype("float32"),
                    np.dtype("float32"),
                ],
            ),
            sklearn.linear_model.LogisticRegression(solver="liblinear",
                                                    multi_class="ovr"),
        )
        pipeline = lale.helpers.import_from_sklearn_pipeline(sklearn_pipeline,
                                                             fitted=False)
        assert isinstance(pipeline, lale.operators.TrainableOperator)

        expected = """from autoai_libs.cognito.transforms.transform_utils import TAM
from sklearn.decomposition import PCA
import numpy as np
from sklearn.linear_model import LogisticRegression
import lale

lale.wrap_imported_operators()
tam = TAM(
    tans_class=PCA(),
    name="pca",
    col_names=["a", "b", "c"],
    col_dtypes=[
        np.dtype("float32"), np.dtype("float32"), np.dtype("float32"),
    ],
)
logistic_regression = LogisticRegression(
    multi_class="ovr", solver="liblinear"
)
pipeline = tam >> logistic_regression"""
        self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
        import numpy as np
        import pandas as pd

        test = pd.DataFrame(
            np.random.randint(0, 100, size=(15, 3)),
            columns=["a", "b", "c"],
            dtype=np.dtype("float32"),
        )
        trained = pipeline.fit(test.to_numpy(),
                               [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1])
        trained.predict(test.to_numpy())
Code Example #19
def create(x_train, y_train, **_):
    '''Create zero-hypothesis predictor.
    '''

    # Even though DummyClassifier does not use the incoming data,
    # sklearn will raise an error if the data provided to the
    # classifier is not flat.
    transform = sklearn.preprocessing.FunctionTransformer(
        lambda X: np.zeros((len(X), 1)),
        validate=False,
    )

    classifier = sklearn.dummy.DummyClassifier(
        strategy='constant',
        constant=audiolabel.preprocess.k_hot_encode(['/m/04rlf']),
    )

    pipeline = sklearn.pipeline.Pipeline([
        ('transform', transform),
        ('classifier', classifier),
    ])

    predictor = pipeline.fit(x_train, y_train)

    return audiolabel.util.Predictor(predictor.predict)
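For comparison, a minimal majority-class baseline on plain 2-D features needs no flattening transform at all (synthetic data, illustrative only):

import numpy as np
import sklearn.dummy

X = np.random.random((20, 3))
y = np.array([0] * 15 + [1] * 5)

baseline = sklearn.dummy.DummyClassifier(strategy='most_frequent')
baseline.fit(X, y)
print(baseline.score(X, y))  # 0.75: accuracy of always predicting the majority class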
Code Example #20
File: test_json_pretty_viz.py  Project: krprls/lale
    def test_autoai_libs_tam_4(self):
        import autoai_libs.cognito.transforms.transform_utils
        import lale.helpers
        import lale.pretty_print
        import numpy as np
        import sklearn.decomposition
        import sklearn.linear_model
        import sklearn.pipeline
        sklearn_pipeline = sklearn.pipeline.make_pipeline(
            autoai_libs.cognito.transforms.transform_utils.TAM(tans_class=sklearn.decomposition.PCA(), name='pca', col_names=['a', 'b', 'c'], col_dtypes=[np.dtype('float32'), np.dtype('float32'), np.dtype('float32')]),
            sklearn.linear_model.LogisticRegression(solver='liblinear', multi_class='ovr'))
        pipeline = lale.helpers.import_from_sklearn_pipeline(sklearn_pipeline, fitted=False)
        expected = \
"""from lale.lib.autoai_libs import TAM
from lale.lib.sklearn import PCA
import numpy as np
from lale.lib.sklearn import LogisticRegression
import lale
lale.wrap_imported_operators()

tam = TAM(tans_class=PCA(), name='pca', col_names=['a', 'b', 'c'], col_dtypes=[np.dtype('float32'), np.dtype('float32'), np.dtype('float32')])
pipeline = tam >> LogisticRegression()"""
        self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
        import pandas as pd
        import numpy as np
        test = pd.DataFrame(np.random.randint(0,100,size=(15, 3)), columns=['a','b','c'], dtype=np.dtype('float32'))
        trained = pipeline.fit(test.to_numpy(), [0,1,1,0,0,0,1,1,1,1,0,0,1,0,1])
        trained.predict(test.to_numpy())
Code Example #21
File: KNN.py  Project: Nehabhoi/KNN
def knn(X, y, K, test_ratio=0.2):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=0,
                                                        test_size=test_ratio)

    sc = StandardScaler()
    X_train_scaled = sc.fit_transform(X_train)
    # transform (not fit_transform): the scaler must only be fitted on training data
    X_test_scaled = sc.transform(X_test)

    pipeline = make_pipeline(StandardScaler(),
                             KNeighborsClassifier(n_neighbors=K))
    model = pipeline.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    print("Result of KNN WITH PREPROCESSING using K={} is: \n".format(K))
    labels = np.unique(y)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print("Confusion Matrix: \n", cm)
    sns.heatmap(cm,
                annot=True,
                fmt='d',
                xticklabels=labels,
                yticklabels=labels)

    print("Classification report: \n", classification_report(y_test, y_pred))
    print("Test accuracy: ", metrics.accuracy_score(y_test, y_pred))
Code Example #22
def test_sklearn_random_forest_newsgroups():
    # note: this test used to fail in native TreeExplainer code due to memory corruption
    newsgroups_train, newsgroups_test, _ = create_binary_newsgroups_data()
    pipeline = create_random_forest_vectorizer()
    pipeline.fit(newsgroups_train.data, newsgroups_train.target)
    rf = pipeline.named_steps['rf']
    vectorizer = pipeline.named_steps['vectorizer']
    densifier = pipeline.named_steps['to_dense']

    dense_bg = densifier.transform(vectorizer.transform(newsgroups_test.data[0:20]))

    test_row = newsgroups_test.data[83:84]
    explainer = shap.TreeExplainer(rf, dense_bg, feature_perturbation="interventional")
    vec_row = vectorizer.transform(test_row)
    dense_row = densifier.transform(vec_row)
    explainer.shap_values(dense_row)
Code Example #24
    def test_classification_workflow(self):
        task = openml.tasks.get_task(254)
        X, y = task.get_X_and_y()

        ohe = OneHotEncoder(categorical_features=[True]*22)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
        pipeline = sklearn.pipeline.Pipeline((('ohe', ohe), ('tree', tree)))

        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                     train_size=0.5,
                                                     test_size=0.5)
        pipeline.fit(X_train, y_train)
        self.assertEqual(np.mean(y_train == pipeline.predict(X_train)), 1)
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        self.assertEqual(np.mean(y_test == pipeline.predict(X_test)), 1)
Code Example #26
def run_pipeline(df, pipeline, pipeline_name=''):
    X = pd.Series(df["text"])
    y = preprocessing.LabelEncoder().fit_transform(df.author.values)

    rskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    losses = []
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict_proba(X_test)
        log_loss = metrics.log_loss(y_test, predictions)
        losses.append(log_loss)
        print(" Log loss: " + str(log_loss))
        print(" Accuracy : %0.3f " % calculate_accuracy(y_test, predictions))

    print(f'{pipeline_name} mean log loss: {round(pd.np.mean(losses), 3)}')
Code Example #27
def main(args):
    dataset = getattr(sklearn.datasets, "load_{}".format(args.dataset))()

    X = np.array(dataset.data)
    Y = np.array(dataset.target)

    # TODO: Split the dataset into a train set and a test set.
    # Use `sklearn.model_selection.train_test_split` method call, passing
    # arguments `test_size=args.test_size, random_state=args.seed`.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=args.test_size, random_state=args.seed)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # TODO: Process the input columns in the following way:
    #
    # - if a column has only integer values, consider it a categorical column
    #   (days in a week, dog breed, ...; in general integer values can also
    #   represent numerical non-categorical values, but we use this assumption
    #   for the sake of an exercise). Encode the values with one-hot encoding
    #   using `sklearn.preprocessing.OneHotEncoder` (note that its output is by
    #   default sparse, you can use `sparse=False` to generate dense output;
    #   also use `handle_unknown="ignore"` to ignore missing values in test set).
    #
    # - for the rest of the columns, normalize their values so that they
    #   have mean 0 and variance 1; use `sklearn.preprocessing.StandardScaler`.
    #
    # In the output, there should be first all the one-hot categorical features,
    # and then the real-valued features. To process different dataset columns
    # differently, you can use `sklearn.compose.ColumnTransformer`.

    # Check categorical columns
    categ_check = np.all(X.astype(int) == X, axis=0)
    categ_colnames = [i for i, x in enumerate(categ_check) if x]
    non_categ_colnames = [i for i, x in enumerate(categ_check) if not x]

    col_trans = sklearn.compose.ColumnTransformer([
        ('1hot',
         sklearn.preprocessing.OneHotEncoder(sparse=False,
                                             handle_unknown='ignore'),
         categ_colnames),
        ('standard', sklearn.preprocessing.StandardScaler(),
         non_categ_colnames)
    ])

    # TODO: Generate polynomial features of order 2 from the current features.
    # If the input values are [a, b, c, d], you should generate
    # [a^2, ab, ac, ad, b^2, bc, bd, c^2, cd, d^2]. You can generate such polynomial
    # features either manually, or using
    # `sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)`.
    poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)
    pipeline = sklearn.pipeline.Pipeline([('col_trans', col_trans),
                                          ('poly', poly)])
    fit = pipeline.fit(X_train)
    train_data = fit.transform(X_train)
    test_data = fit.transform(X_test)

    return train_data, test_data
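The TODO comments above describe the intended preprocessing; a compact, self-contained sketch of the same idea on made-up data (the column choices are hard-coded here for brevity):

import numpy as np
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing

rng = np.random.RandomState(0)
# Column 0 is integer-valued (treated as categorical), column 1 is real-valued
X = np.column_stack([rng.randint(0, 3, size=20), rng.normal(size=20)])

col_trans = sklearn.compose.ColumnTransformer([
    ('1hot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'), [0]),
    ('standard', sklearn.preprocessing.StandardScaler(), [1]),
])
pipe = sklearn.pipeline.Pipeline([
    ('col_trans', col_trans),
    ('poly', sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)),
])
print(pipe.fit_transform(X).shape)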
Code Example #28
def test_sample_1():
    # Test that the code actually runs and gives something non-crazy
    # Make an ergodic dataset with two gaussian centers offset by 25 units.
    chunk = np.random.normal(size=(20000, 3))
    data = [np.vstack((chunk, chunk + 25)), np.vstack((chunk + 25, chunk))]

    clusterer = cluster.KMeans(n_clusters=2)
    msm = MarkovStateModel()
    pipeline = sklearn.pipeline.Pipeline([("clusterer", clusterer),
                                          ("msm", msm)])
    pipeline.fit(data)
    trimmed_assignments = pipeline.transform(data)

    # Now let's make the output assignments start with
    # zero at the first position.
    i0 = trimmed_assignments[0][0]
    if i0 == 1:
        for m in trimmed_assignments:
            m *= -1
            m += 1

    pairs = msm.draw_samples(trimmed_assignments, 2000)

    samples = map_drawn_samples(pairs, data)
    mu = np.mean(samples, axis=1)
    eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)

    # We should make sure we can sample from Trajectory objects too...
    # Create a fake topology with 1 atom to match our input dataset
    top = md.Topology.from_dataframe(pd.DataFrame({
        "serial": [0],
        "name": ["HN"],
        "element": ["H"],
        "resSeq": [1],
        "resName": "RES",
        "chainID": [0]
    }),
                                     bonds=np.zeros(shape=(0, 2), dtype='int'))
    # np.newaxis reshapes the data to have 40000 frames, 1 atom, 3 xyz coordinates
    trajectories = [md.Trajectory(x[:, np.newaxis], top) for x in data]

    trj_samples = map_drawn_samples(pairs, trajectories)
    mu = np.array([t.xyz.mean(0)[0] for t in trj_samples])
    eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)
Code Example #29
def main():
    X, Y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    # build pipeline
    pipeline = sklearn.pipeline.Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier())
    ])

      
        
    # train classifier
    pipeline.fit(X_train, y_train)

    # predict on test data
    y_pred = pipeline.predict(X_test)

    # display results
    display_results(y_test, y_pred)
Code Example #30
 def test_two_estimators_predict_proba(self):
     pipeline = (
         StandardScaler()
         >> (PCA() & Nystroem() & LogisticRegression())
         >> ConcatFeatures()
         >> NoOp()
         >> LogisticRegression()
     )
     trained = pipeline.fit(self.X_train, self.y_train)
     trained.predict_proba(self.X_test)
Code Example #31
File: test_msm.py  Project: back2mars/msmbuilder
def test_sample_1():
    # Test that the code actually runs and gives something non-crazy
    # Make an ergodic dataset with two gaussian centers offset by 25 units.
    chunk = np.random.normal(size=(20000, 3))
    data = [np.vstack((chunk, chunk + 25)), np.vstack((chunk + 25, chunk))]

    clusterer = cluster.KMeans(n_clusters=2)
    msm = MarkovStateModel()
    pipeline = sklearn.pipeline.Pipeline(
        [("clusterer", clusterer), ("msm", msm)]
    )
    pipeline.fit(data)
    trimmed_assignments = pipeline.transform(data)

    # Now let's make the output assignments start with
    # zero at the first position.
    i0 = trimmed_assignments[0][0]
    if i0 == 1:
        for m in trimmed_assignments:
            m *= -1
            m += 1

    pairs = msm.draw_samples(trimmed_assignments, 2000)

    samples = map_drawn_samples(pairs, data)
    mu = np.mean(samples, axis=1)
    eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)

    # We should make sure we can sample from Trajectory objects too...
    # Create a fake topology with 1 atom to match our input dataset
    top = md.Topology.from_dataframe(
        pd.DataFrame({
            "serial": [0], "name": ["HN"], "element": ["H"], "resSeq": [1],
            "resName": "RES", "chainID": [0]
        }), bonds=np.zeros(shape=(0, 2), dtype='int')
    )
    # np.newaxis reshapes the data to have 40000 frames, 1 atom, 3 xyz coordinates
    trajectories = [md.Trajectory(x[:, np.newaxis], top)
                    for x in data]

    trj_samples = map_drawn_samples(pairs, trajectories)
    mu = np.array([t.xyz.mean(0)[0] for t in trj_samples])
    eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)
Code Example #32
def train_naive_bayes(train_X, train_y):
    # GaussianNB (which I'm assuming is what 'naive_bayes' meant) has
    # seemingly no hyperparameters to speak of, so feature
    # selection/transformation is where I started.  The below features
    # were found via a couple runs of forward selection:
    columns = ['net_capital', 'education_Prof-school',
               'education_Doctorate', 'occupation_Transport-moving',
               'education_Masters', 'marital_status_Never-married',
               'education_Bachelors', 'relationship_Not-in-family',
               'occupation_Exec-managerial']
    # Turn columns to indices, as FunctionTransformer seems to receive
    # normal NumPy arrays (not dataframes):
    idxs = [train_X.columns.get_loc(c) for c in columns]
    pipeline = sklearn.pipeline.make_pipeline(
        sklearn.preprocessing.FunctionTransformer(lambda x: x[:, idxs]),
        sklearn.naive_bayes.GaussianNB(),
    )
    pipeline.fit(train_X, train_y)
    return pipeline
Code Example #33
def main(args):
    # make data for yourself
    X, y = sklearn.datasets.make_classification(n_samples=args.data_size)

    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed)

    features = []
    if args.original:
        # Acts as an identity transformer: with no function supplied,
        # it leaves the features unchanged.
        features.append(
            ("original", sklearn.preprocessing.FunctionTransformer()))
    if args.rff:
        features.append(("rff", RFFsTransformer(args.rff, args.gamma,
                                                args.seed)))
    if args.nystroem:
        features.append(("nystroem",
                         NystroemTransformer(args.nystroem, args.gamma,
                                             args.seed)))

    if args.svm:
        classifier = sklearn.svm.SVC()
    else:
        classifier = sklearn.linear_model.LogisticRegression(
            solver="saga",
            penalty="none",
            max_iter=args.max_iter,
            random_state=args.seed)

    pipeline = sklearn.pipeline.Pipeline([
        ("scaling", sklearn.preprocessing.StandardScaler()),
        ("features", sklearn.pipeline.FeatureUnion(features)),
        ("classifier", classifier),
    ])

    pipeline.fit(train_data, train_target)

    test_accuracy = sklearn.metrics.accuracy_score(test_target,
                                                   pipeline.predict(test_data))
    return test_accuracy
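RFFsTransformer and NystroemTransformer above are presumably custom implementations; scikit-learn ships comparable approximators in sklearn.kernel_approximation, so a hedged sketch using the built-ins could look like:

import sklearn.datasets
import sklearn.kernel_approximation
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing

X, y = sklearn.datasets.make_classification(n_samples=500, random_state=0)
X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0)

features = sklearn.pipeline.FeatureUnion([
    ('rff', sklearn.kernel_approximation.RBFSampler(gamma=0.1, n_components=100, random_state=0)),
    ('nystroem', sklearn.kernel_approximation.Nystroem(gamma=0.1, n_components=100, random_state=0)),
])
pipe = sklearn.pipeline.Pipeline([
    ('scaling', sklearn.preprocessing.StandardScaler()),
    ('features', features),
    ('classifier', sklearn.linear_model.LogisticRegression(max_iter=1000)),
])
pipe.fit(X_tr, y_tr)
print(sklearn.metrics.accuracy_score(y_te, pipe.predict(X_te)))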
Code Example #34
def test_subsampler_tica():
    n_traj, n_samples, n_features = 1, 500, 4
    lag_time = 2
    X_all_0 = [
        random.normal(size=(n_samples, n_features)) for i in range(n_traj)
    ]
    tica_0 = tICA(lag_time=lag_time)
    tica_0.fit(X_all_0)

    subsampler = Subsampler(lag_time=lag_time)
    tica_1 = tICA()
    pipeline = sklearn.pipeline.Pipeline([("subsampler", subsampler),
                                          ('tica', tica_1)])
    pipeline.fit(X_all_0)

    eq(tica_0.n_features, tica_1.n_features)  # Obviously true
    eq(tica_0.n_observations_, tica_1.n_observations_)
    eq(
        tica_0.eigenvalues_, tica_1.eigenvalues_
    )  # The eigenvalues should be the same.  NOT the timescales, as tica_1 has timescales calculated in a different time unit
Code Example #35
	def sklearn_pipeline(self, train_proportion=0.8, joke_limit=5000, debug=False):
		test_proportion = 1 - train_proportion

		### get random sample of jokes where joke["categories"] isn't empty
		jokes_to_use = random.sample(list(filter(lambda joke: joke["categories"], self._jokes)), joke_limit)

		### create CountVectorizer
		vectorizer = sklearn.feature_extraction.text.CountVectorizer(
			input="content",
			analyzer=u"word",
			token_pattern=r"\b\w+\b", # tokenize string by extracting words of at least 1 letter. I think default is r"\b\w{2,}\b"
			ngram_range=(1,1), # TODO: experiment with this
			binary=False,
		)

		### create data and target vectors
		X = vectorizer.fit_transform(joke["content"] for joke in jokes_to_use)
		y = np.fromiter((self._categoryIDs[joke["categories"][0]] for joke in jokes_to_use), np.int8)

		X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=test_proportion)

		### setting up pipeline. feel free to experiment here
		select = sklearn.feature_selection.SelectKBest(k=100)
		clf = sklearn.naive_bayes.MultinomialNB()
		steps = [("feature_selection", select),
		        ("naive_bayes", clf)]

		pipeline = sklearn.pipeline.Pipeline(steps)

		### fit your pipeline on X_train and y_train
		pipeline.fit(X_train, y_train)
		### call pipeline.predict() on your X_test data to make a set of test predictions
		y_prediction = pipeline.predict(X_test)
		### test your predictions using sklearn.classification_report()
		report = sklearn.metrics.classification_report(y_test, y_prediction)
		### and print the report
		print(report)
		print("overall accuracy: {:.2f}%".format(sklearn.metrics.accuracy_score(y_test, y_prediction) * 100))
		print()
		for index, category in enumerate(self._categories):
			print("{}: {} ({} jokes)".format(index, category, self._categories[category]))
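SelectKBest defaults to the ANOVA F-score; for sparse word-count features the chi-squared test is a common alternative, e.g. (a one-line variation on the pipeline above):

import sklearn.feature_selection
select = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.chi2, k=100)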
Code Example #36
File: test_pipeline.py  Project: konggas/dasklearn
def test_pipeline_shares_structure():
    pipeline = dl.Pipeline([("scale", StandardScaler()),
                            ("fdr", SelectFdr()),
                            ("svm", LinearSVC())])

    pipeline1 = pipeline.fit(X, y)
    score1 = pipeline1.score(X, y)

    pipeline2 = pipeline.set_params(svm__C=0.1)
    pipeline2 = pipeline2.fit(X, y)
    score2 = pipeline2.score(X, y)

    assert (len(merge(score1.dask, score2.dask))
         <= (len(score1.dask) + len(score2.dask)) * 0.75)
    assert score1.key != score2.key
Code Example #37
                  clf__min_samples_split=range(1,4,3),
                  clf__min_samples_leaf=range(2,4,1),
                  clf__min_weight_fraction_leaf=[0],

                    )
#grid_search = sklearn.grid_search.GridSearchCV(
#    pipeline, n_jobs=1, param_grid=param_grid, verbose=100,
#    scoring=youdenJ,score_func=youdenJ,
#    cv=sklearn.cross_validation.PredefinedSplit(testidx))
#grid_search.fit(trainFact[:,rfecv.support_], labels)
#results1=([sklearn.metrics.confusion_matrix(labels,grid_search.best_estimator_.predict(train))])
#grid_search_results1=(grid_search.grid_scores_)
#kwargs=grid_search.best_params_
#pipeline.set_params(**kwargs)

pipeline.fit(train[train.columns[rfecv.support_]],labels)

predictions=(pipeline.predict_proba(test[train.columns[rfecv.support_]])[:,1]>=0.02)*1
predictionstrain=(pipeline.predict_proba(train[train.columns[rfecv.support_]])[:,1]>=0.02)*1


print(Youdens_func(labels, predictionstrain))

# create predictions and submission file
sample['WnvPresent'] = predictions
sample.to_csv('testpredicts5.csv', index=False)

print(sum(predictions))


Code Example #38
File: pipeline.py  Project: ladyson/police-complaints
                        except:
                            print('Could not impute column:{}'.format(col))
                            continue

                    # Resample minority class
                    if over_sampler != "None":
                        X_resampled = np.array(X_train)
                        y_resampled = np.array(y_train)
                        X_resampled, y_resampled = os_object.fit_transform(X_resampled,y_resampled)

                    else:
                        X_resampled = X_train
                        y_resampled = y_train

                    t0 = time.clock()
                    pipeline.fit(X_resampled, y_resampled)
                    time_to_fit = (time.clock() - t0)
                    print("done fitting in {}".format(time_to_fit))

                    '''
                    Predictions
                    '''
                    predicted = pipeline.predict(X_test)

                    try:
                        predicted_prob = pipeline.predict_proba(X_test)
                        predicted_prob = predicted_prob[:, 1]  # probability that label is 1

                    except:
                        print("Model has no predict_proba method")
Code Example #39
############ feature select, classify, test-set validate, report

selector = feature_selection.SelectKBest(k=100)
classifier = naive_bayes.MultinomialNB(class_prior = np.reshape(np.repeat(np.array([[1.0/14.0]]),14,axis=1), (14,))) #flat priors

steps = [('feature_selection', selector), ('multinomial_nb', classifier)]

pipeline = pipeline.Pipeline(steps)
 
t0 = time()
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_matrix, label_dum, test_size=0.33, random_state=30)
print("X_train dimensions: " + str(X_train.shape))
print("y_train dimensions: " + str(y_train.shape))

### fit your pipeline on X_train and y_train
pipeline.fit( X_train, y_train )
### call pipeline.predict() on your X_test data to make a set of test predictions
y_prediction = pipeline.predict( X_test )
### test your predictions using sklearn.classification_report()
report = metrics.classification_report( y_test, y_prediction )
### and print the report
print("Classification done in: %fs" % (time()-t0))
print(report)

kfeatures = np.asarray(selector.get_support(indices=True))
print(np.asarray(vectorizer.get_feature_names())[kfeatures])

#################################################################
###### 3. Use classifier on unlabelled data

pred_unlab = pipeline.predict(X_matrix_unlab).tolist()
Code Example #40
File: cv_kmeans.py  Project: kyleabeauchamp/PMTStuff
import mixtape.featurizer, mixtape.tica, mixtape.cluster, mixtape.markovstatemodel, mixtape.ghmm
import numpy as np
import mdtraj as md
from parameters import load_trajectories, build_full_featurizer
import sklearn.pipeline, sklearn.externals.joblib
import mixtape.utils

n_choose = 50
stride = 1
lag_time = 1
n_components = 2

trj0, trajectories, filenames = load_trajectories(stride=stride)

train = trajectories[0::2]
test = trajectories[1::2]

featurizer = sklearn.externals.joblib.load("./featurizer-%d-%d.job" % (n_components, n_choose))


for n_states in [10, 20, 30, 40, 50]:
    n_components = n_components
    tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
    msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=5)
    cluster = mixtape.cluster.KMeans(n_clusters=n_states)
    pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster), ("msm", msm)])
    pipeline.fit(train)
    print(pipeline.score(train), pipeline.score(test))
    pipeline.fit(trajectories)
    print(msm.timescales_)
Code Example #41
def cluster_frames():

    seed = 0
    np.random.seed(seed)

    parser = argparse.ArgumentParser()
    parser.add_argument('input_filename')
    parser.add_argument("data_proportion", nargs='?', type=float, default=1.,
            help="Proportion of full dataset to be used")
    parser.add_argument("--log", type=str, default='INFO',
            help="Logging setting (e.g., INFO, DEBUG)")
    parser.add_argument('-o', '--output_filename',
        help='Filename of video to be saved (default: does not save)')
    args = parser.parse_args()

    # Setting logging parameters
    numeric_level = getattr(logging, args.log.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % args.log)
    logging.basicConfig(level=numeric_level, format='%(asctime)s %(message)s')

    sample_inds = [212, 699, 988, 1105, 2190, 2318]
    logging.info('Loading %i images... ', len(sample_inds))

    # Load data
    d = 6  # size of patch
    all_frames = util.grab_frame(args.input_filename)
    im_originals = list(util.index(all_frames, sample_inds))
    im_height, im_width = im_originals[0].shape[:2]
    all_patch_rows =  np.array(list(
            patch.ravel()
            for im in im_originals
            for patch in util.yield_windows(im, (d, d), (1, 1))
            ))
    num_rows_per_im = len(all_patch_rows) // len(im_originals)
    num_im = len(im_originals)
    logging.info('Loaded %i examples from %i images',
        len(all_patch_rows),
        len(im_originals))

    # Randomly sample a subset of the data
    sample_size = int(args.data_proportion * len(all_patch_rows))
    inds = np.random.choice(len(all_patch_rows), sample_size)
    X = all_patch_rows[inds]
    logging.info('Sampled %.1f%% of dataset = %i', 100 * args.data_proportion,
        sample_size)

    ############################# Define pipeline #############################    

    std_scaler = (sklearn.preprocessing.StandardScaler, {})
    coates_scaler = (CoatesScaler.CoatesScaler, {})
    pca = (sklearn.decomposition.PCA,
            {'whiten':True, 'copy':True}
            )
    zca = (ZCA.ZCA, {'regularization': .1})
    n_clusters = 100
    mbkmeans = (sklearn.cluster.MiniBatchKMeans,
            {
                'n_clusters': n_clusters,
                'batch_size': 3000,
            })
    skmeans = (SphericalKMeans.SphericalKMeans,
            {
                'n_clusters': n_clusters,
                'max_iter': 10,
            })
    kmeans = (sklearn.cluster.KMeans,
            {
                'n_clusters': n_clusters,
                #'random_state': np.random.RandomState,
                #'n_jobs': -1,
                #'n_init': 1,
                #'max_iter': 10,
            })

    # Define pipeline
    steps = [coates_scaler, zca, kmeans]
    pipeline = sklearn.pipeline.make_pipeline(
            *[fun(**kwargs) for fun, kwargs in steps])

    # Define pointers to certain steps for future processing
    whitener = pipeline.steps[1][1]  # second step
    dic = pipeline.steps[-1][1]  # last step

    steps = [(obj.__class__, obj.get_params()) for name, obj in pipeline.steps]
    util.print_steps(steps)


    ######################### Train pipeline ##################################

    logging.info('Training model...')
    pipeline.fit(X)
    logging.info('done.')

    ######################### Display atoms of dictionary #####################

    frames = util.grab_frame(args.input_filename)
    patch_row_chunks = (
            np.array(list(
            patch.ravel()
            for patch in util.yield_windows(im, (d, d), (1, 1))))
            for im in frames)

    def im_displays():
        for patch_rows in patch_row_chunks:
            y = pipeline.predict(patch_rows)

            # Map to [0, 1) so that imshow scales across entire colormap spectrum
            y = y / n_clusters

            newshape = (im_height - d + 1, im_width - d + 1, )
            segmentation = np.reshape(y, newshape)

            # Apply color map and remove alpha channel
            cmap = plt.cm.Set1
            colored_segmentation = cmap(segmentation)[:, :, :3]
            colored_segmentation = (colored_segmentation * 255).astype(np.uint8)

            yield colored_segmentation

    #frames = itertools.islice(im_displays(), 5)
    frames = im_displays()
    save_video = args.output_filename is not None
    if save_video:
        write_frames_to_disk(frames, args.output_filename)
    else:
        display_frames(frames)

    return

    logging.info('Displaying atoms of dictionary')

    # Inverse whiten atoms of dictionary
    atom_rows = dic.cluster_centers_ 
    if hasattr(whitener, 'inverse_transform'):
        atom_rows = whitener.inverse_transform(atom_rows)  

    plt.figure()
    for i, atom_row in enumerate(atom_rows):
        patch = atom_row.reshape(d, d, -1)[::-1]
        plt.subplot(10, 10, i + 1)
        plt.imshow(patch, interpolation='nearest')
        plt.xticks(())
        plt.yticks(())

    plt.suptitle('Atoms of dictionary learnt from %i patches by %s' %  \
            (len(atom_rows), dic.__class__.__name__))

    plt.figure()
    displayed_patches = X[np.random.choice(len(X), 100)]
    for i, patch in enumerate(displayed_patches):
        plt.subplot(10, 10, i + 1)
        plt.imshow(patch.reshape([d, d, -1])[:,:,::-1], interpolation='nearest')
        plt.xticks(())
        plt.yticks(())

    plt.show()
Code Example #42
File: ala_gmm.py  Project: kyleabeauchamp/PMTStuff
import sklearn.pipeline, sklearn.externals.joblib
import mixtape.utils

# Copy paste from optimize ala


n_timescales = 4
n_states = 6
tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_timescales)
cluster = mixtape.cluster.GMM(n_components=n_states, covariance_type='full')
feature_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica)])
cluster_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster)])
pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster), ("msm", msm)])

pipeline.fit(train)
print(pipeline.score(train), pipeline.score(test))
X_all = feature_pipeline.transform(trajectories)
q = np.concatenate(X_all)

covars_ = cluster.covars_
covars_ = cluster.covars_.diagonal(axis1=1, axis2=2)

for i, j in [(0, 1)]:
    figure()
    title("%d" % n_states)
    hexbin(q[:,i], q[:, j], bins='log')
    errorbar(cluster.means_[:, i], cluster.means_[:, j], xerr=covars_[:,i] ** 0.5, yerr=covars_[:, j] ** 0.5, fmt='kx', linewidth=4)

offset = np.ones(2) * 0.05
for state in range(n_states):