Beispiel #1
0
    def test_inverse_transform(self):
        """Inverse-transforming through a PHOTON pipeline must match sklearn.

        FIX: renamed from ``test_inverse_tansform`` (typo); the ``test_``
        prefix is preserved so unittest discovery still picks it up.
        """
        # Simple pipeline: StandardScaler followed by PCA, fitted on the
        # shared fixture data.
        sk_pipe = SKPipeline([("SS", self.sk_ss), ("PCA", self.sk_pca)])
        sk_pipe.fit(self.X, self.y)
        sk_transform = sk_pipe.transform(self.X)
        sk_inverse_transformed = sk_pipe.inverse_transform(sk_transform)

        photon_pipe = PhotonPipeline([("SS", self.p_ss), ("PCA", self.p_pca)])
        photon_pipe.fit(self.X, self.y)
        # PHOTON transform/inverse_transform return a 3-tuple (X, y, kwargs);
        # only the data matrix is compared here.
        p_transform, _, _ = photon_pipe.transform(self.X)
        p_inverse_transformed, _, _ = photon_pipe.inverse_transform(
            p_transform)

        self.assertTrue(
            np.array_equal(sk_inverse_transformed, p_inverse_transformed))

        # Now include a Stack element: inverse-transforming the fitted
        # pipeline's feature importances must recover the original
        # feature dimensionality.
        stack = Stack("stack", [self.p_pca])
        stack_pipeline = PhotonPipeline([
            ("stack", stack),
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("LinearSVC", PipelineElement("LinearSVC")),
        ])
        stack_pipeline.fit(self.X, self.y)
        feature_importances = stack_pipeline.feature_importances_
        inversed_data, _, _ = stack_pipeline.inverse_transform(
            feature_importances)
        self.assertEqual(inversed_data.shape[1], self.X.shape[1])
Beispiel #2
0
def sklearn_custom_transformer_model(sklearn_knn_model):
    """Build a two-step sklearn pipeline (custom +1 shift, then KNN).

    Returns a ``ModelWithData`` bundling the pipeline with the first two
    iris feature columns as inference data.
    """
    def shift_by_one(vec):
        # Trivial custom transformation used as the pipeline's first step.
        return vec + 1

    shift_transformer = SKFunctionTransformer(shift_by_one, validate=True)
    steps = [
        ("custom_transformer", shift_transformer),
        ("knn", sklearn_knn_model.model),
    ]
    iris_features = datasets.load_iris().data[:, :2]
    return ModelWithData(SKPipeline(steps), inference_data=iris_features)
Beispiel #3
0
    def setUp(self):
        """Create PHOTON branches and their equivalent plain-sklearn pipelines.

        FIX: ``load_breast_cancer(True)`` passed ``return_X_y`` positionally,
        which is deprecated and removed in current scikit-learn releases
        (raises TypeError); it must be passed by keyword.
        """
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.scaler = PipelineElement("StandardScaler", {'with_mean': True})
        # PCA is hyperparameter-searched over n_components and may be
        # disabled entirely (test_disabled=True).
        self.pca = PipelineElement('PCA', {'n_components': [1, 2]},
                                   test_disabled=True,
                                   random_state=3)
        self.tree = PipelineElement('DecisionTreeClassifier',
                                    {'min_samples_split': [2, 3, 4]},
                                    random_state=3)

        # Transformer-only branch vs. its sklearn counterpart.
        self.transformer_branch = Branch('MyBranch', [self.scaler, self.pca])
        self.transformer_branch_sklearn = SKPipeline([("SS", StandardScaler()),
                                                      ("PCA",
                                                       PCA(random_state=3))])
        # Branch ending in an estimator vs. its sklearn counterpart.
        self.estimator_branch = Branch('MyBranch',
                                       [self.scaler, self.pca, self.tree])
        self.estimator_branch_sklearn = SKPipeline([
            ("SS", StandardScaler()), ("PCA", PCA(random_state=3)),
            ("Tree", DecisionTreeClassifier(random_state=3))
        ])
Beispiel #4
0
    def test_predict_proba(self):
        """Class probabilities from a PHOTON pipeline must equal sklearn's."""
        reference_pipe = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_dt)])
        reference_pipe.fit(self.X, self.y)
        expected_proba = reference_pipe.predict_proba(self.X)

        pipe_under_test = PhotonPipeline([("SS", self.p_ss),
                                          ("SVC", self.p_dt)])
        pipe_under_test.fit(self.X, self.y)
        actual_proba = pipe_under_test.predict_proba(self.X)

        self.assertTrue(np.array_equal(expected_proba, actual_proba))
Beispiel #5
0
    def test_predict_with_training_flag(self):
        """A y-transformer inside a PHOTON pipeline must see the scaled X and
        yield the same predictions as sklearn fitted on manually shifted
        labels."""
        # sklearn reference: shift the labels by hand before fitting.
        shifted_labels = self.y + 1
        reference_pipe = SKPipeline([("SS", self.sk_ss),
                                     ("SVC", self.sk_svc)])
        reference_pipe.fit(self.X, shifted_labels)
        reference_pred = reference_pipe.predict(self.X)

        # PHOTON: the dummy element performs the label shift inside the pipe.
        photon_pipe = PhotonPipeline([
            ("SS", self.p_ss),
            ("YT", self.dummy_photon_element),
            ("SVC", self.p_svm),
        ])
        photon_pipe.fit(self.X, self.y)
        photon_pred = photon_pipe.predict(self.X)

        # The y-transformer must have received the standardized features.
        scaled_X = self.sk_ss.transform(self.X)
        seen_by_y_transformer = self.dummy_photon_element.base_element.X
        self.assertTrue(np.array_equal(scaled_X, seen_by_y_transformer))

        self.assertTrue(np.array_equal(reference_pred, photon_pred))
Beispiel #6
0
    def test_regular_use(self):
        """Fit/transform/predict through a PHOTON pipeline agrees with both
        the referenced elements and an equivalent sklearn pipeline."""
        pipe = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
        pipe.fit(self.X, self.y)

        transformed, _, _ = pipe.transform(self.X)
        predicted = pipe.predict(self.X)

        # Elements are passed by reference, so fitting the pipeline fitted
        # them in place — calling them directly must reproduce the results.
        ref_transformed, _, _ = self.p_pca.transform(self.X)
        ref_predicted = self.p_svm.predict(ref_transformed)

        self.assertTrue(np.array_equal(transformed, ref_transformed))
        self.assertTrue(np.array_equal(predicted, ref_predicted))

        # Cross-check predictions against plain sklearn.
        sklearn_pipe = SKPipeline([('PCA', self.sk_pca),
                                   ("SVC", self.sk_svc)])
        sklearn_pipe.fit(self.X, self.y)
        sklearn_predicted = sklearn_pipe.predict(self.X)
        self.assertTrue(np.array_equal(predicted, sklearn_predicted))
Beispiel #7
0
def explain_article_lime_task_impl(view_cache_id, ace_id, pipeline_id,
                                   article_number):
    """Compute LIME explanations for one article and store them in the view cache.

    Loads the ACE and Pipeline documents, predicts the article's class with
    the stored sklearn pipeline, builds a LIME text explanation (TF-IDF
    pipelines) or a LIME feature explanation (all other NLP tools), and
    writes the rendered HTML into the CachedView document.

    NOTE(review): this function has side effects — it mutates and saves the
    CachedView identified by ``view_cache_id``.
    """
    ace = ACE.objects.get({'_id': ObjectId(ace_id)})
    pipeline = Pipeline.objects.get({'_id': ObjectId(pipeline_id)})

    # article_number may arrive as a string (e.g. from a URL route).
    article_number = int(article_number)
    article = ace.data_source.articles[article_number]

    sk_pipeline = pipeline.sk_pipeline.get()

    prediction = sk_pipeline.predict([article.raw_text])[0]

    # do not modify pipeline.sk_pipeline
    skp = deepcopy(sk_pipeline)
    # Split the final estimator off the (copied) pipeline; `skp` keeps only
    # the preprocessing steps, `model` is the classifier.
    model = skp.steps.pop()[1]

    used_classes = model.classes_
    # Map class indices to human-readable labels from the data source.
    used_class_names = [ace.data_source.labels[x] for x in used_classes]

    lime_text_html = ''
    lime_features_html = ''
    anchor_html = ''

    # TODO: do not send article raw text, I suspect the bug report for stop-word appearance is due to raw_text
    # although we are sending through the pipeline predict_proba
    if pipeline.nlp_tool.name == 'TF-IDF':
        # the pipeline should be linear, but it contains a FeatureUnion (with a Pipeline), so let's flatten it
        steps = []  # will contained the flattened steps
        for step in sk_pipeline.steps:
            if isinstance(step[1], FeatureUnion):
                # step[1] FeatureUnion, should contain a single Pipeline
                # NOTE(review): only the FIRST transformer of the union is
                # flattened — assumes the union holds exactly one Pipeline.
                steps.extend(step[1].transformer_list[0][1].steps)
            else:
                steps.append(step)

        # find tfidf vectorizer step number
        # NOTE(review): if no TfidfVectorizer step exists, the loop variable
        # silently points at the last step (or NameError on empty steps) —
        # consider a for/else guard; confirm a vectorizer is always present.
        for tfidf_step_index, step in enumerate(steps):
            if isinstance(step[1], TfidfVectorizer):
                break

        # Split into the steps before the vectorizer (text preprocessing)
        # and the rest (vectorizer + downstream transforms + classifier).
        preprocess_pipeline = SKPipeline(steps[:tfidf_step_index])
        rest_pipeline = SKPipeline(steps[tfidf_step_index:])

        # give lime text before tfidf_step, the function should be the rest of the pipeline.
        lime_text_html = get_lime_text_explanation(
            preprocess_pipeline.transform([article.raw_text])[0], prediction,
            used_class_names, rest_pipeline.predict_proba).as_html()

        # anchor_html = get_anchor_text_explanation(
        #     skp,
        #     article.raw_text,
        #     sk_pipeline.predict,
        #     used_class_names
        # ).as_html()

    else:
        # Non-text (tabular-feature) explanation path.
        lime_features_html = get_lime_feature_explanation(
            article, prediction, skp, model.predict_proba,
            pipeline.data_source.articles, used_class_names).as_html()

    # Persist the explanation HTML on the cached view and mark it done.
    cache = CachedView.objects.get({'_id': ObjectId(view_cache_id)})
    cache.task.set_success()
    cache.data = dict(
        pipeline=pipeline,
        article=article,
        prediction=prediction,
        exp1_html=lime_text_html or lime_features_html,
        exp2_html=anchor_html,
        article_number=article_number,
    )
    cache.save()
Beispiel #8
0
def dict_vectorize(transformer_name, transformer):
    """Chain *transformer* with a dense DictVectorizer in a two-step pipeline."""
    steps = [
        (transformer_name, transformer),
        ('DictVectorizer', DictVectorizer(sparse=False)),
    ]
    return SKPipeline(steps)