Example #1
 def test_PCA_scorer_component(self):
     pca = PCA()
     for i in range(1, len(self.zoo.domain.attributes) + 1):
         pca.component = i
         scores = pca.score_data(self.zoo)
         self.assertEqual(scores.shape,
                          (pca.component, len(self.zoo.domain.attributes)))
Example #3
 def test_max_components(self):
     d = np.random.RandomState(0).rand(20, 20)
     data = Table(d)
     pca = PCA()(data)
     self.assertEqual(len(pca.explained_variance_ratio_), 20)
     pca = PCA(n_components=10)(data)
     self.assertEqual(len(pca.explained_variance_ratio_), 10)
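A side note on the two assertions above: leaving `n_components` unset keeps every principal component, while passing an integer truncates the model to that many. A minimal sketch of the same behaviour outside a test case, assuming the Table/PCA API used throughout these examples; the 20x20 random matrix is arbitrary:

    import numpy as np
    from Orange.data import Domain, Table
    from Orange.projection import PCA

    x = np.random.RandomState(0).rand(20, 20)
    data = Table.from_numpy(Domain.from_numpy(x), x)

    full_model = PCA()(data)                      # all 20 components are kept
    truncated_model = PCA(n_components=10)(data)  # only the first 10

    print(len(full_model.explained_variance_ratio_))       # 20
    print(len(truncated_model.explained_variance_ratio_))  # 10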
Example #4
 def test_chain(self):
     zoo_c = Continuize()(self.zoo)
     pca = PCA(n_components=3)(zoo_c)(self.zoo)
     pca2 = PCA(n_components=3)(zoo_c)(zoo_c)
     pp = [Continuize()]
     pca3 = PCA(n_components=3, preprocessors=pp)(self.zoo)(self.zoo)
     np.testing.assert_almost_equal(pca.X, pca2.X)
     np.testing.assert_almost_equal(pca.X, pca3.X)
Example #5
 def test_chain(self):
     zoo = Orange.data.Table('zoo')
     zoo_c = Continuize(zoo)
     pca = PCA()(zoo_c)(zoo)
     pca2 = PCA()(zoo_c)(zoo_c)
     pca3 = PCA(preprocessors=[Continuize()])(zoo)(zoo)
     np.testing.assert_almost_equal(pca.X, pca2.X)
     np.testing.assert_almost_equal(pca.X, pca3.X)
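Both versions of `test_chain` rely on the same two-step protocol: calling a `PCA` instance on a table fits it and returns a projection model, and calling that model on a (possibly different) table maps its rows into principal-component space; expressions such as `PCA(n_components=3)(zoo_c)(self.zoo)` simply collapse those two calls. A minimal sketch with the steps written out, assuming the bundled 'iris' dataset:

    from Orange.data import Table
    from Orange.projection import PCA

    data = Table("iris")

    pca = PCA(n_components=2)    # configure the projector
    model = pca(data)            # step 1: fit, returns a PCA model
    projected = model(data)      # step 2: transform, returns a table of components

    # the chained one-liner used in the tests above is equivalent
    projected_chained = PCA(n_components=2)(data)(data)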
Example #6
 def test_PCA_scorer(self):
     data = Orange.data.Table("iris")
     pca = PCA(preprocessors=[Normalize()])
     scores = pca.score_data(data)
     self.assertEqual(len(scores), len(data.domain.attributes))
     self.assertEqual(
         ["petal length", "petal width"], sorted([data.domain.attributes[i].name for i in np.argsort(scores)[-2:]])
     )
     self.assertEqual([round(s, 4) for s in scores], [0.5224, 0.2634, 0.5813, 0.5656])
Example #7
 def test_PCA_scorer(self):
     data = self.iris
     pca = PCA(preprocessors=[Normalize()])
     pca.component = 1
     scores = pca.score_data(data)
     self.assertEqual(scores.shape[1], len(data.domain.attributes))
     self.assertEqual(['petal length', 'petal width'],
                      sorted([data.domain.attributes[i].name
                              for i in np.argsort(scores[0])[-2:]]))
     self.assertEqual([round(s, 4) for s in scores[0]],
                      [0.5224, 0.2634, 0.5813, 0.5656])
Example #9
 def test_PCA_scorer(self):
     data = Orange.data.Table('iris')
     pca = PCA(preprocessors=[Normalize()])
     scores = pca.score_data(data)
     self.assertEqual(len(scores), len(data.domain.attributes))
     self.assertEqual(['petal length', 'petal width'],
                      sorted([
                          data.domain.attributes[i].name
                          for i in np.argsort(scores)[-2:]
                      ]))
     self.assertEqual([round(s, 4) for s in scores],
                      [0.5224, 0.2634, 0.5813, 0.5656])
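The `test_PCA_scorer` variants use `PCA` as a feature scorer: `score_data` returns per-attribute scores derived from the principal-component loadings, so the attributes with the largest scores on the first component are the ones that dominate it. The shape of the result differs across Orange versions, which is why one test above indexes `scores` directly and the other takes `scores[0]`. A hedged sketch of ranking attributes this way:

    import numpy as np
    from Orange.data import Table
    from Orange.preprocess import Normalize
    from Orange.projection import PCA

    data = Table("iris")
    scores = PCA(preprocessors=[Normalize()]).score_data(data)

    # depending on the Orange version, scores is 1-D (first component only)
    # or 2-D (components x attributes); normalise to the 2-D view
    loadings = np.atleast_2d(scores)[0]
    for i in np.argsort(loadings)[::-1]:
        print(data.domain.attributes[i].name, round(float(loadings[i]), 4))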
Example #10
    def test_improved_randomized_pca_properly_called(self):
        # It doesn't matter what we put into the matrix
        x_ = np.random.normal(0, 1, (100, 20))
        x = Table.from_numpy(Domain.from_numpy(x_), x_)

        pca.randomized_pca = MagicMock(wraps=pca.randomized_pca)
        PCA(10, svd_solver="randomized", random_state=42)(x)
        pca.randomized_pca.assert_called_once()

        pca.randomized_pca.reset_mock()
        PCA(10, svd_solver="arpack", random_state=42)(x)
        pca.randomized_pca.assert_not_called()
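This test only checks dispatch: `svd_solver="randomized"` routes through Orange's `randomized_pca` helper, while other solvers (here 'arpack') do not. In user code the solver is simply a constructor argument; a short sketch, with the matrix shape and seeds chosen arbitrarily:

    import numpy as np
    from Orange.data import Domain, Table
    from Orange.projection import PCA

    x = np.random.RandomState(42).normal(0, 1, (100, 20))
    data = Table.from_numpy(Domain.from_numpy(x), x)

    # randomized solver: approximate but cheap, useful for wide or sparse data
    randomized_model = PCA(10, svd_solver="randomized", random_state=42)(data)

    # full solver: exact SVD, fine for small dense tables
    full_model = PCA(10, svd_solver="full", random_state=42)(data)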
Example #11
 def pca_preprocessing(self):
     if self.pca_data is not None and \
             self.pca_data.X.shape[1] == self.pca_components:
         return
     pca = PCA(n_components=self.pca_components, random_state=0)
     model = pca(self.data)
     self.pca_data = model(self.data)
Example #12
def pca_preprocessing(data, n_components, normalize):
    projector = PCA(n_components=n_components, random_state=0)
    if normalize:
        projector.preprocessors += (preprocess.Normalize(),)

    model = projector(data)
    return model(data)
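Example #11 caches the projection on the widget, while Example #12 is the standalone form and additionally toggles normalization by appending `Normalize()` to the projector's preprocessors before fitting. A self-contained sketch of the same helper; the function name and defaults here are illustrative only:

    from Orange.data import Table
    from Orange.preprocess import Normalize
    from Orange.projection import PCA

    def reduce_with_pca(data, n_components=2, normalize=False):
        """Fit PCA on `data` and return the projected table (sketch)."""
        projector = PCA(n_components=n_components, random_state=0)
        if normalize:
            # extend the default preprocessors rather than replacing them,
            # mirroring Examples #12 and #26
            projector.preprocessors += (Normalize(),)
        model = projector(data)
        return model(data)

    projected = reduce_with_pca(Table("iris"), n_components=2, normalize=True)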
Example #13
def run_models(grid_y, grid_x):
    X_train, Y_train = create_training_data(
        grid_x, grid_y)  # X and Y are the inputs and the target
    data = Table(X_train,
                 Y_train)  # creating an Orange table combining X and Y

    feature_method = og.preprocess.score.UnivariateLinearRegression(
    )  # feature selection
    selector = og.preprocess.SelectBestFeatures(
        method=feature_method, k=50)  # taking 50 features out of 216
    out_data2 = selector(data)  # this is the new dataset with 50 features

    pca = PCA(n_components=5)  # PCA with 5 components
    model = pca(out_data2)
    train = model(out_data2)

    temp = []
    temp.append(pca.domain)
    for arr in model.components_:
        temp.append(list(arr))
    # temp.append(model.components_)
    np.savetxt('pca/' + str(grid_x) + '_' + str(grid_y) + '.csv',
               np.array(temp),
               delimiter=',',
               fmt='%s')
Example #14
    def _compute_pca_projection(self):
        if self.pca_projection is None and self.apply_pca:
            self.setStatusMessage('Computing PCA...')

            pca = PCA(n_components=self.pca_components, random_state=0)
            model = pca(self.data)
            self.pca_projection = model(self.data)
Example #15
 def setUp(self):
     self.widget = self.create_widget(OWRank)  # type: OWRank
     self.iris = Table("iris")
     self.housing = Table("housing")
     self.log_reg = LogisticRegressionLearner()
     self.lin_reg = LinearRegressionLearner()
     self.pca = PCA()
Example #16
 def test_learner_with_transformation(self):
     learner = RandomForestLearner(random_state=0)
     from Orange.projection import PCA
     iris = Table("iris")
     data = PCA(n_components=2)(iris)(iris)
     scores = learner.score_data(data)
     np.testing.assert_almost_equal(scores, [[0.7760495, 0.2239505]])
Example #17
    def _reduce_dimensions(data, method="MDS", use_cosine=False):
        """
        Reduce the dimensionality of the data to 2D.

        Parameters
        ----------
        data: Orange.data.Table
            The image embeddings (vectors of length 2048).
        method: string
            The method to use (default MDS).
        use_cosine: bool
            Precompute cosine distances and pass them to MDS.

        Returns
        -------
        array-like
            The data, reduced to 2 dimensions.

        """
        if method == "MDS":
            if use_cosine:
                mds = MDS(n_init=1, dissimilarity="precomputed")
                dist_matrix = Cosine(data)
                return mds(dist_matrix).embedding_
            else:
                mds = MDS(n_init=1, init_type="PCA")
                return mds(data).embedding_

        elif method == "PCA":
            pca = PCA(n_components=2)
            return pca(data)(data)

        elif method == "TSNE":
            tsne = TSNE(init="pca")
            return tsne(data).embedding_
Example #18
    def test_transformed_domain_does_not_pickle_data(self):
        iris = self.iris
        pca = PCA(n_components=2)(iris)
        pca_iris = pca(iris)
        pca_iris2 = Table(pca_iris.domain, iris)

        pca_iris2 = pickle.loads(pickle.dumps(pca_iris))
        self.assertIsNone(pca_iris2.domain[0].compute_value.transformed)
Example #19
 def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
     rnd_pca = PCA(n_components=n_com, svd_solver='randomized')
     pca_model = rnd_pca(data)
     pca_xpl_var = np.sum(pca_model.explained_variance_ratio_)
     self.assertGreaterEqual(pca_xpl_var, min_xpl_var)
     self.assertEqual(n_com, pca_model.n_components)
     self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
     proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
     np.testing.assert_almost_equal(pca_model(data).X, proj)
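The helper also double-checks the model numerically: centring the data and projecting it onto the component matrix, `(X - mean_) @ components_.T`, must reproduce what the fitted model returns. A minimal sketch of that identity outside a test class; as in the test above, it assumes a complete, all-continuous table (such as iris) that the default preprocessors leave unchanged:

    import numpy as np
    from Orange.data import Table
    from Orange.projection import PCA

    data = Table("iris")
    model = PCA(n_components=2)(data)

    manual = np.dot(data.X - model.mean_, model.components_.T)
    np.testing.assert_almost_equal(model(data).X, manual)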
Example #20
    def test_compute_value(self):
        iris = self.iris
        pca = PCA(n_components=2)(iris)
        pca_iris = pca(iris)
        pca_iris2 = Table(pca_iris.domain, iris)
        np.testing.assert_almost_equal(pca_iris.X, pca_iris2.X)
        np.testing.assert_equal(pca_iris.Y, pca_iris2.Y)

        pca_iris3 = pickle.loads(pickle.dumps(pca_iris))
        np.testing.assert_almost_equal(pca_iris.X, pca_iris3.X)
        np.testing.assert_equal(pca_iris.Y, pca_iris3.Y)
Example #21
    def init_projection(self):
        if self.placement == Placement.Circular:
            self.projector = CircularPlacement()
        elif self.placement == Placement.LDA:
            self.projector = LDA(solver="eigen", n_components=2)
        elif self.placement == Placement.PCA:
            self.projector = PCA(n_components=2)
            self.projector.component = 2
            self.projector.preprocessors = PCA.preprocessors + [Normalize()]

        super().init_projection()
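Example #22 below builds its projector the same way as this widget: fix two components, record the count on the instance's `component` attribute (consulted by `score_data`, see Example #1), and append `Normalize()` to the class-level `PCA.preprocessors`. Note that this idiom assumes `preprocessors` is a list, whereas Examples #12 and #26 use `+= (Normalize(),)` on the instance; a version-tolerant sketch:

    from Orange.preprocess import Normalize
    from Orange.projection import PCA

    projector = PCA(n_components=2)
    projector.component = 2
    # works whether PCA.preprocessors is a list or a tuple
    projector.preprocessors = list(PCA.preprocessors) + [Normalize()]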
Example #22
    def _get_pca(self):
        pca_projector = PCA(n_components=2)
        pca_projector.component = 2
        pca_projector.preprocessors = PCA.preprocessors + [Normalize()]

        pca = pca_projector(self.data)
        variance_ratio = pca.explained_variance_ratio_
        cumulative = np.cumsum(variance_ratio)

        self._pca = pca
        if not np.isfinite(cumulative[-1]):
            self.Warning.trivial_components()

        coords = pca(self.data).X
        valid_mask = ~np.isnan(coords).any(axis=1)
        # scale axes
        max_radius = np.min([np.abs(np.min(coords, axis=0)),
                             np.max(coords, axis=0)])
        axes = pca.components_.T.copy()
        axes *= max_radius / np.max(np.linalg.norm(axes, axis=1))
        return valid_mask, coords, axes
Example #23
def add_embedding(corpus: Corpus) -> Corpus:
    transformed_corpus = BowVectorizer().transform(corpus)

    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)

    domain = Domain(
        transformed_corpus.domain.attributes,
        transformed_corpus.domain.class_vars,
        chain(transformed_corpus.domain.metas, projection.domain.attributes))
    return corpus.transform(domain)
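The `add_embedding` helper keeps the original features and class variables and appends the two PCA coordinates as meta attributes by building a new `Domain` and transforming the corpus into it. The same pattern works on any Orange `Table`; a minimal sketch using iris instead of a text corpus:

    from itertools import chain

    from Orange.data import Domain, Table
    from Orange.projection import PCA

    data = Table("iris")
    projection = PCA(n_components=2)(data)(data)

    # original attributes and class stay in place; the PCA columns become metas
    domain = Domain(
        data.domain.attributes,
        data.domain.class_vars,
        chain(data.domain.metas, projection.domain.attributes),
    )
    augmented = data.transform(domain)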
Example #24
def pca_preprocessing(data, pca_components):
    """
    :param data:
    :param pca_components:

    :return:
    """

    pca = PCA(n_components=pca_components, random_state=0)
    model = pca(data)
    pca_data = model(data)
    return pca_data
Example #25
    def test_improved_randomized_pca_sparse_data(self):
        """Randomized PCA should work well on dense data."""
        random_state = check_random_state(42)

        # Let's take a tall, skinny matrix
        x_ = random_state.negative_binomial(1, 0.5, (100, 20))
        x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

        pca = PCA(10, svd_solver="full",
                  random_state=random_state)(x.to_dense())
        rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

        np.testing.assert_almost_equal(pca.components_,
                                       rpca.components_,
                                       decimal=8)
        np.testing.assert_almost_equal(pca.explained_variance_,
                                       rpca.explained_variance_,
                                       decimal=8)
        np.testing.assert_almost_equal(pca.singular_values_,
                                       rpca.singular_values_,
                                       decimal=8)

        # And take a short, fat matrix
        x_ = random_state.negative_binomial(1, 0.5, (20, 100))
        x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

        pca = PCA(10, svd_solver="full",
                  random_state=random_state)(x.to_dense())
        rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

        np.testing.assert_almost_equal(pca.components_,
                                       rpca.components_,
                                       decimal=8)
        np.testing.assert_almost_equal(pca.explained_variance_,
                                       rpca.explained_variance_,
                                       decimal=8)
        np.testing.assert_almost_equal(pca.singular_values_,
                                       rpca.singular_values_,
                                       decimal=8)
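The two halves of this test establish that the randomized solver on a sparse table matches (to 8 decimals) the full solver on the densified copy, for both tall and wide matrices. The practical consequence is that sparse tables can be fed to randomized PCA directly, without densifying; a short sketch, with the matrix shape chosen arbitrarily:

    import numpy as np
    from Orange.data import Domain, Table
    from Orange.projection import PCA

    x = np.random.RandomState(0).negative_binomial(1, 0.5, (100, 20))
    sparse_table = Table.from_numpy(Domain.from_numpy(x), x).to_sparse()

    model = PCA(10, svd_solver="randomized", random_state=0)(sparse_table)
    print(model.explained_variance_ratio_.sum())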
Example #26
    def pca_preprocessing(self):
        """Perform PCA preprocessing before passing off the data to t-SNE."""
        if self.pca_data is not None:
            return

        projector = PCA(n_components=self.pca_components, random_state=0)
        # If the normalization box is ticked, we'll add the `Normalize`
        # preprocessor to PCA
        if self.normalize:
            projector.preprocessors += (preprocess.Normalize(),)

        model = projector(self.data)
        self.pca_data = model(self.data)
Example #27
def preprocess(corpus: Corpus) -> Corpus:
    for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
               StopwordsFilter("English"), FrequencyFilter(0.1)):
        corpus = pp(corpus)

    transformed_corpus = BowVectorizer().transform(corpus)

    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)

    domain = Domain(
        transformed_corpus.domain.attributes,
        transformed_corpus.domain.class_vars,
        chain(transformed_corpus.domain.metas, projection.domain.attributes))
    return corpus.transform(domain)
Example #28
 def __ipca_test_helper(self, data, n_com, min_xpl_var):
     pca = IncrementalPCA(n_components=n_com)
     pca_model = pca(data[::2])
     pca_xpl_var = np.sum(pca_model.explained_variance_ratio_)
     self.assertGreaterEqual(pca_xpl_var + 1e-6, min_xpl_var)
     self.assertEqual(n_com, pca_model.n_components)
     self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
     proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
     np.testing.assert_almost_equal(pca_model(data).X, proj)
     pc1_ipca = pca_model.components_[0]
     self.assertAlmostEqual(np.linalg.norm(pc1_ipca), 1)
     pc1_pca = PCA(n_components=n_com)(data).components_[0]
     self.assertAlmostEqual(np.linalg.norm(pc1_pca), 1)
     self.assertNotAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 2)
     pc1_ipca = pca_model.partial_fit(data[1::2]).components_[0]
     self.assertAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 4)
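The incremental helper fits on every second row, confirms the leading component is unit-norm but not yet aligned with ordinary PCA, and then calls `partial_fit` on the held-out rows, after which the first components agree. A short sketch of that incremental workflow on its own; the import path for `IncrementalPCA` is assumed to be `Orange.projection`, alongside `PCA`:

    from Orange.data import Table
    from Orange.projection import IncrementalPCA

    data = Table("iris")

    ipca = IncrementalPCA(n_components=2)
    model = ipca(data[::2])          # initial fit on half of the rows
    projected = model(data)          # the model can already project the full table

    # update the fit with the remaining rows; as in the test above,
    # partial_fit returns the updated model
    updated_first_pc = model.partial_fit(data[1::2]).components_[0]
    print(updated_first_pc)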
Example #29
    def _init_data(cls):
        data_path = "https://datasets.orange.biolab.si/sc/aml-1k.tab.gz"
        table_data = Table(data_path)
        table_data.attributes[TAX_ID] = "9606"

        ref_data = table_data[::2]
        pca = PCA(n_components=2)
        model = pca(ref_data)
        proj = model(ref_data)
        domain = Domain(
            ref_data.domain.attributes,
            ref_data.domain.class_vars,
            chain(ref_data.domain.metas, proj.domain.attributes),
        )
        cls.data = ref_data.transform(domain)
        cls.reference_data = ref_data
        cls.secondary_data = table_data[1:200:2]
Example #30
def run_models(grid_y, grid_x):
    X_train, Y_train = create_training_data(
        grid_x, grid_y)  # X and Y are the inputs and the target
    data = Table(X_train,
                 Y_train)  # creating an Orange table combining X and Y

    feature_method = og.preprocess.score.UnivariateLinearRegression(
    )  # feature selection
    selector = og.preprocess.SelectBestFeatures(
        method=feature_method, k=50)  # taking 50 features out of 216
    out_data2 = selector(data)  # this is the new dataset with 50 features

    pca = PCA(n_components=5)  # PCA with 5 components
    model = pca(out_data2)
    train2 = model(out_data2)

    featuresIndex = set()
    for comp in range(len(model.components_) - 1, 0, -1):
        top2 = (-np.array(model.components_[comp])).argsort()[:2]
        featuresIndex |= set(top2)

    top2 = (-np.array(model.components_[0])).argsort()[:13]
    f_index = 0
    while (len(featuresIndex) != 13):
        featuresIndex.add(top2[f_index])
        f_index += 1

    ind = np.array(list(featuresIndex))

    # train = Table(list(out_data2[:,ind]), Y_train)
    # print(train)
    store = np.array(pca.domain)[ind]
    # print(store)
    np.savetxt('unlucky13/' + str(grid_x) + '_' + str(grid_y) + '.csv',
               store,
               delimiter=',',
               fmt='%s')
Example #31
def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
    # type: (Table, Optional[int], int, str, float, TaskState) -> Results
    """
    Run the louvain clustering on `data`.

    `state` is used to report progress and partial results. Returns early if
    `state.is_interuption_requested()` returns true.

    Parameters
    ----------
    data : Table
        Data table
    pca_components : Optional[int]
        If not `None` then the data is first projected onto first
        `pca_components` principal components.
    k_neighbors : int
        Passed to `table_to_knn_graph`
    metric : str
        Passed to `table_to_knn_graph`
    resolution : float
        Passed to `Louvain`
    state : TaskState

    Returns
    -------
    res : Results
    """
    state = state  # type: TaskState
    res = Results(
        pca_components=pca_components,
        k_neighbors=k_neighbors,
        metric=metric,
        resolution=resolution,
    )
    step = 0
    if state.is_interuption_requested():
        return res
    if pca_components is not None:
        steps = 3
        state.set_status("Computing PCA...")
        pca = PCA(n_components=pca_components, random_state=0)
        data = res.pca_projection = pca(data)(data)
        assert isinstance(data, Table)
        state.set_partial_results(("pca_projection", res.pca_projection))
        step += 1
    else:
        steps = 2

    if state.is_interuption_requested():
        return res

    state.set_progress_value(100. * step / steps)
    state.set_status("Building graph...")

    def pcallback(val):
        state.set_progress_value((100. * step + 100 * val) / steps)
        if state.is_interuption_requested():
            raise InteruptRequested()

    try:
        res.graph = graph = table_to_knn_graph(data,
                                               k_neighbors=k_neighbors,
                                               metric=metric,
                                               progress_callback=pcallback)
    except InteruptRequested:
        return res

    state.set_partial_results(("graph", res.graph))

    step += 1
    state.set_progress_value(100 * step / steps)
    state.set_status("Detecting communities...")
    if state.is_interuption_requested():
        return res

    louvain = Louvain(resolution=resolution, random_state=0)
    res.partition = louvain.fit_predict(graph)
    state.set_partial_results(("partition", res.partition))
    return res
Example #32
 def _init_projector(self):
     self._pca_projector = PCA(n_components=MAX_COMPONENTS, random_state=0)
     self._pca_projector.component = self.ncomponents
     self._pca_preprocessors = PCA.preprocessors
Example #33
 def test_PCA_scorer_all_components(self):
     n_attr = len(self.iris.domain.attributes)
     pca = PCA()
     scores = pca.score_data(self.iris)
     self.assertEqual(scores.shape, (n_attr, n_attr))