def test_PCA_scorer_component(self):
    """The score matrix must have one row per requested component."""
    projector = PCA()
    n_attrs = len(self.zoo.domain.attributes)
    for n_comp in range(1, n_attrs + 1):
        projector.component = n_comp
        score_matrix = projector.score_data(self.zoo)
        self.assertEqual(score_matrix.shape, (projector.component, n_attrs))
def test_max_components(self):
    """PCA keeps all components by default and honours n_components."""
    rng = np.random.RandomState(0)
    data = Table(rng.rand(20, 20))
    full_model = PCA()(data)
    self.assertEqual(len(full_model.explained_variance_ratio_), 20)
    truncated_model = PCA(n_components=10)(data)
    self.assertEqual(len(truncated_model.explained_variance_ratio_), 10)
def test_chain(self):
    """Continuizing up front or via preprocessors yields identical projections."""
    continuized = Continuize()(self.zoo)
    via_raw = PCA(n_components=3)(continuized)(self.zoo)
    via_continuized = PCA(n_components=3)(continuized)(continuized)
    via_preproc = PCA(n_components=3,
                      preprocessors=[Continuize()])(self.zoo)(self.zoo)
    np.testing.assert_almost_equal(via_raw.X, via_continuized.X)
    np.testing.assert_almost_equal(via_raw.X, via_preproc.X)
def test_chain(self):
    """Applying a PCA model through compute_value chains must be consistent."""
    zoo = Orange.data.Table('zoo')
    zoo_continuized = Continuize(zoo)
    first = PCA()(zoo_continuized)(zoo)
    second = PCA()(zoo_continuized)(zoo_continuized)
    third = PCA(preprocessors=[Continuize()])(zoo)(zoo)
    np.testing.assert_almost_equal(first.X, second.X)
    np.testing.assert_almost_equal(first.X, third.X)
def test_PCA_scorer(self):
    """Normalized PCA scores should rank the petal attributes highest on iris."""
    data = Orange.data.Table("iris")
    scorer = PCA(preprocessors=[Normalize()])
    scores = scorer.score_data(data)
    self.assertEqual(len(scores), len(data.domain.attributes))
    top_two = sorted(data.domain.attributes[i].name
                     for i in np.argsort(scores)[-2:])
    self.assertEqual(["petal length", "petal width"], top_two)
    self.assertEqual([round(s, 4) for s in scores],
                     [0.5224, 0.2634, 0.5813, 0.5656])
def test_PCA_scorer(self):
    """First-component PCA scores should favour the petal attributes."""
    data = self.iris
    scorer = PCA(preprocessors=[Normalize()])
    scorer.component = 1
    scores = scorer.score_data(data)
    self.assertEqual(scores.shape[1], len(data.domain.attributes))
    best_two = sorted(data.domain.attributes[i].name
                      for i in np.argsort(scores[0])[-2:])
    self.assertEqual(['petal length', 'petal width'], best_two)
    self.assertEqual([round(s, 4) for s in scores[0]],
                     [0.5224, 0.2634, 0.5813, 0.5656])
def test_PCA_scorer(self):
    """Scores from a normalized PCA scorer must match the known iris values."""
    data = Orange.data.Table('iris')
    scorer = PCA(preprocessors=[Normalize()])
    scores = scorer.score_data(data)
    self.assertEqual(len(scores), len(data.domain.attributes))
    highest = sorted(data.domain.attributes[i].name
                     for i in np.argsort(scores)[-2:])
    self.assertEqual(['petal length', 'petal width'], highest)
    self.assertEqual([round(s, 4) for s in scores],
                     [0.5224, 0.2634, 0.5813, 0.5656])
def test_improved_randomized_pca_properly_called(self):
    """The "randomized" solver must dispatch to pca.randomized_pca; "arpack" must not."""
    # The matrix content is irrelevant; only the solver dispatch matters.
    values = np.random.normal(0, 1, (100, 20))
    table = Table.from_numpy(Domain.from_numpy(values), values)

    pca.randomized_pca = MagicMock(wraps=pca.randomized_pca)

    PCA(10, svd_solver="randomized", random_state=42)(table)
    pca.randomized_pca.assert_called_once()

    pca.randomized_pca.reset_mock()
    PCA(10, svd_solver="arpack", random_state=42)(table)
    pca.randomized_pca.assert_not_called()
def pca_preprocessing(self):
    """Project self.data onto self.pca_components components, caching the result."""
    cached = self.pca_data
    if cached is not None and cached.X.shape[1] == self.pca_components:
        # The cached projection already has the requested dimensionality.
        return
    projector = PCA(n_components=self.pca_components, random_state=0)
    model = projector(self.data)
    self.pca_data = model(self.data)
def pca_preprocessing(data, n_components, normalize):
    """Return `data` projected onto its first `n_components` principal components.

    When `normalize` is true, a Normalize preprocessor is appended so the
    features are standardized before the projection is fitted.
    """
    pca = PCA(n_components=n_components, random_state=0)
    if normalize:
        pca.preprocessors += (preprocess.Normalize(),)
    return pca(data)(data)
def run_models(grid_y, grid_x): X_train, Y_train = create_training_data( grid_x, grid_y) # X and Y is the inputs and target data = Table(X_train, Y_train) # creating a Orange table combining both X and Y feature_method = og.preprocess.score.UnivariateLinearRegression( ) # feature selection selector = og.preprocess.SelectBestFeatures( method=feature_method, k=50) # taking 50 features out of 216 out_data2 = selector(data) # this is the new dataset with 50 features pca = PCA(n_components=5) # PCA with 5 components model = pca(out_data2) train = model(out_data2) temp = [] temp.append(pca.domain) for arr in model.components_: temp.append(list(arr)) # temp.append(model.components_) np.savetxt('pca/' + str(grid_x) + '_' + str(grid_y) + '.csv', np.array(temp), delimiter=',', fmt='%s')
def _compute_pca_projection(self):
    """Lazily compute the PCA projection of self.data when PCA is enabled."""
    if self.pca_projection is not None or not self.apply_pca:
        return
    self.setStatusMessage('Computing PCA...')
    projector = PCA(n_components=self.pca_components, random_state=0)
    model = projector(self.data)
    self.pca_projection = model(self.data)
def setUp(self): self.widget = self.create_widget(OWRank) # type: OWRank self.iris = Table("iris") self.housing = Table("housing") self.log_reg = LogisticRegressionLearner() self.lin_reg = LinearRegressionLearner() self.pca = PCA()
def test_learner_with_transformation(self):
    """Feature scores must be computable on a PCA-transformed domain."""
    from Orange.projection import PCA
    forest = RandomForestLearner(random_state=0)
    iris = Table("iris")
    projected = PCA(n_components=2)(iris)(iris)
    scores = forest.score_data(projected)
    np.testing.assert_almost_equal(scores, [[0.7760495, 0.2239505]])
def _reduce_dimensions(data, method="MDS", use_cosine=False):
    """Project high-dimensional embeddings down to two dimensions.

    Parameters
    ----------
    data: Orange.data.Table
        The image embeddings (vectors of length 2048).
    method: string
        One of "MDS", "PCA" or "TSNE" (default MDS).
    use_cosine: bool
        Precompute cosine distances and pass them to MDS.

    Returns
    -------
    array-like
        The data, reduced to 2 dimensions.
    """
    if method == "MDS":
        if use_cosine:
            projector = MDS(n_init=1, dissimilarity="precomputed")
            return projector(Cosine(data)).embedding_
        projector = MDS(n_init=1, init_type="PCA")
        return projector(data).embedding_
    if method == "PCA":
        return PCA(n_components=2)(data)(data)
    if method == "TSNE":
        return TSNE(init="pca")(data).embedding_
def test_transformed_domain_does_not_pickle_data(self):
    """Unpickling a projected table must not carry the cached source data along.

    Fix: the original assigned ``pca_iris2 = Table(pca_iris.domain, iris)``
    and then immediately overwrote it with the pickle round-trip — dead
    code left over from ``test_compute_value``; it is removed here.
    """
    iris = self.iris
    pca = PCA(n_components=2)(iris)
    pca_iris = pca(iris)
    # Round-trip through pickle; compute_value must drop its cached
    # transformed data instead of serializing it.
    restored = pickle.loads(pickle.dumps(pca_iris))
    self.assertIsNone(restored.domain[0].compute_value.transformed)
def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
    """Fit randomized-solver PCA and validate variance, shapes and projection."""
    model = PCA(n_components=n_com, svd_solver='randomized')(data)
    explained = np.sum(model.explained_variance_ratio_)
    self.assertGreaterEqual(explained, min_xpl_var)
    self.assertEqual(n_com, model.n_components)
    self.assertEqual((n_com, data.X.shape[1]), model.components_.shape)
    # Projecting the centered data manually must agree with the model output.
    expected = np.dot(data.X - model.mean_, model.components_.T)
    np.testing.assert_almost_equal(model(data).X, expected)
def test_compute_value(self):
    """A PCA domain must reproduce its projection via compute_value and survive pickling."""
    iris = self.iris
    model = PCA(n_components=2)(iris)
    projected = model(iris)
    rebuilt = Table(projected.domain, iris)
    np.testing.assert_almost_equal(projected.X, rebuilt.X)
    np.testing.assert_equal(projected.Y, rebuilt.Y)
    restored = pickle.loads(pickle.dumps(projected))
    np.testing.assert_almost_equal(projected.X, restored.X)
    np.testing.assert_equal(projected.Y, restored.Y)
def init_projection(self):
    """Instantiate the projector matching the selected placement, then delegate."""
    if self.placement == Placement.Circular:
        self.projector = CircularPlacement()
    elif self.placement == Placement.LDA:
        self.projector = LDA(solver="eigen", n_components=2)
    elif self.placement == Placement.PCA:
        self.projector = PCA(n_components=2)
        self.projector.component = 2
        # Extend PCA's default preprocessing chain with normalization.
        self.projector.preprocessors = PCA.preprocessors + [Normalize()]
    super().init_projection()
def _get_pca(self):
    """Fit a 2-component normalized PCA on self.data.

    Returns a (valid_mask, coords, axes) triple: per-row validity mask,
    2-D projected coordinates, and the component axes rescaled to fit
    within the data's radius.
    """
    pca_projector = PCA(n_components=2)
    pca_projector.component = 2
    pca_projector.preprocessors = PCA.preprocessors + [Normalize()]
    pca = pca_projector(self.data)
    variance_ratio = pca.explained_variance_ratio_
    cumulative = np.cumsum(variance_ratio)
    self._pca = pca
    if not np.isfinite(cumulative[-1]):
        # Non-finite total explained variance -> degenerate projection.
        self.Warning.trivial_components()
    coords = pca(self.data).X
    # Rows containing NaN coordinates cannot be plotted.
    valid_mask = ~np.isnan(coords).any(axis=1)
    # scale axes
    max_radius = np.min([np.abs(np.min(coords, axis=0)),
                         np.max(coords, axis=0)])
    axes = pca.components_.T.copy()
    axes *= max_radius / np.max(np.linalg.norm(axes, axis=1))
    return valid_mask, coords, axes
def add_embedding(corpus: Corpus) -> Corpus:
    """Attach 2-D PCA coordinates of the bag-of-words space as meta attributes."""
    bow = BowVectorizer().transform(corpus)
    projection = PCA(n_components=2)(bow)(bow)
    extended_domain = Domain(
        bow.domain.attributes,
        bow.domain.class_vars,
        chain(bow.domain.metas, projection.domain.attributes))
    return corpus.transform(extended_domain)
def pca_preprocessing(data, pca_components):
    """Project `data` onto its first principal components.

    :param data: input data table to project
    :param pca_components: number of principal components to keep
    :return: the projected data table
    """
    pca = PCA(n_components=pca_components, random_state=0)
    model = pca(data)
    pca_data = model(data)
    return pca_data
def test_improved_randomized_pca_sparse_data(self):
    """Randomized PCA on sparse data should match the full solver on dense data."""
    random_state = check_random_state(42)

    # Let's take a tall, skinny matrix
    x_ = random_state.negative_binomial(1, 0.5, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

    pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
    rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

    np.testing.assert_almost_equal(pca.components_, rpca.components_, decimal=8)
    np.testing.assert_almost_equal(pca.explained_variance_, rpca.explained_variance_, decimal=8)
    np.testing.assert_almost_equal(pca.singular_values_, rpca.singular_values_, decimal=8)

    # And take a short, fat matrix
    x_ = random_state.negative_binomial(1, 0.5, (20, 100))
    x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

    pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
    rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

    np.testing.assert_almost_equal(pca.components_, rpca.components_, decimal=8)
    np.testing.assert_almost_equal(pca.explained_variance_, rpca.explained_variance_, decimal=8)
    np.testing.assert_almost_equal(pca.singular_values_, rpca.singular_values_, decimal=8)
def pca_preprocessing(self):
    """Perform PCA preprocessing before passing off the data to t-SNE."""
    if self.pca_data is not None:
        # Projection already computed.
        return
    projector = PCA(n_components=self.pca_components, random_state=0)
    # Optionally standardize the features before fitting the projection.
    if self.normalize:
        projector.preprocessors += (preprocess.Normalize(),)
    self.pca_data = projector(self.data)(self.data)
def preprocess(corpus: Corpus) -> Corpus:
    """Tokenize and filter the corpus, then attach 2-D PCA meta coordinates."""
    steps = (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
             StopwordsFilter("English"), FrequencyFilter(0.1))
    for step in steps:
        corpus = step(corpus)
    bow = BowVectorizer().transform(corpus)
    projection = PCA(n_components=2)(bow)(bow)
    enriched_domain = Domain(
        bow.domain.attributes,
        bow.domain.class_vars,
        chain(bow.domain.metas, projection.domain.attributes))
    return corpus.transform(enriched_domain)
def __ipca_test_helper(self, data, n_com, min_xpl_var):
    """Fit IncrementalPCA on every second row and validate the model.

    Checks explained variance, component shape, the manual projection,
    and that a partial_fit on the held-out rows converges towards the
    batch PCA solution on the full data.
    """
    pca = IncrementalPCA(n_components=n_com)
    pca_model = pca(data[::2])  # fit on even rows only
    pca_xpl_var = np.sum(pca_model.explained_variance_ratio_)
    self.assertGreaterEqual(pca_xpl_var + 1e-6, min_xpl_var)
    self.assertEqual(n_com, pca_model.n_components)
    self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
    # Manual projection of centered data must match the model's transform.
    proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
    np.testing.assert_almost_equal(pca_model(data).X, proj)
    # Both first components are unit norm, but after seeing only half the
    # rows the incremental direction should not yet align with batch PCA.
    pc1_ipca = pca_model.components_[0]
    self.assertAlmostEqual(np.linalg.norm(pc1_ipca), 1)
    pc1_pca = PCA(n_components=n_com)(data).components_[0]
    self.assertAlmostEqual(np.linalg.norm(pc1_pca), 1)
    self.assertNotAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 2)
    # After partial_fit on the remaining rows the directions (nearly) agree.
    pc1_ipca = pca_model.partial_fit(data[1::2]).components_[0]
    self.assertAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 4)
def _init_data(cls):
    """Load the AML dataset and build reference/secondary class fixtures."""
    data_path = "https://datasets.orange.biolab.si/sc/aml-1k.tab.gz"
    table_data = Table(data_path)
    table_data.attributes[TAX_ID] = "9606"
    ref_data = table_data[::2]
    projection = PCA(n_components=2)(ref_data)(ref_data)
    domain = Domain(
        ref_data.domain.attributes,
        ref_data.domain.class_vars,
        chain(ref_data.domain.metas, projection.domain.attributes),
    )
    cls.data = ref_data.transform(domain)
    cls.reference_data = ref_data
    cls.secondary_data = table_data[1:200:2]
def run_models(grid_y, grid_x): X_train, Y_train = create_training_data( grid_x, grid_y) # X and Y is the inputs and target data = Table(X_train, Y_train) # creating a Orange table combining both X and Y feature_method = og.preprocess.score.UnivariateLinearRegression( ) # feature selection selector = og.preprocess.SelectBestFeatures( method=feature_method, k=50) # taking 50 features out of 216 out_data2 = selector(data) # this is the new dataset with 50 features pca = PCA(n_components=5) # PCA with 5 components model = pca(out_data2) train2 = model(out_data2) featuresIndex = set() for comp in range(len(model.components_) - 1, 0, -1): top2 = (-np.array(model.components_[comp])).argsort()[:2] featuresIndex |= set(top2) top2 = (-np.array(model.components_[0])).argsort()[:13] f_index = 0 while (len(featuresIndex) != 13): featuresIndex.add(top2[f_index]) f_index += 1 ind = np.array(list(featuresIndex)) # train = Table(list(out_data2[:,ind]), Y_train) # print(train) store = np.array(pca.domain)[ind] # print(store) np.savetxt('unlucky13/' + str(grid_x) + '_' + str(grid_y) + '.csv', store, delimiter=',', fmt='%s')
def run_on_data(data, pca_components, k_neighbors, metric, resolution, state):
    # type: (Table, Optional[int], int, str, float, TaskState) -> Results
    """
    Run the louvain clustering on `data`.

    state is used to report progress and partial results. Returns early
    if `task.is_interuption_requested()` returns true.

    Parameters
    ----------
    data : Table
        Data table
    pca_components : Optional[int]
        If not `None` then the data is first projected onto first
        `pca_components` principal components.
    k_neighbors : int
        Passed to `table_to_knn_graph`
    metric : str
        Passed to `table_to_knn_graph`
    resolution : float
        Passed to `Louvain`
    state : TaskState

    Returns
    -------
    res : Results
    """
    state = state  # type: TaskState
    res = Results(
        pca_components=pca_components,
        k_neighbors=k_neighbors,
        metric=metric,
        resolution=resolution,
    )
    step = 0
    if state.is_interuption_requested():
        return res
    # The optional PCA step contributes one extra unit of reported progress.
    if pca_components is not None:
        steps = 3
        state.set_status("Computing PCA...")
        pca = PCA(n_components=pca_components, random_state=0)
        data = res.pca_projection = pca(data)(data)
        assert isinstance(data, Table)
        state.set_partial_results(("pca_projection", res.pca_projection))
        step += 1
    else:
        steps = 2
    if state.is_interuption_requested():
        return res
    state.set_progress_value(100. * step / steps)
    state.set_status("Building graph...")

    def pcallback(val):
        # Interleave progress reporting with cooperative cancellation.
        state.set_progress_value((100. * step + 100 * val) / steps)
        if state.is_interuption_requested():
            raise InteruptRequested()

    try:
        res.graph = graph = table_to_knn_graph(
            data, k_neighbors=k_neighbors, metric=metric,
            progress_callback=pcallback)
    except InteruptRequested:
        return res
    state.set_partial_results(("graph", res.graph))
    step += 1
    state.set_progress_value(100 * step / steps)
    state.set_status("Detecting communities...")
    if state.is_interuption_requested():
        return res
    louvain = Louvain(resolution=resolution, random_state=0)
    res.partition = louvain.fit_predict(graph)
    state.set_partial_results(("partition", res.partition))
    return res
def _init_projector(self):
    """Create the shared PCA projector and remember its default preprocessors."""
    projector = PCA(n_components=MAX_COMPONENTS, random_state=0)
    projector.component = self.ncomponents
    self._pca_projector = projector
    self._pca_preprocessors = PCA.preprocessors
def test_PCA_scorer_all_components(self):
    """With no component limit, the score matrix is square in the attribute count."""
    n_attr = len(self.iris.domain.attributes)
    score_matrix = PCA().score_data(self.iris)
    self.assertEqual(score_matrix.shape, (n_attr, n_attr))