def test_from_numpy_names(self):
    d = Domain.from_numpy(np.zeros((1, 5)))
    self.assertTrue(d.anonymous)
    self.assertEqual([var.name for var in d.attributes],
                     ["Feature {}".format(i) for i in range(1, 6)])

    d = Domain.from_numpy(np.zeros((1, 99)))
    self.assertTrue(d.anonymous)
    self.assertEqual([var.name for var in d.attributes],
                     ["Feature {:02}".format(i) for i in range(1, 100)])

    d = Domain.from_numpy(np.zeros((1, 100)))
    self.assertTrue(d.anonymous)
    self.assertEqual([var.name for var in d.attributes],
                     ["Feature {:03}".format(i) for i in range(1, 101)])

    d = Domain.from_numpy(np.zeros((1, 1)))
    self.assertTrue(d.anonymous)
    self.assertEqual(d.attributes[0].name, "Feature")

    d = Domain.from_numpy(np.zeros((1, 3)), np.zeros((1, 1)),
                          np.zeros((1, 100)))
    self.assertTrue(d.anonymous)
    self.assertEqual([var.name for var in d.attributes],
                     ["Feature {}".format(i) for i in range(1, 4)])
    self.assertEqual(d.class_var.name, "Target")
    self.assertEqual([var.name for var in d.metas],
                     ["Meta {:03}".format(i) for i in range(1, 101)])
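# A quick usage sketch of the naming scheme the test above asserts (grounded
# in the test itself, assuming Orange3 is installed): anonymous domains name
# columns "Feature N" / "Target" / "Meta N", zero-padding indices to the
# width of the largest index.
import numpy as np
from Orange.data import Domain

d = Domain.from_numpy(np.zeros((1, 3)), np.zeros((1, 1)))
print([v.name for v in d.attributes])  # ['Feature 1', 'Feature 2', 'Feature 3']
print(d.class_var.name)                # 'Target'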
def test_do_not_recluster_on_same_data(self):
    """Do not recluster data points when targets or metas change."""
    # Prepare some dummy data
    x = np.eye(5)
    y1, y2 = np.ones((5, 1)), np.ones((5, 2))
    meta1, meta2 = np.ones((5, 1)), np.ones((5, 2))

    table1 = Table.from_numpy(
        domain=Domain.from_numpy(X=x, Y=y1, metas=meta1),
        X=x, Y=y1, metas=meta1,
    )
    # X is same, should not cause update
    table2 = Table.from_numpy(
        domain=Domain.from_numpy(X=x, Y=y2, metas=meta2),
        X=x, Y=y2, metas=meta2,
    )
    # X is different, should cause update
    table3 = table1.copy()
    table3.X[:, 0] = 1

    with patch.object(self.widget, '_invalidate_output') as commit:
        self.send_signal(self.widget.Inputs.data, table1)
        self.commit_and_wait()
        call_count = commit.call_count

        # Sending data with same X should not recompute the clustering
        self.send_signal(self.widget.Inputs.data, table2)
        self.commit_and_wait()
        self.assertEqual(call_count, commit.call_count)

        # Sending data with different X should recompute the clustering
        self.send_signal(self.widget.Inputs.data, table3)
        self.commit_and_wait()
        self.assertEqual(call_count + 1, commit.call_count)
def test_do_not_recluster_on_same_data(self):
    """Do not recluster data points when targets or metas change."""
    # Prepare some dummy data
    x = np.eye(5)
    y1, y2 = np.ones((5, 1)), np.ones((5, 2))
    meta1, meta2 = np.ones((5, 1)), np.ones((5, 2))

    table1 = Table.from_numpy(
        domain=Domain.from_numpy(X=x, Y=y1, metas=meta1),
        X=x, Y=y1, metas=meta1,
    )
    # X is same, should not cause update
    table2 = Table.from_numpy(
        domain=Domain.from_numpy(X=x, Y=y2, metas=meta2),
        X=x, Y=y2, metas=meta2,
    )
    # X is different, should cause update
    table3 = table1.copy()
    table3.X[:, 0] = 1

    with patch.object(self.widget, 'commit') as commit:
        self.send_signal(self.widget.Inputs.data, table1)
        self.commit_and_wait()
        call_count = commit.call_count

        # Sending data with same X should not recompute the clustering
        self.send_signal(self.widget.Inputs.data, table2)
        self.commit_and_wait()
        self.assertEqual(call_count, commit.call_count)

        # Sending data with different X should recompute the clustering
        self.send_signal(self.widget.Inputs.data, table3)
        self.commit_and_wait()
        self.assertEqual(call_count + 1, commit.call_count)
def test_do_not_recluster_on_same_data(self):
    """Do not recluster data points when targets or metas change."""
    # Prepare some dummy data
    x = np.eye(5)
    y1, y2 = np.ones((5, 1)), np.ones((5, 2))
    meta1, meta2 = np.ones((5, 1)), np.ones((5, 2))

    table1 = Table.from_numpy(
        domain=Domain.from_numpy(X=x, Y=y1, metas=meta1),
        X=x, Y=y1, metas=meta1,
    )
    # X is same, should not cause update
    table2 = Table.from_numpy(
        domain=Domain.from_numpy(X=x, Y=y2, metas=meta2),
        X=x, Y=y2, metas=meta2,
    )
    # X is different, should cause update
    table3 = table1.copy()
    with table3.unlocked():
        table3.X[:, 0] = 1

    with patch.object(self.widget, 'unconditional_commit') as commit:
        self.send_signal(self.widget.Inputs.data, table1)
        self.commit_and_wait()
        commit.reset_mock()

        # Sending data with same X should not recompute the clustering
        self.send_signal(self.widget.Inputs.data, table2)
        commit.assert_not_called()

        # Sending data with different X should recompute the clustering
        self.send_signal(self.widget.Inputs.data, table3)
        commit.assert_called_once()
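# In sketch form, the guard the three test variants above exercise. All
# names here are hypothetical, not the widget's actual attributes: the idea
# is simply to cache the previous X matrix and recluster only when it changes.
import numpy as np

class _ClusterCacheSketch:
    def __init__(self):
        self._cached_x = None
        self.recluster_calls = 0

    def set_data(self, x):
        if self._cached_x is not None and np.array_equal(self._cached_x, x):
            return  # identical features: reuse the existing clustering
        self._cached_x = x
        self.recluster_calls += 1  # stands in for the real clustering step

w = _ClusterCacheSketch()
w.set_data(np.eye(5))
w.set_data(np.eye(5))   # same X: no recompute
assert w.recluster_calls == 1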
def test_from_numpy_names(self):
    for n_cols, name in [
        (5, "Feature {}"),
        (99, "Feature {:02}"),
        (100, "Feature {:03}"),
    ]:
        d = Domain.from_numpy(np.zeros((1, n_cols)))
        self.assertTrue(d.anonymous)
        self.assertEqual(
            [var.name for var in d.attributes],
            [name.format(i) for i in range(1, n_cols + 1)],
        )

    d = Domain.from_numpy(np.zeros((1, 1)))
    self.assertTrue(d.anonymous)
    self.assertEqual(d.attributes[0].name, "Feature")

    d = Domain.from_numpy(np.zeros((1, 3)), np.zeros((1, 1)), np.zeros((1, 100)))
    self.assertTrue(d.anonymous)
    self.assertEqual(
        [var.name for var in d.attributes],
        ["Feature {}".format(i) for i in range(1, 4)],
    )
    self.assertEqual(d.class_var.name, "Target")
    self.assertEqual(
        [var.name for var in d.metas],
        ["Meta {:03}".format(i) for i in range(1, 101)],
    )
def test_from_numpy_dimensions(self):
    d = Domain.from_numpy(np.zeros((1, 1)), np.zeros(5))
    self.assertTrue(d.anonymous)
    self.assertEqual(len(d.class_vars), 1)

    d = Domain.from_numpy(np.zeros((1, 1)), np.zeros((5, 1)))
    self.assertTrue(d.anonymous)
    self.assertEqual(len(d.class_vars), 1)

    self.assertRaises(ValueError, Domain.from_numpy, np.zeros(2))
    self.assertRaises(ValueError, Domain.from_numpy, np.zeros((2, 2, 2)))
    self.assertRaises(ValueError, Domain.from_numpy,
                      np.zeros((2, 2)), np.zeros((2, 2, 2)))
def test_from_numpy_values(self):
    d = Domain.from_numpy(np.zeros((1, 1)), np.arange(1, 3).reshape(2, 1))
    self.assertTrue(d.anonymous)
    self.assertIsInstance(d.class_var, ContinuousVariable)

    d = Domain.from_numpy(np.zeros((1, 1)), np.arange(2).reshape(2, 1))
    self.assertTrue(d.anonymous)
    self.assertIsInstance(d.class_var, DiscreteVariable)
    self.assertEqual(d.class_var.values,
                     ["v{}".format(i) for i in range(1, 3)])

    d = Domain.from_numpy(np.zeros((1, 1)), np.arange(18, 23).reshape(5, 1))
    self.assertTrue(d.anonymous)
    self.assertIsInstance(d.class_var, ContinuousVariable)
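# What the assertions above imply about type inference (a sketch of observed
# behaviour, not the full heuristic): a {0, 1} column becomes a two-valued
# DiscreteVariable with generated values "v1", "v2", while other numeric
# columns fall back to ContinuousVariable.
import numpy as np
from Orange.data import Domain, DiscreteVariable

d = Domain.from_numpy(np.zeros((1, 1)), np.array([[0.0], [1.0]]))
assert isinstance(d.class_var, DiscreteVariable)
assert list(d.class_var.values) == ["v1", "v2"]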
def run_cn2(Xtr, Ytr, Xt, Yt, lb, k=None, log=None):
    domainx = Domain.from_numpy(Xtr.values)
    domainy = Domain.from_numpy(Ytr.values.reshape((-1, 1)))
    datax = Orange.data.Table.from_numpy(domainx, Xtr.values)
    datay = Orange.data.Table.from_numpy(domainy, Ytr.values.reshape((-1, 1)))
    discretizer = Orange.preprocess.DomainDiscretizer()
    domainx = discretizer(datax)
    domainy = discretizer(datay)
    domain = Domain(domainx.attributes, domainy.attributes[0])
    data = Orange.data.Table.from_numpy(domain, Xtr.values, Y=Ytr.values)

    learner = Orange.classification.CN2UnorderedLearner()
    # learner = Orange.classification.rules.CN2Learner()
    learner.rule_finder.search_algorithm.beam_width = 10
    learner.rule_finder.search_strategy.constrain_continuous = True
    learner.rule_finder.general_validator.min_covered_examples = 15
    cn2 = learner(data)

    if k is not None:
        # Keep the first k rules plus the trailing default rule
        r_def = cn2.rule_list[-1]
        cn2.rule_list = cn2.rule_list[:k]
        cn2.rule_list.append(r_def)

    Y_pred = np.argmax(cn2.predict(Xt.values), axis=1)
    ids = np.arange(Xtr.shape[0])
    print('default:', cn2.rule_list[-1].prediction)

    # Skip the last default rule
    for i, r in enumerate(cn2.rule_list[:-1]):
        cov = np.array([r.evaluate_instance(x) for x in data])
        pred = np.array([r.prediction] * sum(cov))
        acc = pred == Ytr.values[cov]
        r.covered = set(ids[cov])
        print('CN2', '#{}, label:{}, len:{}, cov:{}, acc:{}'.format(
            i, r.prediction, r.length, sum(cov) / len(ids),
            sum(acc) / sum(cov)))

    if log is None:
        from logger import log
    log('cn2-k', len(cn2.rule_list[:-1]))
    for i, r in enumerate(cn2.rule_list[:-1]):
        log('cn2-nconds', r.length, i)
    log('cn2-auc', roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred)))
    log('cn2-bacc', balanced_accuracy_score(Yt, Y_pred))
    log('cn2-disp', dispersion_(cn2.rule_list[:-1], average=True))
    log('cn2-overlap', overlap(cn2.rule_list[:-1]))
    print(confusion_matrix(Yt, Y_pred))
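# Hypothetical call sketch for run_cn2. The function indexes `.values`, so
# pandas objects are assumed for X/Y, and `lb` behaves like sklearn's
# LabelBinarizer; all data below is synthetic and illustrative only. Passing
# log=print avoids the project-specific `from logger import log` fallback.
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

Xtr = pd.DataFrame(np.random.rand(200, 4))
Ytr = pd.Series(np.random.randint(0, 2, 200))
Xt = pd.DataFrame(np.random.rand(50, 4))
Yt = pd.Series(np.random.randint(0, 2, 50))
lb = LabelBinarizer().fit(Ytr)
run_cn2(Xtr, Ytr, Xt, Yt, lb, k=5, log=print)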
def __into_orange_table(self, attrs, X, meta_parts):
    if not attrs and X.shape[1]:
        attrs = Domain.from_numpy(X).attributes
    try:
        metas = None
        M = None
        if meta_parts:
            meta_parts = [
                df_.reset_index() if not df_.index.is_integer() else df_
                for df_ in meta_parts
            ]
            metas = [
                StringVariable.make(name)
                for name in chain(*(_.columns for _ in meta_parts))
            ]
            M = np.hstack(tuple(df_.values for df_ in meta_parts))
        domain = Domain(attrs, metas=metas)
        table = Table.from_numpy(domain, X, None, M)
    except ValueError:
        table = None
        rows = self.leading_cols if self.transposed else self.leading_rows
        cols = self.leading_rows if self.transposed else self.leading_cols
        self.errors["inadequate_headers"] = (rows, cols)
    return table
def test_latlon_detection_heuristic(self):
    xy = np.c_[np.random.uniform(-180, 180, 100),
               np.random.uniform(-90, 90, 100)]
    data = Table.from_numpy(Domain.from_numpy(xy), xy)
    self.widget.set_data(data)
    self.assertIn(self.widget.lat_attr, data.domain)
    self.assertIn(self.widget.lon_attr, data.domain)
def test_from_numpy_values(self):
    for aran_min, aran_max, vartype in [(1, 3, ContinuousVariable),
                                        (0, 2, DiscreteVariable),
                                        (18, 23, ContinuousVariable)]:
        n_rows, n_cols = aran_max - aran_min, 1
        d = Domain.from_numpy(
            np.zeros((1, 1)),
            np.arange(aran_min, aran_max).reshape(n_rows, n_cols))
        self.assertTrue(d.anonymous)
        self.assertIsInstance(d.class_var, vartype)
        # `vartype` is a class, so the original `isinstance(vartype,
        # DiscreteVariable)` check was always False; compare classes instead.
        if vartype is DiscreteVariable:
            self.assertEqual(d.class_var.values,
                             ["v{}".format(i) for i in range(1, 3)])
def test_anova(self):
    nrows, ncols = 500, 5
    X = np.random.rand(nrows, ncols)
    y = 4 + (-3 * X[:, 1] + X[:, 3]) // 2
    domain = Domain.from_numpy(X, y)
    domain = Domain(domain.attributes,
                    DiscreteVariable('c', values=np.unique(y)))
    data = Table(domain, X, y)
    scorer = ANOVA()
    sc = [scorer(data, a) for a in range(ncols)]
    self.assertTrue(np.argmax(sc) == 1)
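# A broader usage sketch: with feature=None, Orange scorers return scores for
# all attributes at once (the same calling convention the RReliefF tests
# further below use). Assumes the bundled "iris" dataset and the same ANOVA
# import as the test above.
from Orange.data import Table

iris = Table("iris")
scores = ANOVA()(iris, None)
ranked = sorted(zip(scores, (a.name for a in iris.domain.attributes)),
                reverse=True)
print(ranked)  # highest-scoring attributes first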
def test_anova(self):
    nrows, ncols = 500, 5
    X = np.random.rand(nrows, ncols)
    y = 4 + (-3*X[:, 1] + X[:, 3]) // 2
    domain = Domain.from_numpy(X, y)
    domain = Domain(domain.attributes,
                    DiscreteVariable('c', values=np.unique(y)))
    data = Table(domain, X, y)
    scorer = score.ANOVA()
    sc = [scorer(data, a) for a in range(ncols)]
    self.assertTrue(np.argmax(sc) == 1)
def test_missing_values_with_no_pca_preprocessing(self):
    data = np.ones((5, 5))
    data[range(5), range(5)] = np.nan
    np.random.shuffle(data)
    table = Table.from_numpy(domain=Domain.from_numpy(X=data), X=data)

    self.send_signal(self.widget.Inputs.data, table)
    self.widget.apply_pca = False
    self.widget.commit(force=True)
    self.assertTrue(self.widget.Error.data_has_nans.is_shown())
def test_improved_randomized_pca_properly_called(self):
    # It doesn't matter what we put into the matrix
    x_ = np.random.normal(0, 1, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_)

    pca.randomized_pca = MagicMock(wraps=pca.randomized_pca)

    PCA(10, svd_solver="randomized", random_state=42)(x)
    pca.randomized_pca.assert_called_once()

    pca.randomized_pca.reset_mock()
    PCA(10, svd_solver="arpack", random_state=42)(x)
    pca.randomized_pca.assert_not_called()
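# An equivalent sketch using mock.patch.object as a context manager, so the
# wrapping is undone automatically even if the test fails (same `pca` module
# object and table `x` as in the test above):
from unittest.mock import patch

with patch.object(pca, "randomized_pca", wraps=pca.randomized_pca) as mocked:
    PCA(10, svd_solver="randomized", random_state=42)(x)
    mocked.assert_called_once()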
def test_chi2(self):
    nrows, ncols = 500, 5
    X = np.random.randint(4, size=(nrows, ncols))
    y = 10 + (-3 * X[:, 1] + X[:, 3]) // 2
    domain = Domain.from_numpy(X, y)
    domain = Domain(domain.attributes,
                    DiscreteVariable('c', values=np.unique(y)))
    table = Table(domain, X, y)
    data = preprocess.Discretize()(table)
    scorer = Chi2()
    sc = [scorer(data, a) for a in range(ncols)]
    self.assertTrue(np.argmax(sc) == 1)
def test_chi2(self):
    nrows, ncols = 500, 5
    X = np.random.randint(4, size=(nrows, ncols))
    y = 10 + (-3*X[:, 1] + X[:, 3]) // 2
    domain = Domain.from_numpy(X, y)
    domain = Domain(domain.attributes,
                    DiscreteVariable('c', values=np.unique(y)))
    table = Table(domain, X, y)
    data = preprocess.Discretize()(table)
    scorer = score.Chi2()
    sc = [scorer(data, a) for a in range(ncols)]
    self.assertTrue(np.argmax(sc) == 1)
def test_from_numpy_names(self):
    for n_cols, name in [(5, "Feature {}"), (99, "Feature {:02}"),
                         (100, "Feature {:03}")]:
        d = Domain.from_numpy(np.zeros((1, n_cols)))
        self.assertTrue(d.anonymous)
        self.assertEqual([var.name for var in d.attributes],
                         [name.format(i) for i in range(1, n_cols+1)])

    d = Domain.from_numpy(np.zeros((1, 1)))
    self.assertTrue(d.anonymous)
    self.assertEqual(d.attributes[0].name, "Feature")

    d = Domain.from_numpy(np.zeros((1, 3)), np.zeros((1, 1)),
                          np.zeros((1, 100)))
    self.assertTrue(d.anonymous)
    self.assertEqual([var.name for var in d.attributes],
                     ["Feature {}".format(i) for i in range(1, 4)])
    self.assertEqual(d.class_var.name, "Target")
    self.assertEqual([var.name for var in d.metas],
                     ["Meta {:03}".format(i) for i in range(1, 101)])
def test_improved_randomized_pca_sparse_data(self):
    """Randomized PCA should work well on sparse data."""
    random_state = check_random_state(42)

    # Let's take a tall, skinny matrix
    x_ = random_state.negative_binomial(1, 0.5, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

    pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
    rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

    np.testing.assert_almost_equal(pca.components_, rpca.components_, decimal=8)
    np.testing.assert_almost_equal(
        pca.explained_variance_, rpca.explained_variance_, decimal=8)
    np.testing.assert_almost_equal(
        pca.singular_values_, rpca.singular_values_, decimal=8)

    # And take a short, fat matrix
    x_ = random_state.negative_binomial(1, 0.5, (20, 100))
    x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

    pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
    rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

    np.testing.assert_almost_equal(pca.components_, rpca.components_, decimal=8)
    np.testing.assert_almost_equal(
        pca.explained_variance_, rpca.explained_variance_, decimal=8)
    np.testing.assert_almost_equal(
        pca.singular_values_, rpca.singular_values_, decimal=8)
def test_rrelieff(self):
    X = np.random.random((100, 5))
    y = ((X[:, 0] > .5) ^ (X[:, 1] < .5) - 1).astype(float)
    xor = Table.from_numpy(Domain.from_numpy(X, y), X, y)
    scorer = score.RReliefF()
    weights = scorer(xor, None)
    best = {xor.domain[attr].name for attr in weights.argsort()[-2:]}
    self.assertSetEqual(set(a.name for a in xor.domain.attributes[:2]), best)

    weights = scorer(self.housing, None)
    best = {self.housing.domain[attr].name for attr in weights.argsort()[-6:]}
    for feature in ('LSTAT', 'RM', 'AGE'):
        self.assertIn(feature, best)
def test_improved_randomized_pca_dense_data(self):
    """Randomized PCA should work well on dense data."""
    random_state = check_random_state(42)

    # Let's take a tall, skinny matrix
    x_ = random_state.normal(0, 1, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_)

    pca = PCA(10, svd_solver="full", random_state=random_state)(x)
    rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

    np.testing.assert_almost_equal(
        pca.components_, rpca.components_, decimal=8
    )
    np.testing.assert_almost_equal(
        pca.explained_variance_, rpca.explained_variance_, decimal=8
    )
    np.testing.assert_almost_equal(
        pca.singular_values_, rpca.singular_values_, decimal=8
    )

    # And take a short, fat matrix
    x_ = random_state.normal(0, 1, (20, 100))
    x = Table.from_numpy(Domain.from_numpy(x_), x_)

    pca = PCA(10, svd_solver="full", random_state=random_state)(x)
    rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

    np.testing.assert_almost_equal(
        pca.components_, rpca.components_, decimal=8
    )
    np.testing.assert_almost_equal(
        pca.explained_variance_, rpca.explained_variance_, decimal=8
    )
    np.testing.assert_almost_equal(
        pca.singular_values_, rpca.singular_values_, decimal=8
    )
def test_clusters_ordered_by_size(self):
    """Cluster names should be sorted based on the number of instances."""
    x1 = np.array([[0, 0]] * 20)
    x2 = np.array([[1, 0]] * 15)
    x3 = np.array([[0, 1]] * 10)
    x4 = np.array([[1, 1]] * 5)
    data = np.vstack((x1, x2, x3, x4))
    # Remove any order dependence in data, not that this should affect it
    np.random.shuffle(data)
    table = Table.from_numpy(domain=Domain.from_numpy(X=data), X=data)

    self.send_signal(self.widget.Inputs.data, table)
    self.widget.k_neighbours = 4
    self.widget.commit(force=True)

    output = self.get_output(self.widget.Outputs.annotated_data, wait=1000)
    clustering = output.get_column_view('Cluster')[0].astype(int)
    counts = np.bincount(clustering)
    np.testing.assert_equal(counts, sorted(counts, reverse=True))
def __into_orange_table(self, attrs, X, meta_parts):
    if not attrs and X.shape[1]:
        attrs = Domain.from_numpy(X).attributes
    try:
        metas = None
        M = None
        if meta_parts:
            meta_parts = [
                df_.reset_index() if not df_.index.is_integer() else df_
                for df_ in meta_parts
            ]
            metas, M = self.__guess_metas(meta_parts)
        domain = Domain(attrs, metas=metas)
        table = Table.from_numpy(domain, X, None, M)
    except ValueError:
        table = None
        rows = self.leading_cols if self.transposed else self.leading_rows
        cols = self.leading_rows if self.transposed else self.leading_cols
        self.errors["inadequate_headers"] = (rows, cols)
    return table
def test_clusters_ordered_by_size(self):
    """Cluster names should be sorted based on the number of instances."""
    x1 = np.array([[0, 0]] * 20)
    x2 = np.array([[1, 0]] * 15)
    x3 = np.array([[0, 1]] * 10)
    x4 = np.array([[1, 1]] * 5)
    data = np.vstack((x1, x2, x3, x4))
    # Remove any order dependence in data, not that this should affect it
    np.random.shuffle(data)
    table = Table.from_numpy(domain=Domain.from_numpy(X=data), X=data)

    self.send_signal(self.widget.Inputs.data, table)
    self.widget.k_neighbors = 4
    self.commit_and_wait()

    output = self.get_output(self.widget.Outputs.annotated_data)
    clustering = output.get_column_view('Cluster')[0].astype(int)
    counts = np.bincount(clustering)
    np.testing.assert_equal(counts, sorted(counts, reverse=True))
def test_rrelieff(self):
    X = np.random.random((100, 5))
    y = ((X[:, 0] > 0.5) ^ (X[:, 1] < 0.5) - 1).astype(float)
    xor = Table.from_numpy(Domain.from_numpy(X, y), X, y)
    scorer = RReliefF(random_state=42)
    weights = scorer(xor, None)
    best = {xor.domain[attr].name for attr in weights.argsort()[-2:]}
    self.assertSetEqual(set(a.name for a in xor.domain.attributes[:2]), best)

    weights = scorer(self.housing, None)
    best = {
        self.housing.domain[attr].name for attr in weights.argsort()[-6:]
    }
    for feature in ("LSTAT", "RM"):
        self.assertIn(feature, best)

    np.testing.assert_array_equal(
        RReliefF(random_state=1)(self.housing, None),
        RReliefF(random_state=1)(self.housing, None),
    )
train_disc_pts1 = create_disc_pts(75, 2)
train_disc_pts2 = create_disc_pts(75, 3.5, 2)

plt.figure()
plt.scatter(train_disc_pts1[:, 0], train_disc_pts1[:, 1], c='r')
plt.scatter(train_disc_pts2[:, 0], train_disc_pts2[:, 1], c='b')
bound_ang = np.arange(0, 2 * np.pi, 0.01)
plt.plot(2 * np.cos(bound_ang), 2 * np.sin(bound_ang))
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.show()

train_disc_pts = np.append(train_disc_pts1, train_disc_pts2, axis=0)
train_disc_pt_labels = np.append(np.zeros(75), np.ones(75))
train_disc_data_domain = Domain.from_numpy(train_disc_pts, train_disc_pt_labels)
train_disc_data_tab = Table.from_numpy(train_disc_data_domain,
                                       train_disc_pts, train_disc_pt_labels)

print("###########TASK 5###################")
non_linear_learner = SVMLearner()
eval_results = CrossValidation(train_disc_data_tab, [non_linear_learner], k=10)
# Accuracy of cross validation: 0.960
# AUC: 0.959
print("Accuracy of cross validation: {:.3f}".format(scoring.CA(eval_results)[0]))
print("AUC: {:.3f}".format(scoring.AUC(eval_results)[0]))

print("###########EXERCISE 1###############")
non_linear_learner = SVMLearner()