def test_improved_randomized_pca_dense_data(self):
    """Randomized PCA should match full PCA on dense data."""
    random_state = check_random_state(42)
    compared = ("components_", "explained_variance_", "singular_values_")

    # First a tall, skinny matrix, then a short, fat one.
    for shape in ((100, 20), (20, 100)):
        x_ = random_state.normal(0, 1, shape)
        x = Table.from_numpy(Domain.from_numpy(x_), x_)

        pca = PCA(10, svd_solver="full", random_state=random_state)(x)
        rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)

        for attribute in compared:
            np.testing.assert_almost_equal(
                getattr(pca, attribute), getattr(rpca, attribute), decimal=8
            )
def __call__(self, data, feature=None):
    """Validate ``data`` for this scorer and return ``score_data``'s result.

    The data must have a target variable of type ``self.class_type``.
    When ``feature`` is given, the data is first reduced to that single
    feature plus the class variables. Preprocessors are then applied and
    every remaining attribute must be an instance of ``self.feature_type``.

    Raises:
        ValueError: if the target is missing or of the wrong type, or if
            an attribute of an unsupported type remains after
            preprocessing.
    """
    target = data.domain.class_var
    if not target:
        raise ValueError("{} requires data with a target variable.".format(
            self.friendly_name))
    if not isinstance(target, self.class_type):
        raise ValueError("{} requires a {} target variable.".format(
            self.friendly_name,
            self._friendly_vartype_name(self.class_type)))

    if feature is not None:
        selected = data.domain[feature]
        data = data.transform(Domain([selected], data.domain.class_vars))

    for preprocessor in self.preprocessors:
        data = preprocessor(data)

    for attribute in data.domain.attributes:
        if isinstance(attribute, self.feature_type):
            continue
        raise ValueError("{} cannot score {} variables.".format(
            self.friendly_name,
            self._friendly_vartype_name(type(attribute))))

    return self.score_data(data, feature)
def test_XY_large(self):
    """X and Y of an oversized SQL table need an explicit download."""
    from Orange.data.sql.table import AUTO_DL_LIMIT as DLL

    n_partial = DLL + 10
    mat = np.random.randint(0, 2, (DLL + 100, 3))
    conn, table_name = self.create_sql_table(mat)
    hints = Domain([], DiscreteVariable(name='col2',
                                        values=['0', '1', '2']))
    sql_table = SqlTable(conn, table_name, type_hints=hints)

    # Neither matrix is available before the data is downloaded.
    for matrix_name in ("X", "Y"):
        with self.assertRaises(ValueError):
            getattr(sql_table, matrix_name)
    self.assertRaises(ValueError, sql_table.download_data, n_partial)

    # Download partial data
    sql_table.download_data(n_partial, partial=True)
    assert_almost_equal(sql_table.X, mat[:n_partial, :2])
    assert_almost_equal(sql_table.Y.flatten()[:n_partial],
                        mat[:n_partial, 2])

    # Download all data
    sql_table.download_data()
    assert_almost_equal(sql_table.X, mat[:, :2])
    assert_almost_equal(sql_table.Y.flatten(), mat[:, 2])
def test_from_numpy_names(self):
    """Check the variable names generated by ``Domain.from_numpy``."""

    def names(variables):
        return [var.name for var in variables]

    def numbered(template, count):
        return [template.format(i) for i in range(1, count + 1)]

    # The zero padding of the index depends on the number of columns.
    for n_cols, template in ((5, "Feature {}"),
                             (99, "Feature {:02}"),
                             (100, "Feature {:03}")):
        d = Domain.from_numpy(np.zeros((1, n_cols)))
        self.assertTrue(d.anonymous)
        self.assertEqual(names(d.attributes), numbered(template, n_cols))

    # A single column gets no index at all.
    d = Domain.from_numpy(np.zeros((1, 1)))
    self.assertTrue(d.anonymous)
    self.assertEqual(d.attributes[0].name, "Feature")

    d = Domain.from_numpy(np.zeros((1, 3)), np.zeros((1, 1)),
                          np.zeros((1, 100)))
    self.assertTrue(d.anonymous)
    self.assertEqual(names(d.attributes), numbered("Feature {}", 3))
    self.assertEqual(d.class_var.name, "Target")
    self.assertEqual(names(d.metas), numbered("Meta {:03}", 100))
def test_do_not_recluster_on_same_data(self):
    """Do not recluster data points when targets or metas change."""

    def build_table(x, y, metas):
        domain = Domain.from_numpy(X=x, Y=y, metas=metas)
        return Table.from_numpy(domain=domain, X=x, Y=y, metas=metas)

    # Prepare some dummy data
    x = np.eye(5)
    y1, y2 = np.ones((5, 1)), np.ones((5, 2))
    meta1, meta2 = np.ones((5, 1)), np.ones((5, 2))

    table1 = build_table(x, y1, meta1)
    # Same X as table1: should not cause an update
    table2 = build_table(x, y2, meta2)
    # Different X: should cause an update
    table3 = table1.copy()
    table3.X[:, 0] = 1

    data_input = self.widget.Inputs.data
    with patch.object(self.widget, 'commit') as commit:
        self.send_signal(data_input, table1)
        self.commit_and_wait()
        baseline = commit.call_count

        # Sending data with same X should not recompute the clustering
        self.send_signal(data_input, table2)
        self.commit_and_wait()
        self.assertEqual(baseline, commit.call_count)

        # Sending data with different X should recompute the clustering
        self.send_signal(data_input, table3)
        self.commit_and_wait()
        self.assertEqual(baseline + 1, commit.call_count)
def test_index_error(self):
    """``Domain.index`` rejects unknown or out-of-range indices.

    Integers (Python and NumPy) outside the domain, variables that are
    not in the domain and unknown names must raise ``ValueError``; a
    list must raise ``TypeError``.
    """
    d = Domain((age, gender, income), metas=(ssn, race))
    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; ``np.int_`` yields a real NumPy integer scalar.
    for idx in (3, np.int_(3), -3, np.int_(-3), incomeA, "no_such_thing"):
        with self.assertRaises(ValueError):
            d.index(idx)
    with self.assertRaises(TypeError):
        d.index([2])
def test_copy(self):
    """Changing a variable in a copied domain leaves the original as is."""
    age.number_of_decimals = 5
    domain = Domain((age, gender, income), [race], [ssn])

    duplicate = domain.copy()
    duplicate[age].number_of_decimals = 10

    self.assertEqual(domain[age].number_of_decimals, 5)
    self.assertEqual(duplicate[age].number_of_decimals, 10)
def select_attrs(table, features, class_var=None, class_vars=None, metas=None):
    """
    Build a new table from ``table`` over a domain made of ``features``.

    ``class_var`` is always passed to the new domain; ``class_vars`` is
    passed only when it is not None. ``metas``, when non-empty, are added
    to the new domain before the table is created.
    """
    optional = {} if class_vars is None else {"class_vars": class_vars}
    domain = Domain(features, class_var, **optional)
    if metas:
        domain.add_metas(metas)
    return Table(domain, table)
def test_conversion(self):
    """``Domain.convert`` returns values and metas for a list input."""
    domain = Domain([age, income], [race], [gender, education, ssn])
    expected_values = np.array([42, 13, 0])

    # Without meta values the metas are filled with unknowns.
    values, metas = domain.convert([42, 13, "White"])
    assert_array_equal(values, expected_values)
    assert_array_equal(metas, np.array([Unknown, Unknown, None]))

    # With meta values supplied they are converted as well.
    full_row = [42, 13, "White", "M", "HS", "1234567"]
    values, metas = domain.convert(full_row)
    assert_array_equal(values, expected_values)
    assert_array_equal(metas, np.array([0, 1, "1234567"], dtype=object))
def test_conversion_size(self):
    """``Domain.convert`` rejects lists of an unexpected length."""
    domain = Domain([age, gender, income], [race])
    for size in (3, 5):
        with self.assertRaises(ValueError):
            domain.convert([0] * size)

    domain = Domain([age, income], [race], [gender, education, ssn])
    for size in (2, 4, 7):
        with self.assertRaises(ValueError):
            domain.convert([0] * size)

    # These lengths must be accepted without raising.
    for size in (3, 6):
        domain.convert([0] * size)
def test_from_numpy_dimensions(self):
    """``Domain.from_numpy`` checks the dimensions of its arguments."""
    # A 1-d and a single-column 2-d Y both give one class variable.
    for y in (np.zeros(5), np.zeros((5, 1))):
        d = Domain.from_numpy(np.zeros((1, 1)), y)
        self.assertTrue(d.anonymous)
        self.assertEqual(len(d.class_vars), 1)

    invalid_arguments = (
        (np.zeros(2),),
        (np.zeros((2, 2, 2)),),
        (np.zeros((2, 2)), np.zeros((2, 2, 2))),
    )
    for arguments in invalid_arguments:
        with self.assertRaises(ValueError):
            Domain.from_numpy(*arguments)
def test_conversion(self):
    """``Domain.convert`` splits a list into x, y and metas."""
    domain = Domain([age, income], [race], [gender, education, ssn])

    def check_x_and_y(x, y):
        assert_array_equal(x, np.array([42, 13]))
        assert_array_equal(y, np.array([0]))

    # Missing meta values come back as NaN.
    x, y, metas = domain.convert([42, 13, "White"])
    check_x_and_y(x, y)
    meta_floats = np.array(metas, dtype=float)
    self.assertTrue(all(np.isnan(meta_floats)))

    x, y, metas = domain.convert([42, 13, "White", "M", "HS", "1234567"])
    check_x_and_y(x, y)
    assert_array_equal(metas, np.array([0, 1, "1234567"], dtype=object))
def test_var_from_domain(self):
    """Exercise ``var_from_domain`` with variables and integer indices."""
    d = Domain((age, gender, income), metas=(ssn, race))

    # A variable that is not in the domain is returned unchanged unless
    # the second argument is True.
    self.assertEqual(d.var_from_domain(incomeA), incomeA)
    self.assertEqual(d.var_from_domain(incomeA, False), incomeA)
    self.assertRaises(IndexError, d.var_from_domain, incomeA, True)

    # Integer indices are refused when ``no_index`` is set.
    for index in (1, -1):
        with self.assertRaises(TypeError):
            d.var_from_domain(index, no_index=True)
def test_from_numpy_values(self):
    """The class variable type is chosen from the values in Y."""

    def domain_for_targets(start, stop):
        y = np.arange(start, stop).reshape(stop - start, 1)
        return Domain.from_numpy(np.zeros((1, 1)), y)

    d = domain_for_targets(1, 3)
    self.assertTrue(d.anonymous)
    self.assertIsInstance(d.class_var, ContinuousVariable)

    d = domain_for_targets(0, 2)
    self.assertTrue(d.anonymous)
    self.assertIsInstance(d.class_var, DiscreteVariable)
    expected_values = ["v{}".format(i) for i in range(1, 3)]
    self.assertEqual(d.class_var.values, expected_values)

    d = domain_for_targets(18, 23)
    self.assertTrue(d.anonymous)
    self.assertIsInstance(d.class_var, ContinuousVariable)
def test_has_continuous(self):
    """Check ``has_continuous_attributes`` for several domains and flags."""
    # Each case: (Domain arguments, method arguments, expected result).
    cases = [
        # attributes only
        (([],), (), False),
        (([], [age]), (), False),
        (([], [race]), (), False),
        (([age], None), (), True),
        (([race], None), (), False),
        (([age, race], None), (), True),
        (([race, age], None), (), True),
        # class variables included
        (([], [age]), (True,), True),
        (([], [race]), (True,), False),
        (([age], None), (True,), True),
        (([race], None), (True,), False),
        (([age], race), (True,), True),
        (([race], age), (True,), True),
        (([], [race, age]), (True,), True),
        # metas included
        (([], None, [age]), (False, True), True),
        (([], None, [gender]), (False, True), False),
        (([], [gender], [age]), (True, True), True),
        (([], [race], [gender]), (True, True), False),
    ]
    for domain_args, call_args, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(Domain(*domain_args).has_continuous_attributes(*call_args))
def test_has_discrete(self):
    """Check ``has_discrete_attributes`` for several domains and flags."""
    # Each case: (Domain arguments, method arguments, expected result).
    cases = [
        # attributes only
        (([],), (), False),
        (([], [age]), (), False),
        (([], race), (), False),
        (([age], None), (), False),
        (([race], None), (), True),
        (([age, race], None), (), True),
        (([race, age], None), (), True),
        # class variables included
        (([], [age]), (True,), False),
        (([], [race]), (True,), True),
        (([age], None), (True,), False),
        (([race], None), (True,), True),
        (([age], race), (True,), True),
        (([race], age), (True,), True),
        (([], [race, age]), (True,), True),
        # metas included
        (([], None, [gender]), (False, True), True),
        (([], None, [age]), (False, True), False),
        (([], [age], [gender]), (True, True), True),
        (([], [incomeA], [age]), (True, True), False),
    ]
    for domain_args, call_args, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(Domain(*domain_args).has_discrete_attributes(*call_args))
def test_get_conversion(self):
    """Check the index lists produced by ``Domain.get_conversion``."""
    d = Domain((age, gender, income), metas=(ssn, race))
    e = Domain((gender, race), None, metas=(age, gender, ssn))
    f = Domain((gender,), (race, income), metas=(age, income, ssn))
    g = Domain((), metas=(age, gender, ssn))

    def check(target, source, attributes, class_vars, metas):
        conversion = target.get_conversion(source)
        self.assertIs(conversion.source, source)
        self.assertEqual(conversion.attributes, attributes)
        self.assertEqual(conversion.class_vars, class_vars)
        self.assertEqual(conversion.metas, metas)

    d_to_e = (e, d, [1, -2], [], [0, 1, -1])
    d_to_f = (f, d, [1], [-2, 2], [0, 2, -1])
    # The same conversions are requested repeatedly and interleaved.
    for case in (d_to_e, d_to_e, d_to_f, d_to_e, d_to_f):
        check(*case)

    check(g, f, [], [], [-1, 0, -3])

    x = lambda: 42
    income.compute_value = x
    check(f, g, [-2], [Variable.compute_value, x], [-1, x, -3])
def take(table, indices, axis=0):
    """
    Take values from the ``table`` along the ``axis``.

    With ``axis == 0`` the selected rows (instances) are returned as a
    new table; with ``axis == 1`` the selected columns (variables) are.
    For any other ``axis`` the input table is returned unchanged.
    """
    shape = (len(table), len(table.domain))
    indices = mask_to_indices(indices, shape, axis)

    if axis == 0:
        # Take the rows (instances)
        rows = [table[i] for i in indices]
        return Table(rows) if rows else Table(table.domain)

    if axis == 1:
        # Take the columns (attributes)
        all_variables = table.domain.variables
        selected = [all_variables[i] for i in indices]
        domain = Domain(selected, table.domain.class_var in selected)
        domain.add_metas(table.domain.get_metas())
        return Table(domain, table)

    return table
def test_index(self):
    """``Domain.index`` accepts variables, names and integer indices."""
    d = Domain((age, gender, income), metas=(ssn, race))
    # Each entry: (expected position, keys that must resolve to it).
    lookups = [
        (0, (age, "AGE", 0, np.int_(0))),
        (2, (income, "income", 2, np.int_(2))),
        (-1, (ssn, "SSN", -1, np.int_(-1))),
        (-2, (-2, np.int_(-2))),
    ]
    for position, keys in lookups:
        for key in keys:
            self.assertEqual(d.index(key), position)
def test_latlon_detection_heuristic(self):
    """The widget picks lat/lon attributes from anonymous numeric data."""
    first_column = np.random.uniform(-180, 180, 100)
    second_column = np.random.uniform(-90, 90, 100)
    xy = np.c_[first_column, second_column]
    data = Table.from_numpy(Domain.from_numpy(xy), xy)

    self.widget.set_data(data)

    for chosen in (self.widget.lat_attr, self.widget.lon_attr):
        self.assertIn(chosen, data.domain)
def test_from_numpy_values(self):
    """The class variable type is chosen from the values in Y.

    For each ``(aran_min, aran_max, vartype)`` case a single-column Y
    holding ``arange(aran_min, aran_max)`` is passed to
    ``Domain.from_numpy`` and the resulting class variable must be an
    instance of ``vartype``. For the discrete case the generated value
    names are checked as well.
    """
    for aran_min, aran_max, vartype in [(1, 3, ContinuousVariable),
                                        (0, 2, DiscreteVariable),
                                        (18, 23, ContinuousVariable)]:
        n_rows, n_cols = aran_max - aran_min, 1
        d = Domain.from_numpy(
            np.zeros((1, 1)),
            np.arange(aran_min, aran_max).reshape(n_rows, n_cols))
        self.assertTrue(d.anonymous)
        self.assertIsInstance(d.class_var, vartype)
        # ``vartype`` is a class, so it has to be compared with
        # ``issubclass``; ``isinstance(vartype, DiscreteVariable)`` is
        # always False and would silently skip this assertion.
        if issubclass(vartype, DiscreteVariable):
            self.assertEqual(d.class_var.values,
                             ["v{}".format(i) for i in range(1, 3)])
def join_domains(domain1, domain2):
    """Join two domains, keeping each variable only once.

    Variables are taken first from ``domain1`` and then from ``domain2``;
    a variable that was already seen is dropped. The last variable of
    ``domain2`` is always kept when ``domain2`` has a class variable, and
    that class variable becomes the class of the joined domain. Metas of
    both domains are added to the result.

    Returns:
        tuple: ``(joined_domain, used_mask1, used_mask2)`` where the masks
        are lists of booleans, one per variable of the respective input
        domain, telling whether that variable was kept.
    """
    # Candidates come from BOTH domains, in order.
    variables = domain1.variables + domain2.variables
    used_set = set()

    def used(candidates):
        """Return a keep-mask for ``candidates`` and record them as seen."""
        mask = []
        for var in candidates:
            mask.append(var not in used_set)
            used_set.add(var)
        return mask

    used_mask1 = used(domain1.variables)
    used_mask2 = used(domain2.variables)
    if domain2.classVar:
        # The class variable of domain2 must survive even if it was seen.
        used_mask2[-1] = True

    variables = [
        var
        for var, keep in zip(variables, used_mask1 + used_mask2)
        if keep
    ]

    joined_domain = Domain(variables, domain2.classVar)
    joined_domain.add_metas(domain1.get_metas())
    joined_domain.add_metas(domain2.get_metas())
    return joined_domain, used_mask1, used_mask2
def test_anova(self):
    """ANOVA should give the highest score to feature 1."""
    nrows, ncols = 500, 5
    X = np.random.rand(nrows, ncols)
    y = 4 + (-3*X[:, 1] + X[:, 3]) // 2

    attributes = Domain.from_numpy(X, y).attributes
    target = DiscreteVariable('c', values=np.unique(y))
    data = Table(Domain(attributes, target), X, y)

    scorer = score.ANOVA()
    scores = [scorer(data, column) for column in range(ncols)]
    self.assertTrue(np.argmax(scores) == 1)
def test_chi2(self):
    """Chi2 should give the highest score to feature 1."""
    nrows, ncols = 500, 5
    X = np.random.randint(4, size=(nrows, ncols))
    y = 10 + (-3*X[:, 1] + X[:, 3]) // 2

    attributes = Domain.from_numpy(X, y).attributes
    target = DiscreteVariable('c', values=np.unique(y))
    table = Table(Domain(attributes, target), X, y)
    data = preprocess.Discretize()(table)

    scorer = score.Chi2()
    scores = [scorer(data, column) for column in range(ncols)]
    self.assertTrue(np.argmax(scores) == 1)
def test_improved_randomized_pca_properly_called(self):
    """``randomized_pca`` is used only with the randomized solver.

    ``pca.randomized_pca`` is temporarily wrapped in a ``MagicMock`` so
    calls can be counted. The original function is restored afterwards,
    even when an assertion fails, so the mock does not leak into other
    tests.
    """
    # It doesn't matter what we put into the matrix
    x_ = np.random.normal(0, 1, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_)

    original = pca.randomized_pca
    pca.randomized_pca = MagicMock(wraps=original)
    try:
        PCA(10, svd_solver="randomized", random_state=42)(x)
        pca.randomized_pca.assert_called_once()

        pca.randomized_pca.reset_mock()
        PCA(10, svd_solver="arpack", random_state=42)(x)
        pca.randomized_pca.assert_not_called()
    finally:
        pca.randomized_pca = original
def test_from_numpy_names(self):
    """Check the variable names generated by ``Domain.from_numpy``."""

    def names_of(variables):
        return [var.name for var in variables]

    # Number of columns -> expected name template.
    templates = {5: "Feature {}", 99: "Feature {:02}", 100: "Feature {:03}"}
    for n_cols, name in templates.items():
        d = Domain.from_numpy(np.zeros((1, n_cols)))
        self.assertTrue(d.anonymous)
        expected = [name.format(i) for i in range(1, n_cols + 1)]
        self.assertEqual(names_of(d.attributes), expected)

    # A single column gets no index at all.
    d = Domain.from_numpy(np.zeros((1, 1)))
    self.assertTrue(d.anonymous)
    self.assertEqual(d.attributes[0].name, "Feature")

    X, Y, metas = np.zeros((1, 3)), np.zeros((1, 1)), np.zeros((1, 100))
    d = Domain.from_numpy(X, Y, metas)
    self.assertTrue(d.anonymous)
    self.assertEqual(names_of(d.attributes),
                     ["Feature {}".format(i) for i in range(1, 4)])
    self.assertEqual(d.class_var.name, "Target")
    self.assertEqual(names_of(d.metas),
                     ["Meta {:03}".format(i) for i in range(1, 101)])
def test_conversion(self):
    """``Domain.convert`` splits a list into x, y and metas."""
    domain = Domain([age, income], [race], [gender, education, ssn])

    def matches(actual, expected):
        # Two real NaNs count as equal; everything else uses ``==``.
        both_real = isinstance(actual, Real) and isinstance(expected, Real)
        if both_real and np.isnan(actual) and np.isnan(expected):
            return True
        return actual == expected

    def check_x_and_y(x, y):
        assert_array_equal(x, np.array([42, 13]))
        assert_array_equal(y, np.array([0]))

    # Missing metas are filled with each variable's ``Unknown``.
    x, y, metas = domain.convert([42, 13, "White"])
    check_x_and_y(x, y)
    metas_exp = [gender.Unknown, education.Unknown, ssn.Unknown]
    self.assertTrue(
        all(matches(got, want) for got, want in zip(metas, metas_exp)))

    x, y, metas = domain.convert([42, 13, "White", "M", "HS", "1234567"])
    check_x_and_y(x, y)
    assert_array_equal(metas, np.array([0, 1, "1234567"], dtype=object))
def test_rrelieff(self):
    """RReliefF ranks the relevant features highest."""
    X = np.random.random((100, 5))
    y = ((X[:, 0] > .5) ^ (X[:, 1] < .5) - 1).astype(float)
    xor = Table.from_numpy(Domain.from_numpy(X, y), X, y)
    scorer = score.RReliefF()

    def top_names(table, weights, count):
        return {table.domain[attr].name
                for attr in weights.argsort()[-count:]}

    # The first two attributes define the target.
    best = top_names(xor, scorer(xor, None), 2)
    first_two = set(a.name for a in xor.domain.attributes[:2])
    self.assertSetEqual(first_two, best)

    best = top_names(self.housing, scorer(self.housing, None), 6)
    for feature in ('LSTAT', 'RM', 'AGE'):
        self.assertIn(feature, best)
def test_different_domains_with_same_attributes_are_equal(self):
    """Domains compare equal when their variables are equal."""
    domain1 = Domain([])
    domain2 = Domain([])
    self.assertEqual(domain1, domain2)

    # Changing one part on a single domain breaks equality; making the
    # same change on the other domain restores it.
    for part in ("attributes", "class_vars", "_metas"):
        setattr(domain1, part, (ContinuousVariable('var1'),))
        self.assertNotEqual(domain1, domain2)
        setattr(domain2, part, (ContinuousVariable('var1'),))
        self.assertEqual(domain1, domain2)
def test_clusters_ordered_by_size(self):
    """Cluster names should be sorted based on the number of instances."""
    # Four distinct points, repeated a different number of times each.
    groups = [([0, 0], 20), ([1, 0], 15), ([0, 1], 10), ([1, 1], 5)]
    data = np.vstack([np.array([point] * count) for point, count in groups])
    # Remove any order dependence in the data
    np.random.shuffle(data)

    table = Table.from_numpy(domain=Domain.from_numpy(X=data), X=data)
    self.send_signal(self.widget.Inputs.data, table)
    self.widget.k_neighbors = 4
    self.commit_and_wait()

    output = self.get_output(self.widget.Outputs.annotated_data)
    clustering = output.get_column_view('Cluster')[0].astype(int)
    counts = np.bincount(clustering)
    np.testing.assert_equal(counts, sorted(counts, reverse=True))
def test_mismatching_targets(self):
    """The wrong-targets warning follows the connected data and models."""
    warning = self.widget.Warning
    maj_iris = ConstantLearner()(self.iris)
    dom = self.iris.domain
    iris3 = self.iris.transform(Domain(dom[:3], dom[3]))
    maj_iris3 = ConstantLearner()(iris3)
    inputs = self.widget.Inputs

    def warning_shown():
        return warning.wrong_targets.is_shown()

    # Two predictors with different targets but no data: no warning.
    self.send_signal(inputs.predictors, maj_iris, 1)
    self.send_signal(inputs.predictors, maj_iris3, 2)
    self.assertFalse(warning_shown())

    self.send_signal(inputs.data, self.iris)
    self.assertTrue(warning_shown())

    # Disconnecting the second predictor clears the warning ...
    self.send_signal(inputs.predictors, None, 2)
    self.assertFalse(warning_shown())

    # ... and reconnecting it brings the warning back.
    self.send_signal(inputs.predictors, maj_iris3, 2)
    self.assertTrue(warning_shown())

    self.send_signal(inputs.data, None)
    self.assertFalse(warning_shown())
def setUp(self):
    """Create the domain, the handler arguments and the handler."""
    attributes = [
        ContinuousVariable('c1'),
        DiscreteVariable('d1', values='abc'),
        DiscreteVariable('d2', values='def'),
    ]
    class_vars = [DiscreteVariable('d3', values='ghi')]
    metas = [
        ContinuousVariable('c2'),
        DiscreteVariable('d4', values='jkl'),
    ]
    self.domain = Domain(attributes=attributes, class_vars=class_vars,
                         metas=metas)

    attribute_types = {
        'c1': Continuous,
        'd1': Discrete,
        'd2': Discrete,
        'd3': Discrete,
    }
    meta_types = {
        'c2': Continuous,
        'd4': Discrete,
    }
    self.args = (self.domain, attribute_types, meta_types)

    self.handler = SelectAttributesDomainContextHandler(first_match=False)
    # Replace default reading with a no-op for the tests.
    self.handler.read_defaults = lambda: None
def test_set_data_no_class(self):
    """Widget is properly set up when there is no class"""
    widget = self.widget
    var_model = widget.controls.var.model()
    cvar_model = widget.controls.cvar.model()
    iris = self.iris

    # Move the class variable among the attributes.
    domain = Domain(iris.domain.attributes + iris.domain.class_vars)
    data = iris.transform(domain)
    self.send_signal(widget.Inputs.data, data)

    listed_names = {var.name for var in var_model}
    domain_names = {var.name for var in domain.attributes}
    self.assertEqual(listed_names, domain_names)
    self.assertEqual(
        list(cvar_model),
        [None, DomainModel.Separator, iris.domain.class_var])

    self.assertIs(widget.var, domain[0])
    self.assertIs(widget.cvar, None)
    np.testing.assert_equal(widget.valid_data, self.iris.X[:, 0])
    self.assertIsNone(widget.valid_group_data)

    outputs = widget.Outputs
    self.assertIsNotNone(self.get_output(outputs.histogram_data))
    self.assertIsNotNone(self.get_output(outputs.annotated_data))
    self.assertIsNone(self.get_output(outputs.selected_data))
def create_coef_table(classifier):
    """Return a table named "coefficients" built from ``classifier``.

    The table has one column per class value taken from
    ``classifier.used_vals[0]`` (only the second one when there are at
    most two rows of coefficients). The first row holds the intercept and
    the following rows the coefficients of the domain attributes; the
    row names are stored in a string meta called "name".
    """
    intercept = classifier.intercept
    coefficients = classifier.coefficients
    class_values = classifier.domain.class_var.values
    used_values = classifier.used_vals[0]

    if coefficients.shape[0] > 2:
        values = [class_values[int(index)] for index in used_values]
    else:
        values = [class_values[int(used_values[1])]]

    columns = [ContinuousVariable(value, number_of_decimals=7)
               for value in values]
    domain = Domain(columns, metas=[StringVariable("name")])

    intercept_row = intercept.reshape(1, len(intercept))
    coefs = np.vstack((intercept_row, coefficients.T))

    attribute_names = [[attr.name] for attr in classifier.domain.attributes]
    names = np.array([["intercept"]] + attribute_names, dtype=object)

    coef_table = Table.from_numpy(domain, X=coefs, metas=names)
    coef_table.name = "coefficients"
    return coef_table
def test_vizrank_class_nan(self):
    """
    When class values are nan, vizrank should be disabled. It should behave
    like the class column is missing.
    GH-2757
    """
    def check_vizrank(data, expected_enabled):
        self.send_signal(self.widget.Inputs.data, data)
        self.assertEqual(expected_enabled,
                         self.widget.vizrank_button.isEnabled())

    data1 = Table("iris")[::30]

    # All-nan class over a class variable without values.
    data2 = Table("iris")[::30]
    data2.Y[:] = np.nan
    domain = Domain(attributes=data2.domain.attributes[:4],
                    class_vars=DiscreteVariable("iris", values=[]))
    data2 = Table(domain, data2.X, Y=data2.Y)

    # All-nan class over the original class variable.
    data3 = Table("iris")[::30]
    data3.Y[:] = np.nan

    scenarios = [
        (data1, True),
        (data2, False),
        (data1, True),
        (data3, False),
        (data1, True),
    ]
    for data, is_enabled in scenarios:
        check_vizrank(data, is_enabled)
def test_random(self):
    """kNN accuracy on random targets stays between 0.1 and 0.3."""
    nrows, ncols = 1000, 5
    x = np.random.randint(-20, 51, (nrows, ncols))
    y = np.random.randint(-2, 3, (nrows, 1))
    x1, x2 = np.split(x, 2)
    y1, y2 = np.split(y, 2)

    attr = tuple(
        ContinuousVariable("Feature {}".format(index))
        for index in range(1, ncols + 1)
    )
    class_vars = (DiscreteVariable("Target 1"),)
    train = Table(Domain(attr, class_vars), x1, y1)

    clf = KNNLearner()(train)
    predicted = clf(x2)
    correct = predicted == y2.flatten()
    ca = sum(correct) / len(correct)

    self.assertGreater(ca, 0.1)
    self.assertLess(ca, 0.3)
def _score_heuristic(self):
    """Return continuous variables ordered by their Relief weight.

    The candidates are the continuous variables and metas of the master's
    data, except the color attribute, which is used as the class. The
    data is centred and scaled by each column's span before scoring.
    Variables are sorted by decreasing weight, ties broken by name.
    """
    def normalized(a):
        lowest, highest = np.min(a, axis=0), np.max(a, axis=0)
        span = highest - lowest
        span[span == 0] = 1  # constant columns: avoid dividing by zero
        return (a - np.mean(a, axis=0)) / span

    master = self.master
    source_domain = master.data.domain
    attr_color = master.attr_color

    candidates = [
        var
        for var in chain(source_domain.variables, source_domain.metas)
        if var.is_continuous and var is not attr_color
    ]
    domain = Domain(attributes=candidates, class_vars=attr_color)

    data = master.data.transform(domain).copy()
    with data.unlocked():
        data.X = normalized(data.X)

    relief = ReliefF if attr_color.is_discrete else RReliefF
    weights = relief(n_iterations=100, k_nearest=self.minK)(data)

    ranked = sorted(zip(weights, domain.attributes),
                    key=lambda pair: (-pair[0], pair[1].name))
    return [attribute for _, attribute in ranked]
def dataCombine(corpus, liwcResultList, featureNames, markedTexts):
    """Combine ``corpus`` with the LIWC results into a new table.

    Each output row consists of a 1-based message id, the values of the
    corpus row and one value per LIWC column, in ``sortKeys`` order. A
    LIWC value is stored as an int when the column name does not start
    with digits followed by whitespace and is not ``NUMBERCOUNT``, or
    when the row has zero matches. Otherwise it is stored as the value
    divided by the row's number of matches. The metas hold the file
    name, the counselor id and the marked text of the row.

    Returns:
        Table: the combined table.
    """
    liwcResultTable, columnNames = list2table(liwcResultList, featureNames)
    fieldIdFile = getFieldId(corpus, FIELDNAMEFILE)
    fieldIdCounselor = getFieldId(corpus, FIELDNAMECOUNSELOR)

    # The column order is the same for the domain and every row, so it
    # is computed once instead of once per row.
    sortedColumnNames = list(sortKeys(columnNames))
    # Raw string: "\d" and "\s" are invalid escapes in a normal literal.
    numberPrefix = re.compile(r"^\d+\s")

    domain = [ContinuousVariable(name=FIELDNAMEMSGID)]
    domain += list(corpus.domain.variables)
    domain += [
        ContinuousVariable(name=columnName, number_of_decimals=NBROFDECIMALS)
        for columnName in sortedColumnNames
    ]
    metas = [
        StringVariable(name=FIELDNAMEFILE),
        StringVariable(name=FIELDNAMECOUNSELOR),
        StringVariable(name=FIELDNAMEMARKEDTEXT),
    ]

    dataOut = []
    metasOut = []
    for i in range(len(corpus)):
        fileName = corpus.metas[i][fieldIdFile]
        counselorId = corpus.metas[i][fieldIdCounselor]
        metasOut.append([fileName, counselorId, markedTexts[i]])

        liwcRow = liwcResultTable[i]
        row = [i + 1] + list(corpus[i].values())
        for columnName in sortedColumnNames:
            isPlainCount = (not numberPrefix.match(columnName)
                            and columnName != NUMBERCOUNT)
            if isPlainCount or int(liwcRow[NBROFMATCHES]) == 0:
                row.append(int(liwcRow[columnName]))
            else:
                row.append(float(liwcRow[columnName])
                           / float(liwcRow[NBROFMATCHES]))
        dataOut.append(row)

    table = Table.from_numpy(Domain(domain, metas=metas),
                             np.array(dataOut),
                             metas=np.array(metasOut))
    return table
def send_coefficients(self):
    """
    Send the model's coefficients to the widget's output, or None when
    there is no logistic regression model with an ``skl_model``.
    """
    has_coefficients = (
        self.model is not None
        and isinstance(self.learner, LogisticRegressionLearner)
        and hasattr(self.model, 'skl_model')
    )
    if not has_coefficients:
        self.Outputs.coefficients.send(None)
        return

    model = self.model.skl_model
    domain = Domain([ContinuousVariable("coef", number_of_decimals=7)],
                    metas=[StringVariable("name")])
    coefficients = model.intercept_.tolist() + model.coef_[0].tolist()

    # Attribute names are read from the data after preprocessing.
    data = self.model.instances
    for preprocessor in self.learner.preprocessors:
        data = preprocessor(data)
    names = ["Intercept"] + [attr.name for attr in data.domain.attributes]

    rows = list(zip(coefficients, names))
    self.Outputs.coefficients.send(Table(domain, rows))
def compute_distances(self):
    """Store each data row's smallest distance to the reference rows.

    ``self.distances`` is set to None when either table is missing or
    empty, or when the two tables do not share the same set of
    attributes (in which case the ``diff_domains`` error is also shown).
    """
    self.Error.diff_domains.clear()
    if not (self.data and self.reference):
        self.distances = None
        return

    reference_attrs = set(self.reference.domain.attributes)
    data_attrs = set(self.data.domain.attributes)
    if reference_attrs != data_attrs:
        self.Error.diff_domains()
        self.distances = None
        return

    metric = METRICS[self.distance_index][1]
    n_ref = len(self.reference)

    # comparing only attributes, no metas and class-vars
    new_domain = Domain(self.data.domain.attributes)
    reference = self.reference.transform(new_domain)
    data = self.data.transform(new_domain)

    # Preprocess both tables together, then split them again.
    all_data = Table.concatenate([reference, data], 0)
    pp_all_data = Impute()(RemoveNaNColumns()(all_data))
    pp_reference = pp_all_data[:n_ref]
    pp_data = pp_all_data[n_ref:]

    self.distances = metric(pp_data, pp_reference).min(axis=1)
def test_annotation_bool(self):
    """Check if bool labels remain bool"""
    variable = ContinuousVariable("a")
    variable.attributes["hidden"] = True
    table = Table.from_domain(Domain([variable]))
    self.send_signal(self.widget.Inputs.data, table)
    assert isinstance(self.widget, OWEditDomain)

    # select first variable
    domain_view = self.widget.domain_view
    domain_view.setCurrentIndex(domain_view.model().index(0))

    # change first attribute value
    editor = self.widget.findChild(ContinuousVariableEditor)
    assert isinstance(editor, ContinuousVariableEditor)
    label_index = editor.labels_model.index(0, 1)
    editor.labels_model.setData(label_index, "False", Qt.EditRole)

    self.widget.commit()
    output = self.get_output(self.widget.Outputs.data)
    self.assertFalse(output.domain["a"].attributes["hidden"])
def test_NaiveBayes(self):
    """Naive Bayes on the discretized SQL iris table is accurate."""
    iris_class = DiscreteVariable(
        "iris",
        values=['Iris-setosa', 'Iris-virginica', 'Iris-versicolor'])
    connection = dict(host='localhost', database='test')
    table = SqlTable(connection, 'iris', type_hints=Domain([], iris_class))
    table = DiscretizeTable(table)

    clf = nb.NaiveBayesLearner()(table)

    # Single instance prediction
    first = table[0]
    self.assertEqual(clf(first), first.get_class())

    # Table prediction
    pred = clf(table)
    actual = array([ins.get_class() for ins in table])
    hits = pred == actual
    ca = hits.sum() / len(hits)
    self.assertGreater(ca, 0.95)
    self.assertLess(ca, 1.)
def test_fix_values(self, msgbox):
    """``fix_expressions`` rewrites ``.value`` in descriptor expressions."""
    w = self.widget

    msgbox.ApplyRole, msgbox.RejectRole = object(), object()
    dialog = Mock()
    msgbox.return_value = dialog
    dlgexec = Mock()
    dialog.exec = dlgexec

    v = [DiscreteVariable(name, values=tuple("abc"))
         for name in ("ana", "berta", "cilka")]
    domain = Domain(v, [])
    self.send_signal(w.Inputs.data, Table.from_numpy(domain, [[0, 1, 2]]))

    def current_expression():
        return w.descriptors[0].expression

    w.descriptors = [StringDescriptor(
        "y", "ana.value + berta.value + cilka.value")]

    # Reject fixing - no changes
    dlgexec.return_value = msgbox.RejectRole
    w.fix_expressions()
    self.assertEqual(current_expression(),
                     "ana.value + berta.value + cilka.value")

    dlgexec.return_value = Mock(return_value=msgbox.AcceptRole)
    w.fix_expressions()
    self.assertEqual(current_expression(), "ana + berta + cilka")

    w.descriptors = [StringDescriptor(
        "y", "ana.value + dani.value + cilka.value")]
    with patch.object(w, "apply"):  # dani doesn't exist and will fail
        w.fix_expressions()
    self.assertEqual(current_expression(), "ana + dani.value + cilka")

    w.descriptors = [ContinuousDescriptor("y", "sqrt(berta)", 1)]
    w.fix_expressions()
    self.assertEqual(current_expression(),
                     "sqrt({'a': 0, 'b': 1, 'c': 2}[berta])")
def __call__(self, data):
    """
    Removes unused features or classes from the given data. Returns a new
    data table.

    Parameters
    ----------
    data : Orange.data.Table
        A data table to remove features or classes from.

    Returns
    -------
    data : Orange.data.Table
        New data table, or None when ``data`` is None.
    """
    if data is None:
        return None

    def purge_all(variables, flags):
        return [purge_var_M(var, data, flags) for var in variables]

    domain = data.domain
    # All three groups are purged before any of the results is stored.
    attrs_state = purge_all(domain.attributes, self.attr_flags)
    class_state = purge_all(domain.class_vars, self.class_flags)
    metas_state = purge_all(domain.metas, self.meta_flags)

    att_vars, self.attr_results = self.get_vars_and_results(attrs_state)
    cls_vars, self.class_results = self.get_vars_and_results(class_state)
    meta_vars, self.meta_results = self.get_vars_and_results(metas_state)

    return data.transform(Domain(att_vars, cls_vars, meta_vars))
def recompute_heatmap(self, points):
    """Predict at ``points`` and expose the result for the heatmap.

    Without a model or data an empty prediction object is exposed. If
    the model raises, the error is shown on the owning widget and
    nothing is drawn. Otherwise the predictions are scaled (continuous
    class) or mapped to colors (discrete class), exposed together with
    the legend information, and the heatmap is redrawn.
    """
    if self.model is None or self.data is None:
        self.exposeObject('model_predictions', {})
        self.evalJS('draw_heatmap()')
        return

    latlons = np.array(points)
    table = Table(Domain([self.lat_attr, self.lon_attr]), latlons)
    model_error = self._owwidget.Error.model_error
    try:
        predictions = self.model(table)
    except Exception as e:
        # Any failure of the model is shown on the widget.
        model_error(e)
        return
    model_error.clear()

    class_var = self.model.domain.class_var
    if class_var.is_continuous:
        # Rounding avoids small errors
        predictions = scale(np.round(predictions, 7))
        extrema = self._legend_values(
            class_var, [np.nanmin(predictions), np.nanmax(predictions)])
        kwargs = dict(extrema=extrema)
    else:
        n_values = len(class_var.values)
        colorgen = ColorPaletteGenerator(n_values, class_var.colors)
        predictions = colorgen.getRGB(predictions)
        kwargs = dict(
            legend_labels=self._legend_values(class_var, range(n_values)),
            full_labels=list(class_var.values),
            colors=[color_to_hex(colorgen.getRGB(i))
                    for i in range(n_values)],
        )

    self.exposeObject('model_predictions', dict(data=predictions, **kwargs))
    self.evalJS('draw_heatmap()')
def compute(self):
    """Run the undulator power density calculation and send the results.

    The calculation writes a spec file whose name is sent on
    "xoppy_specfile". The file is then loaded with ``np.loadtxt``; its
    column labels are read from the first line containing "#L" and the
    data is sent as a table on "xoppy_table".

    Raises:
        ValueError: if the spec file contains no "#L" label line.
    """
    fileName = xoppy_calc_und_power_density(
        ELECTRONENERGY=self.ELECTRONENERGY,
        ELECTRONENERGYSPREAD=self.ELECTRONENERGYSPREAD,
        ELECTRONCURRENT=self.ELECTRONCURRENT,
        ELECTRONBEAMSIZEH=self.ELECTRONBEAMSIZEH,
        ELECTRONBEAMSIZEV=self.ELECTRONBEAMSIZEV,
        ELECTRONBEAMDIVERGENCEH=self.ELECTRONBEAMDIVERGENCEH,
        ELECTRONBEAMDIVERGENCEV=self.ELECTRONBEAMDIVERGENCEV,
        PERIODID=self.PERIODID,
        NPERIODS=self.NPERIODS,
        KV=self.KV,
        DISTANCE=self.DISTANCE,
        GAPH=self.GAPH,
        GAPV=self.GAPV,
        HSLITPOINTS=self.HSLITPOINTS,
        VSLITPOINTS=self.VSLITPOINTS,
        METHOD=self.METHOD)

    # send specfile
    self.send("xoppy_specfile", fileName)

    print("Loading file: ", fileName)
    # load spec file with one scan, # is comment
    out = np.loadtxt(fileName)
    print("data shape: ", out.shape)

    # get labels; the context manager closes the file again
    with open(fileName) as specFile:
        txt = specFile.readlines()
    labelLines = [line for line in txt if line.find("#L") != -1]
    if not labelLines:
        raise ValueError(
            "no '#L' label line found in {}".format(fileName))
    # Use one line, not the index array from ``np.where``, which cannot
    # be used to index a Python list.
    labels = labelLines[0].replace("#L ", "").split(" ")
    print("data labels: ", labels)

    #
    # build and send orange table
    #
    domain = Domain([ContinuousVariable(i) for i in labels])
    table = Table.from_numpy(domain, out)
    self.send("xoppy_table", table)
def finance_data(symbol, since=None, until=None, granularity='d'):
    """Fetch Yahoo Finance data for `symbol` between `since` and `until`.

    The downloaded frame is converted to a Timeseries in which 'Adj Close'
    is the class variable and 'Date' is the time variable.

    Parameters
    ----------
    symbol: str
        A stock or index symbol, as supported by Yahoo Finance.
    since: date
        A start date (default: 1900-01-01).
    until: date
        An end date (default: today).
    granularity: 'd' or 'w' or 'm' or 'v'
        What data to get: daily, weekly, monthly, or dividends.
        NOTE(review): the function body never reads this argument.

    Returns
    -------
    data : Timeseries
    """
    start = date(1900, 1, 1) if since is None else since
    end = date.today() if until is None else until

    frame = web.DataReader(symbol, 'yahoo', start, end)
    data = Timeseries.from_data_table(table_from_frame(frame))

    # Make Adjusted Close a class variable
    feature_names = [var.name for var in data.domain.attributes]
    feature_names.remove('Adj Close')
    domain = Domain(feature_names, [data.domain['Adj Close']], None,
                    source=data.domain)
    data = Timeseries.from_table(domain, data)

    data.name = symbol
    data.time_variable = data.domain['Date']
    return data
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): Pairs whose first item is the name of a
            field to include as a meta variable; the second item is not
            used here and the whole list is forwarded to
            ``_records_to_corpus_entries``.

    Returns:
        corpus: The output Corpus, with a discrete 'section' class variable
        and one meta variable per requested field.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            # The date field is the only one stored as a time variable.
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    # Sort the distinct non-empty values: iterating a set of strings has no
    # stable order across interpreter runs, which made the order of the
    # class values (and hence the numeric codes in Y) vary between runs.
    section_values = sorted(map(str, set(filter(None, class_values))))
    class_vars = [DiscreteVariable('section', values=section_values)]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    # One class value per record, as a single column.
    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
def commit(self):
    """Send the selected data, annotated data and components outputs.

    All three outputs are ``None`` when ``self.plotdata.data`` is ``None``.
    Otherwise the graph selection is expanded over the valid rows, the
    selected rows and the annotated table are built from it, and a
    components table is built from ``self.plotdata.points``.
    """
    selected = annotated = components = None
    graph = self.graph
    if self.plotdata.data is not None:
        name = self.data.name
        data = self.plotdata.data

        # Expand the selection (defined on valid rows only) to all rows;
        # with no selection every valid row is marked as unselected.
        mask = self.plotdata.valid_mask.astype(int)
        mask[mask == 1] = graph.selection if graph.selection is not None \
            else 0
        selection = np.flatnonzero(mask)

        if len(selection):
            selected = data[selection]
            selected.name = name + ": selected"
            selected.attributes = self.data.attributes

        # Selection values above 1 switch to the groups table.
        if graph.selection is not None and np.max(graph.selection) > 1:
            annotated = create_groups_table(data, mask)
        else:
            annotated = create_annotated_table(data, selection)
        annotated.attributes = self.data.attributes
        annotated.name = name + ": annotated"

        comp_domain = Domain(
            self.plotdata.points[:, 2],
            metas=[StringVariable(name='component')])
        metas = np.array([["RX"], ["RY"], ["angle"]])
        angle = np.arctan2(
            np.array(self.plotdata.points[:, 1].T, dtype=float),
            np.array(self.plotdata.points[:, 0].T, dtype=float))
        # np.vstack is the function np.row_stack was an alias of; the alias
        # is deprecated in NumPy 2.0.
        components = Table.from_numpy(
            comp_domain,
            X=np.vstack((self.plotdata.points[:, :2].T, angle)),
            metas=metas)
        components.name = name + ": components"

    self.Outputs.selected_data.send(selected)
    self.Outputs.annotated_data.send(annotated)
    self.Outputs.components.send(components)
def set_actual_data():
    """Choose ``self.data`` for the current network and reopen the context.

    Uses ``self`` and ``network`` from the enclosing scope. Explicit node
    data is used when its length matches the number of nodes; otherwise the
    network's own node table, or a single "label" meta column built from a
    one-column node array, is used.
    """
    self.closeContext()
    self.Error.data_size_mismatch.clear()
    self.Warning.no_graph_found.clear()
    self._invalid_data = False

    if network is None:
        # Node data without a graph is worth a warning.
        if self.node_data is not None:
            self.Warning.no_graph_found()
        return

    n_nodes = len(self.network.nodes)
    if self.node_data is not None:
        if len(self.node_data) == n_nodes:
            self.data = self.node_data
        else:
            self.Error.data_size_mismatch()
            self._invalid_data = True
            self.data = None
    else:
        nodes = network.nodes
        if isinstance(nodes, Table):
            self.data = nodes
        elif isinstance(nodes, np.ndarray) \
                and (nodes.ndim == 1 or nodes.shape[1] == 1):
            # Plain node labels: a table with no features and one meta.
            label_domain = Domain([], None, [StringVariable("label")])
            self.data = Table.from_numpy(
                label_domain, np.zeros((len(nodes), 0)), None,
                metas=nodes.reshape((n_nodes, 1)))
        else:
            self.data = None

    if self.data is None:
        return

    # Replicate the necessary parts of set_data
    self.valid_data = np.full(len(self.data), True, dtype=bool)
    self.init_attr_values()
    self.openContext(self.data)
    self.cb_class_density.setEnabled(self.can_draw_density())
def test_callbacks_called_on_value(self):
    """Setting the graph or activating a value combo updates and sends."""
    widget = self.widget
    send = Mock()
    widget.Outputs.network.send = send
    update = Mock(side_effect=widget.update_output)
    widget.update_output = update

    def assert_called_and_reset():
        # Both the update callback and the output must have fired.
        for mock in (update, send):
            mock.assert_called()
            mock.reset_mock()

    self._set_graph(Table(Domain([self.c])))
    assert_called_and_reset()

    for setting in ("connect_value", "connector_value"):
        setattr(widget, setting, 1)
        getattr(widget.controls, setting).activated[int].emit(1)
        assert_called_and_reset()
def test_value_combo_updates(self):
    """Changing the variable repopulates the value combo and updates output."""
    widget = self.widget
    update = Mock()
    widget.update_output = update
    value_combo = widget.controls.connect_value
    a, c = self.a, self.c

    def assert_updated_and_reset():
        update.assert_called()
        update.reset_mock()

    self._set_graph(Table(Domain([a, c])))
    self.assertEqual(len(value_combo), 2)
    assert_updated_and_reset()

    widget.variable = c
    widget.controls.variable.activated[int].emit(1)
    self.assertEqual(len(value_combo), 4)
    assert_updated_and_reset()

    # Switching back resets the chosen value (asserted below to be 0).
    widget.connect_value = 3
    widget.variable = a
    widget.controls.variable.activated[int].emit(0)
    self.assertEqual(len(value_combo), 2)
    self.assertEqual(widget.connect_value, 0)
    assert_updated_and_reset()
def test_file_not_found(self):
    """Reloading a deleted file shows the error and sends no data."""
    # Create a dummy file
    file_name = "test_owfile_data.tab"
    domainA = Domain([DiscreteVariable("d1", values=("a", "b"))],
                     DiscreteVariable("c1", values=("aaa", "bbb")))
    dataA = Table(domainA, np.array([[0], [1], [0], [np.nan]]),
                  np.array([0, 1, 0, 1]))
    dataA.save(file_name)

    try:
        # Open the file with the widget
        self.open_dataset(file_name)
        self.assertEqual(self.get_output(self.widget.Outputs.data).domain,
                         dataA.domain)
    finally:
        # Delete the file; doing it here also cleans up the working
        # directory when opening or the assertion above fails.
        remove(file_name)

    # Try to reload the deleted file
    self.widget.load_data()
    self.assertEqual(file_name, path.basename(self.widget.last_path()))
    self.assertTrue(self.widget.Error.file_not_found.is_shown())
    self.assertIsNone(self.get_output(self.widget.Outputs.data))
    self.assertEqual(self.widget.infolabel.text(), "No data.")

    # Open a sample dataset
    self.open_dataset("iris")
    self.assertFalse(self.widget.Error.file_not_found.is_shown())
def test_no_values_target(self):
    """Predict on data whose target has no values and pass results on.

    The evaluation results are also sent to the confusion matrix, ROC
    analysis, lift curve and calibration plot widgets.
    """
    train = Table("titanic")
    model = ConstantLearner()(train)
    self.send_signal(self.widget.Inputs.predictors, model)

    attributes = [
        DiscreteVariable("status", values=["first", "third"]),
        DiscreteVariable("age", values=["adult", "child"]),
        DiscreteVariable("sex", values=["female", "male"]),
    ]
    class_vars = [DiscreteVariable("survived", values=[])]
    domain = Domain(attributes, class_vars)
    features = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]])
    targets = np.full((3, 1), np.nan)
    test = Table(domain, features, targets)
    self.send_signal(self.widget.Inputs.data, test)

    pred = self.get_output(self.widget.Outputs.predictions)
    self.assertEqual(len(pred), len(test))

    results = self.get_output(self.widget.Outputs.evaluation_results)
    consumers = (OWConfusionMatrix, OWROCAnalysis,
                 OWLiftCurve, OWCalibrationPlot)
    for consumer_cls in consumers:
        consumer = self.create_widget(consumer_cls)
        self.send_signal(consumer.Inputs.evaluation_results, results,
                         widget=consumer)
def test_result(self):
    """Check p-values and FDR values for six words of a bag-of-words corpus."""
    preprocess = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
    corpus = preprocess(Corpus.from_file("book-excerpts")[::3])
    bow = BowVectorizer().transform(corpus)

    # Restrict the bag of words to the columns under test.
    words = ["beheld", "events", "dragged", "basin", "visit", "have"]
    bow = bow.transform(Domain([bow.domain[word] for word in words]))

    self.send_signal(self.widget.Inputs.data, bow)
    self.send_signal(self.widget.Inputs.selected_data, bow[:1])
    self.wait_until_finished(timeout=100000)

    expected_p = [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872]
    expected_fdr = [0.12766, 1, 0.12766, 0.12766, 0.12766, 1]
    results = self.widget.results
    np.testing.assert_array_almost_equal(
        results.p_values, expected_p, decimal=5)
    np.testing.assert_array_almost_equal(
        results.fdr_values, expected_fdr, decimal=5)
def test_empty_data(self):
    """No crash on empty data"""
    widget = self.widget
    data = Table("iris")[::3]
    widget.default_method_index = Method.Model
    data_input = widget.Inputs.data
    data_output = widget.Outputs.data

    # The data used here comes back with unchanged X and Y.
    self.send_signal(data_input, data, wait=1000)
    imputed = self.get_output(data_output)
    np.testing.assert_equal(imputed.X, data.X)
    np.testing.assert_equal(imputed.Y, data.Y)

    # A table with the same domain but no rows
    empty = Table.from_domain(data.domain)
    self.send_signal(data_input, empty, wait=1000)
    imputed = self.get_output(data_output)
    self.assertEqual(len(imputed), 0)

    # only meta columns
    metas_only = Domain([], [], data.domain.attributes)
    data = data.transform(metas_only)
    self.send_signal("Data", data, wait=1000)
    imputed = self.get_output("Data")
    self.assertEqual(len(imputed), len(data))
    self.assertEqual(imputed.domain, data.domain)
    np.testing.assert_equal(imputed.metas, data.metas)
def prepare_data():
    """Set ``self.data`` and ``self.cont_x`` from the continuous attributes.

    Uses ``self``, ``data``, ``attrs`` and ``cont_attrs`` from the enclosing
    scope. Sparse input is stored as CSR without further processing. Dense
    input is reduced to rows where every value is finite, then each column
    is shifted so its minimum is zero and divided by its sum.
    """
    if len(cont_attrs) < len(attrs):
        self.Warning.ignoring_disc_variables()
    if len(cont_attrs) == 1:
        self.Warning.single_attribute()

    x = Table.from_table(Domain(cont_attrs), data).X
    if sp.issparse(x):
        self.data = data
        self.cont_x = x.tocsr()
        return

    finite_rows = np.all(np.isfinite(x), axis=1)
    if not np.any(finite_rows):
        self.Error.no_defined_rows()
        return

    if np.all(finite_rows):
        self.data = data
        self.cont_x = x.copy()
    else:
        self.data = data[finite_rows]
        self.cont_x = x[finite_rows]

    self.cont_x -= np.min(self.cont_x, axis=0, keepdims=True)
    column_sums = np.sum(self.cont_x, axis=0, keepdims=True)
    # Avoid dividing by zero for columns that sum to zero.
    column_sums[column_sums == 0] = 1
    self.cont_x /= column_sums
def test_select_data_discrete(self):
    """
    Test select data function
    """
    w = self.widget

    # test with data set for logistic regression - class discrete
    attributes = [ContinuousVariable('a'), ContinuousVariable('b')]
    class_var = DiscreteVariable('c', values=['a', 'b'])
    data = Table(Domain(attributes, class_var), [[1, 2], [1, 2]], [0, 1])

    self.send_signal(w.Inputs.data, data)

    self.assertEqual(len(w.select_data()), len(data))
    self.assertEqual(len(w.select_data().domain.attributes), 2)
    self.assertEqual(len(w.select_data().domain.class_var.values), 2)

    # Class values are the same as in the input data.
    for i in (1, 0):
        self.assertEqual(w.select_data().domain.class_var.values[i],
                         data.domain.class_var.values[i])

    # The two attributes are the ones named by attr_x and attr_y.
    for i, setting in enumerate(("attr_x", "attr_y")):
        self.assertEqual(w.select_data().domain.attributes[i].name,
                         getattr(w, setting))

    self.assertEqual(w.select_data().domain.class_var.values[0],
                     w.target_class)
def test_index(self):
    """Domain.index accepts variables, names and (numpy) integer indices."""
    d = Domain((age, gender, income), metas=(ssn, race))

    # (key, expected index); the asserts show metas get negative indices.
    cases = [
        (age, 0), ("AGE", 0), (0, 0), (np.int_(0), 0),
        (income, 2), ("income", 2), (2, 2), (np.int_(2), 2),
        (ssn, -1), ("SSN", -1), (-1, -1), (np.int_(-1), -1),
        (-2, -2), (np.int_(-2), -2),
    ]
    for key, expected in cases:
        self.assertEqual(d.index(key), expected)
def test_index_error(self):
    """Domain.index raises for unknown keys and unsupported key types."""
    d = Domain((age, gender, income), metas=(ssn, race))

    # Out-of-range indices, a variable not in the domain, an unknown name.
    unknown_keys = (3, np.int_(3), -3, np.int_(-3), incomeA, "no_such_thing")
    for key in unknown_keys:
        with self.assertRaises(ValueError):
            d.index(key)

    # A list is not a valid key at all.
    with self.assertRaises(TypeError):
        d.index([2])