def test_make_proxy_disc(self):
    """Proxies of a discrete variable share master, equality and hash."""
    master = DiscreteVariable("abc", values="abc", ordered=True)
    proxy = master.make_proxy()
    proxy_of_proxy = proxy.make_proxy()

    # Every link of the proxy chain resolves to the same master object.
    for var in (master, proxy, proxy_of_proxy):
        self.assertIs(var.master, master)

    # Master and proxies compare equal pairwise ...
    self.assertEqual(master, proxy)
    self.assertEqual(master, proxy_of_proxy)
    self.assertEqual(proxy, proxy_of_proxy)
    # ... and hash alike.
    self.assertEqual(hash(master), hash(proxy))
    self.assertEqual(hash(proxy), hash(proxy_of_proxy))

    # A separately constructed variable with identical arguments is
    # nevertheless a different variable.
    lookalike = DiscreteVariable("abc", values="abc", ordered=True)
    self.assertNotEqual(master, lookalike)

    # Pickling a single proxy keeps it attached to the live master.
    restored_proxy = pickle.loads(pickle.dumps(proxy))
    self.assertIs(restored_proxy.master, master)
    self.assertEqual(restored_proxy, master)

    # Pickling all three together keeps them mutually consistent.
    restored = pickle.loads(pickle.dumps((master, proxy, proxy_of_proxy)))
    r_master, r_proxy, r_proxy2 = restored
    # NOTE(review): the next assertion compares a value with itself and can
    # never fail; possibly `r_master.master is r_master` was intended.
    self.assertIs(r_master.master, r_master.master)
    self.assertIs(r_proxy.master, r_master.master)
    self.assertIs(r_proxy2.master, r_master.master)
    self.assertEqual(r_master, r_proxy)
    self.assertEqual(r_master, r_proxy2)
    self.assertEqual(r_proxy, r_proxy2)
def test_colors(self):
    """Colors are created on demand, are read-only and follow the values."""
    gender = DiscreteVariable.make("a", values=["F", "M"])
    # Nothing is stored until `colors` is read for the first time.
    self.assertIsNone(gender._colors)
    self.assertEqual(gender.colors.shape, (2, 3))
    self.assertIs(gender._colors, gender.colors)
    self.assertEqual(gender.colors.shape, (2, 3))
    self.assertFalse(gender.colors.flags.writeable)

    # Assigning a whole palette works, the stored array stays read-only.
    gender.colors = np.arange(6).reshape((2, 3))
    np.testing.assert_almost_equal(gender.colors, [[0, 1, 2], [3, 4, 5]])
    self.assertFalse(gender.colors.flags.writeable)
    with self.assertRaises(ValueError):
        gender.colors[0] = [42, 41, 40]
    # Single colors are changed through set_color.
    gender.set_color(0, [42, 41, 40])
    np.testing.assert_almost_equal(
        gender.colors, [[42, 41, 40], [3, 4, 5]])

    # Hex strings in attributes["colors"] are decoded to RGB triples.
    from_hex = DiscreteVariable.make("x", values=["A", "B"])
    from_hex.attributes["colors"] = ['#0a0b0c', '#0d0e0f']
    np.testing.assert_almost_equal(
        from_hex.colors, [[10, 11, 12], [13, 14, 15]])

    # Test ncolors adapts to nvalues
    growing = DiscreteVariable.make('foo', values=['d', 'r'])
    self.assertEqual(len(growing.colors), 2)
    growing.add_value('e')
    self.assertEqual(len(growing.colors), 3)
    user_defined = (0, 0, 0)
    growing.set_color(2, user_defined)
    growing.add_value('k')
    self.assertEqual(len(growing.colors), 4)
    # A color set by the user survives the addition of another value.
    np.testing.assert_array_equal(growing.colors[2], user_defined)
def test_repr(self):
    """repr() spells out name, values and, when set, the ordered flag."""
    gender = DiscreteVariable.make("a", values=["F", "M"])
    expected = "DiscreteVariable(name='a', values=['F', 'M'])"
    self.assertEqual(repr(gender), expected)

    gender.ordered = True
    expected = "DiscreteVariable(name='a', values=['F', 'M'], ordered=True)"
    self.assertEqual(repr(gender), expected)

    # All seven values are listed in full.
    digits = DiscreteVariable.make("a", values="1234567")
    expected = ("DiscreteVariable(name='a', "
                "values=['1', '2', '3', '4', '5', '6', '7'])")
    self.assertEqual(repr(digits), expected)
def test_copy_descriptor_discrete(self):
    """copy_descriptor duplicates a discrete variable, optionally renamed."""
    original = DiscreteVariable("foo", values=list("abc"), ordered=True)
    original.attributes = {"bar": 42, "baz": 13}

    clone = copy_descriptor(original)
    self.assertIsInstance(clone, DiscreteVariable)
    self.assertEqual(clone.name, "foo")
    self.assertEqual(list(clone.values), list("abc"))
    self.assertTrue(clone.ordered)
    # Attributes are equal, but held in a separate dict.
    self.assertEqual(clone.attributes, original.attributes)
    self.assertIsNot(clone.attributes, original.attributes)

    # A second positional argument gives the copy a new name.
    original = DiscreteVariable("foo", values=list("abc"), ordered=False)
    clone = copy_descriptor(original, "cux")
    self.assertEqual(clone.name, "cux")
    self.assertFalse(clone.ordered)
def test_unpickle(self):
    """Unpickling must not alter the values of an already existing variable."""
    first = DiscreteVariable("A", values=["two", "one"])
    pickled = pickle.dumps(first)
    second = DiscreteVariable.make("A", values=["one", "two", "three"])
    second_values = tuple(second.values)
    first_copy = pickle.loads(pickled)
    # See: gh-3238
    # The unpickle reconstruction picks an existing variable (second), on
    # which __setstate__ or __dict__.update is called
    self.assertSequenceEqual(second.values, second_values)
    self.assertSequenceEqual(first_copy.values, first.values)

    pickled = pickle.dumps(second)
    # Start from empty caches before the variable is loaded back.
    DiscreteVariable._clear_all_caches()
    # Constructed only for its effect on the variable registry; the name
    # is not read again.
    first = DiscreteVariable("A", values=["one", "two"])
    second = pickle.loads(pickled)
    self.assertSequenceEqual(second.values, ["two", "one", "three"])
def _create_corpus(self):
    """Build a Corpus from the documents in ``self._text_data``.

    Each document contributes one row of string metas (name, path,
    content).  When the documents span more than one category, the
    category becomes a discrete class variable; otherwise the corpus has
    no class.

    Returns:
        Corpus or None: the corpus, or None when there are no documents.
    """
    names = ["name", "path", "content"]

    # Sort the distinct categories: iteration order of a set of strings
    # changes between interpreter runs, which made the order of the class
    # values (and hence the numeric codes in Y) irreproducible.
    # key=str keeps the ordering total even for a non-string category.
    text_categories = sorted(
        {t.category for t in self._text_data}, key=str)
    category_var = DiscreteVariable.make("category", values=text_categories)

    data = []
    category_data = []
    for textdata in self._text_data:
        data.append([textdata.name, textdata.path, textdata.content])
        category_data.append(category_var.to_val(textdata.category))

    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        # A single category carries no information: drop the class.
        category_var = []
        category_data = np.empty((len(data), 0))

    domain = Domain(
        [], category_var, [StringVariable.make(name) for name in names]
    )
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)

    corpus = None
    if len(data):
        corpus = Corpus(domain,
                        Y=category_data,
                        metas=data,
                        text_features=[domain.metas[2]])
    return corpus
def create_discretized_var(cls, var, points):
    """Create the discrete variable that bins `var` at the given cut points.

    With at least one cut point the values are the interval labels
    produced by `cls._fmt_interval`; with none, the variable has the one
    value "single_value".  The result carries `compute_value`,
    `source_variable` and a matching `to_sql` object.
    """
    lpoints = list(points)
    if not lpoints:
        values = ["single_value"]
        to_sql = SingleValueSql(values[0])
    else:
        lower_bounds = [-np.inf] + lpoints
        upper_bounds = lpoints + [np.inf]
        values = [
            cls._fmt_interval(low, high, var.number_of_decimals)
            for low, high in zip(lower_bounds, upper_bounds)]
        to_sql = BinSql(var, lpoints)

    dvar = DiscreteVariable(name=var.name, values=values,
                            compute_value=cls(var, points))
    dvar.source_variable = var
    dvar.to_sql = to_sql
    return dvar
def extend_attributes(self, X, feature_names, feature_values=None,
                      compute_values=None, var_attrs=None, sparse=False):
    """
    Append features to corpus. A feature whose entry in `feature_values`
    is not None becomes Discrete, any other becomes Continuous.

    Args:
        X (numpy.ndarray or scipy.sparse.csr_matrix): Features values to
            append
        feature_names (list): List of string containing feature names
        feature_values (list): A list of possible values for Discrete
            features.
        compute_values (list): Compute values for corresponding features.
        var_attrs (dict): Additional attributes appended to
            variable.attributes.
        sparse (bool): Whether the features should be marked as sparse.
    """
    # Join the new columns to the existing feature matrix.
    if self.X.size == 0:
        self.X = X
    elif sp.issparse(self.X) or sp.issparse(X):
        self.X = sp.hstack((self.X, X)).tocsr()
    else:
        self.X = np.hstack((self.X, X))

    n_new = X.shape[1]
    if compute_values is None:
        compute_values = [None] * n_new
    if feature_values is None:
        feature_values = [None] * n_new

    extra_attrs = isinstance(var_attrs, dict)
    new_attr = self.domain.attributes
    for f, values, cv in zip(feature_names, feature_values, compute_values):
        if values is None:
            var = ContinuousVariable(f, compute_value=cv)
        else:
            var = DiscreteVariable(f, values=values, compute_value=cv)
        # don't pass this to constructor so this works with Orange < 3.8.0
        var.sparse = sparse
        if cv is not None:
            # set original variable for cv
            cv.variable = var
        if extra_attrs:
            var.attributes.update(var_attrs)
        new_attr += (var, )

    self.domain = Domain(
        attributes=new_attr,
        class_vars=self.domain.class_vars,
        metas=self.domain.metas
    )
def test_repr(self):
    """repr() shows optional flags and abbreviates long value lists."""
    gender = DiscreteVariable.make("a", values=["F", "M"])
    expected = "DiscreteVariable('a', values=['F', 'M'])"
    self.assertEqual(repr(gender), expected)

    gender.base_value = 1
    expected = "DiscreteVariable('a', values=['F', 'M'], base_value=1)"
    self.assertEqual(repr(gender), expected)

    gender.ordered = True
    expected = ("DiscreteVariable('a', values=['F', 'M'], "
                "ordered=True, base_value=1)")
    self.assertEqual(repr(gender), expected)

    # Only the first five of seven values are printed.
    digits = DiscreteVariable.make("a", values="1234567")
    expected = "DiscreteVariable('a', values=['1', '2', '3', '4', '5', ...])"
    self.assertEqual(repr(digits), expected)
def test_colors(self):
    """Colors are created on demand, are read-only and can be replaced."""
    gender = DiscreteVariable.make("a", values=["F", "M"])
    # Nothing is stored until `colors` is read for the first time.
    self.assertIsNone(gender._colors)
    self.assertEqual(gender.colors.shape, (2, 3))
    self.assertIs(gender._colors, gender.colors)
    self.assertEqual(gender.colors.shape, (2, 3))
    self.assertFalse(gender.colors.flags.writeable)

    # Assigning a whole palette works, the stored array stays read-only.
    gender.colors = np.arange(6).reshape((2, 3))
    np.testing.assert_almost_equal(gender.colors, [[0, 1, 2], [3, 4, 5]])
    self.assertFalse(gender.colors.flags.writeable)
    with self.assertRaises(ValueError):
        gender.colors[0] = [42, 41, 40]
    # Single colors are changed through set_color.
    gender.set_color(0, [42, 41, 40])
    np.testing.assert_almost_equal(
        gender.colors, [[42, 41, 40], [3, 4, 5]])

    # Hex strings in attributes["colors"] are decoded to RGB triples.
    from_hex = DiscreteVariable.make("x", values=["A", "B"])
    from_hex.attributes["colors"] = ['#0a0b0c', '#0d0e0f']
    np.testing.assert_almost_equal(
        from_hex.colors, [[10, 11, 12], [13, 14, 15]])
def create_discretized_var(cls, var, points):
    """Create the discrete variable "D_<name>" that bins `var` at `points`.

    With at least one cut point the values are the interval labels
    produced by `cls._fmt_interval` and `to_sql` renders a ``bin(...)``
    expression; with none, the variable has the one value "single_value"
    and `to_sql` renders that value as a quoted literal.

    Args:
        var: the variable being discretized.
        points: iterable of cut points.

    Returns:
        DiscreteVariable with `compute_value`, `source_variable` and
        `to_sql` set.
    """
    lpoints = list(points)
    # Test the materialised list, not `points`: the truth value of a
    # multi-element numpy array raises ValueError, and an iterator is
    # truthy even when it yields nothing.
    if lpoints:
        values = [
            cls._fmt_interval(low, high, var.number_of_decimals)
            for low, high in zip([-np.inf] + lpoints, lpoints + [np.inf])]

        def discretized_attribute():
            return 'bin(%s, ARRAY%s)' % (var.to_sql(), str(lpoints))
    else:
        values = ["single_value"]

        def discretized_attribute():
            return "'%s'" % values[0]

    dvar = DiscreteVariable(name="D_" + var.name, values=values)
    dvar.compute_value = cls(var, points)
    dvar.source_variable = var
    dvar.to_sql = discretized_attribute
    return dvar
def _guess_variable(self, field_name, field_metadata, inspect_table):
    """Choose a variable for a column from its type code.

    `field_metadata[0]` is compared against groups of numeric type codes.
    For integer and character columns, when `inspect_table` is given, the
    distinct values are fetched and, if there are any, the column becomes
    discrete.  Anything not matched otherwise becomes a string variable.
    """
    type_code = field_metadata[0]

    floatish_types = (700, 701, 1700)  # real, float8, numeric
    int_types = (20, 21, 23)  # bigint, smallint, int
    char_types = (25, 1042, 1043,)  # text, char, varchar
    boolean_types = (16,)  # bool
    date_types = (1082, 1114, 1184, )  # date, timestamp, timestamptz
    # time, timestamp, timestamptz, timetz
    time_types = (1083, 1114, 1184, 1266,)

    if type_code in floatish_types:
        return ContinuousVariable.make(field_name)

    if type_code in time_types + date_types:
        tv = TimeVariable.make(field_name)
        # OR-ed in, so flags already set on the variable are kept.
        tv.have_date |= type_code in date_types
        tv.have_time |= type_code in time_types
        return tv

    if type_code in int_types:
        if inspect_table:
            values = self.get_distinct_values(field_name, inspect_table)
            if values:
                return DiscreteVariable.make(field_name, values)
        return ContinuousVariable.make(field_name)

    if type_code in boolean_types:
        return DiscreteVariable.make(field_name, ['false', 'true'])

    if type_code in char_types and inspect_table:
        values = self.get_distinct_values(field_name, inspect_table)
        # remove trailing spaces
        values = [v.rstrip() for v in values]
        if values:
            return DiscreteVariable.make(field_name, values)

    return StringVariable.make(field_name)
def test_to_val(self):
    """to_val accepts indices and known labels; "?" maps to NaN."""
    gender = DiscreteVariable(name="Feature 0", values=["F", "M"])

    # Integer index, label and float index all resolve to position 0.
    for raw in (0, "F", 0.):
        self.assertEqual(gender.to_val(raw), 0)
    self.assertTrue(math.isnan(gender.to_val("?")))

    # TODO: with self.assertRaises(ValueError): var.to_val(2)
    with self.assertRaises(ValueError):
        gender.to_val("G")
def test_find_compatible_ordered(self):
    """_find_compatible keeps ordered and unordered variables apart."""
    abc = DiscreteVariable("abc", values="abc", ordered=True)
    lookup = DiscreteVariable._find_compatible

    # Requests without ordered=True do not match the ordered variable.
    self.assertIsNone(lookup("abc"))
    self.assertIsNone(lookup("abc", list("abc")))

    # Ordered requests with no values, a prefix, the same values, or one
    # value appended all return the existing variable.
    self.assertIs(lookup("abc", ordered=True), abc)
    for requested in (["a"],
                      ["a", "b"],
                      ["a", "b", "c"],
                      ["a", "b", "c", "d"]):
        self.assertIs(lookup("abc", requested, ordered=True), abc)

    # Values in a different order yield a new ordered variable ...
    abd = DiscreteVariable.make(
        "abc", values=["a", "d", "b"], ordered=True)
    self.assertIsNot(abc, abd)

    # ... and an unordered request yields a new unordered one.
    abc_unordered = DiscreteVariable.make("abc", values=["a", "b", "c"])
    self.assertIsNot(abc_unordered, abc)

    self.assertIs(
        lookup("abc", values=["a", "d", "b"], ordered=True), abd)
    self.assertIs(lookup("abc", values=["a", "b", "c"]), abc_unordered)
def __init__(self, filename):
    """Read a VCF file into arrays and per-record discrete variables.

    Sets:
        samples: array of `reader.samples`.
        gq: float32 array of each call's ``GQ``, one row per record and
            one column per sample, with NaN replaced via `np.nan_to_num`.
        gt: boolean array of the same layout, True where ``GT`` differs
            from "0/0".
        records: the list of parsed records.
        variables: one DiscreteVariable named "<CHROM>-<POS>" with values
            "0"/"1" per record, annotated with CHROM, POS, REF and ALT.

    Args:
        filename: path of the VCF file to read.
    """
    # Read everything inside the context manager so that the file handle
    # is closed even if parsing fails (it used to be left open).
    with open(filename, "r") as handle:
        reader = vcf.Reader(handle)
        records = list(reader)
        samples = reader.samples

    self.samples = np.array(samples)

    self.gq = np.array([[s.data.GQ for s in r.samples] for r in records],
                       dtype="f")
    self.gq = np.nan_to_num(self.gq)

    gt = np.array([[s.data.GT for s in r.samples] for r in records])
    self.gt = gt != "0/0"

    self.records = records
    self.variables = [
        DiscreteVariable("%s-%s" % (r.CHROM, r.POS), values=["0", "1"])
        for r in self.records
    ]
    for v, r in zip(self.variables, records):
        v.attributes["CHROM"] = str(r.CHROM)
        v.attributes["POS"] = str(r.POS)
        v.attributes["REF"] = str(r.REF)
        v.attributes["ALT"] = "".join(str(s) for s in r.ALT)
def test_select_data_discrete(self):
    """
    select_data keeps all rows, both attributes and the class values
    when the class is discrete.
    """
    widget = self.widget

    # test with data set for logistic regression - class discrete
    features = [ContinuousVariable('a'), ContinuousVariable('b')]
    target = DiscreteVariable('c', values=['a', 'b'])
    domain = Domain(features, target)
    data = Table.from_numpy(domain, [[1, 2], [1, 2]], [0, 1])

    self.send_signal(widget.Inputs.data, data)

    self.assertEqual(len(widget.select_data()), len(data))
    self.assertEqual(len(widget.select_data().domain.attributes), 2)
    self.assertEqual(len(widget.select_data().domain.class_var.values), 2)
    # Class values match the input, position by position.
    for index in (1, 0):
        self.assertEqual(
            widget.select_data().domain.class_var.values[index],
            data.domain.class_var.values[index])
    # The two attributes are the ones chosen for the x and y axes.
    self.assertEqual(
        widget.select_data().domain.attributes[0].name, widget.attr_x)
    self.assertEqual(
        widget.select_data().domain.attributes[1].name, widget.attr_y)
    self.assertEqual(
        widget.select_data().domain.class_var.values[0],
        widget.target_class)
def test_metadata(self):
    """
    Widget should interpret meta data which are continuous or discrete in
    the same way as features or target. However still one variable should
    be target or feature.
    """
    meta_vars = [
        ContinuousVariable("a"),
        DiscreteVariable("b", values=["y", "n"]),
    ]
    rows = list(zip([42.48, 16.84, 15.23, 23.8], "yynn"))
    table = Table(Domain([], [], meta_vars), rows)

    # Wrap Discretize so that the real one still runs while calls to it
    # are recorded.
    with patch("Orange.widgets.visualize.owsieve.Discretize",
               wraps=Discretize) as disc:
        self.send_signal(self.widget.Inputs.data, table)
        self.assertTrue(disc.called)

    metas = self.widget.discrete_data.domain.metas
    self.assertEqual(len(metas), 2)
    self.assertTrue(all(attr.is_discrete for attr in metas))
def test_XY_large(self):
    """X and Y of a table above the download limit need explicit download."""
    from Orange.data.sql.table import AUTO_DL_LIMIT as DLL

    n_partial = DLL + 10
    mat = np.random.randint(0, 2, (DLL + 100, 3))
    conn, table_name = self.create_sql_table(mat)
    class_hint = DiscreteVariable(name='col2', values=('0', '1', '2'))
    sql_table = SqlTable(conn, table_name,
                         type_hints=Domain([], class_hint))

    # Reading X or Y, or downloading fewer rows than the table has
    # without partial=True, is refused.
    self.assertRaises(ValueError, lambda: sql_table.X)
    self.assertRaises(ValueError, lambda: sql_table.Y)
    with self.assertRaises(ValueError):
        sql_table.download_data(n_partial)

    # Download partial data
    sql_table.download_data(n_partial, partial=True)
    assert_almost_equal(sql_table.X, mat[:n_partial, :2])
    assert_almost_equal(sql_table.Y.flatten()[:n_partial],
                        mat[:n_partial, 2])

    # Download all data
    sql_table.download_data()
    assert_almost_equal(sql_table.X, mat[:, :2])
    assert_almost_equal(sql_table.Y.flatten(), mat[:, 2])
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    # Falsy entries are left out of the class values.  The rest are
    # sorted, because iteration order of a set of strings changes between
    # interpreter runs and the value order decides the numeric codes in Y.
    section_values = sorted({str(cv) for cv in class_values if cv})
    class_vars = [DiscreteVariable('section', values=section_values)]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    # Column vector with one class code per record.
    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
def send_data(self):
    """Send the annotated data and the centroids of the chosen clustering.

    Both outputs are None when there is no data or no usable clustering
    for the chosen k (a missing entry or a string in its place).
    """
    if not self.optimize_k:
        k = self.k
    else:
        row = self.selected_row()
        k = None if row is None else self.k_from + row

    km = self.clusterings.get(k)
    if self.data is None or km is None or isinstance(km, str):
        self.Outputs.annotated_data.send(None)
        self.Outputs.centroids.send(None)
        return

    domain = self.data.domain
    cluster_names = ["C%d" % (x + 1) for x in range(km.k)]
    cluster_var = DiscreteVariable(
        get_unique_names(domain, "Cluster"), values=cluster_names)
    clust_ids = km(self.data)
    silhouette_var = ContinuousVariable(
        get_unique_names(domain, "Silhouette"))

    if km.silhouette_samples is None:
        self.Warning.no_silhouettes()
        scores = np.nan
    else:
        self.Warning.no_silhouettes.clear()
        # arctan(x) / pi + 0.5 maps any real score into (0, 1).
        scores = np.arctan(km.silhouette_samples) / np.pi + 0.5

    new_domain = add_columns(domain, metas=[cluster_var, silhouette_var])
    new_table = self.data.transform(new_domain)
    # Fill the two new meta columns in place.
    new_table.get_column_view(cluster_var)[0][:] = clust_ids.X.ravel()
    new_table.get_column_view(silhouette_var)[0][:] = scores

    centroids = Table(Domain(km.pre_domain.attributes), km.centroids)

    self.Outputs.annotated_data.send(new_table)
    self.Outputs.centroids.send(centroids)
def set_marking_items(self, items):
    """Store `items` and list the attributes usable for marking by them.

    The combo always starts with an "ID" entry, followed by every
    variable name that `items` and the graph's items share and that is a
    StringVariable in both domains.
    """
    self.markInputCombo.clear()
    self.markInputRadioButton.setEnabled(False)
    self.markInputItems = items

    self.warning()

    if items is None:
        return

    if self.graph is None or self.graph.items() is None:
        self.warning(
            'No graph provided or no items attached to the graph.')
        return

    if len(items) > 0:
        domain = self.graph.items().domain
        marking_names = set(
            x.name
            for x in chain(items.domain.variables, items.domain.metas))
        graph_names = set(
            x.name for x in chain(domain.variables, domain.metas))
        commonVars = marking_names & graph_names

        self.markInputCombo.addItem(
            gui.attributeIconDict[gui.vartype(DiscreteVariable())], "ID")

        for var in commonVars:
            orgVar = domain[var]
            mrkVar = items.domain[var]
            # Exact type comparison: subclasses are not accepted.
            if type(orgVar) == type(mrkVar) == StringVariable:
                icon = gui.attributeIconDict[gui.vartype(orgVar)]
                self.markInputCombo.addItem(icon, orgVar.name)

        self.markInputRadioButton.setEnabled(True)
def test_result_shape_numpy(self):
    """
    Test whether results shapes are correct when testing on numpy data
    """
    iris = Table('iris')
    # Two-class variant made of the first 100 rows, used by
    # ThresholdLearner.
    binary_domain = Domain(iris.domain.attributes,
                           DiscreteVariable("iris", values=["a", "b"]))
    iris_bin = Table(binary_domain, iris.X[:100], iris.Y[:100])

    # Learners that are constructed around a base learner.
    wrapping_learners = (ThresholdLearner, CalibratedLearner)

    for learner in all_learners():
        with self.subTest(learner.__name__):
            if learner in wrapping_learners:
                args = [LogisticRegressionLearner()]
            else:
                args = []
            data = iris_bin if learner is ThresholdLearner else iris
            model = learner(*args)(data)
            transformed_iris = model.data_to_model_domain(data)

            # One prediction per row.
            res = model(transformed_iris.X[0:5])
            self.assertTupleEqual((5, ), res.shape)

            # One probability per class value for a single row.
            res = model(transformed_iris.X[0:1], model.Probs)
            n_classes = len(data.domain.class_var.values)
            self.assertTupleEqual((1, n_classes), res.shape)
def __call__(self, data: Table, attribute):
    """Discretize a time column into intervals of `self.width` units.

    Returns a DiscreteVariable named after the source attribute whose
    values are formatted interval labels and whose `compute_value` is a
    Discretizer over the interval boundaries.

    Raises:
        TooManyIntervals: when `_time_range` returns None.
    """
    # Label format, selected by the position `self.unit`.
    label_formats = (
        "%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%y %b %d %H:%M",
        "%H:%M:%S")
    fmt = label_formats[self.unit]

    column, _ = data.get_column_view(attribute)
    times = []
    if column.size:
        mn, mx = ut.nanmin(column), ut.nanmax(column)
        # `mn` is NaN when there is nothing to bin; `times` stays empty.
        if not np.isnan(mn):
            mn = utc_from_timestamp(mn).timetuple()
            mx = utc_from_timestamp(mx).timetuple()
            times = _time_range(mn, mx, self.unit, self.width, 0, 100)
    if times is None:
        raise TooManyIntervals

    # Pad every entry with three zeros to build a struct_time, then drop
    # the first and the last boundary.
    padded = [time.struct_time(t + (0, 0, 0)) for t in times]
    times = padded[1:-1]
    points = np.array([calendar.timegm(t) for t in times])
    labels = [time.strftime(fmt, t) for t in times]
    labels = _simplified_time_intervals(labels)

    var = data.domain[attribute]
    return DiscreteVariable(name=var.name, values=labels,
                            compute_value=Discretizer(var, points),
                            sparse=var.sparse)
def test_domain_union(self):
    """domain_union merges each part, skipping variables already present."""
    X1, X2, X3 = (ContinuousVariable(n) for n in ["X1", "X2", "X3"])
    D1, D2, D3 = (
        DiscreteVariable(n, values=["a", "b"]) for n in ["D1", "D2", "D3"]
    )
    S1, S2 = (StringVariable(n) for n in ["S1", "S2"])

    # Disjoint domains are concatenated part by part.
    domain1 = Domain([X1, X2], [D1], [S1])
    domain2 = Domain([X3], [D2], [S2])
    res = domain_union(domain1, domain2)
    self.assertSequenceEqual(res.attributes, [X1, X2, X3])
    self.assertSequenceEqual(res.class_vars, [D1, D2])
    self.assertSequenceEqual(res.metas, [S1, S2])

    # Shared variables keep their position from the first domain.
    domain2 = Domain([X3, X2], [D2, D1, D3], [S2, S1])
    res = domain_union(domain1, domain2)
    self.assertSequenceEqual(res.attributes, [X1, X2, X3])
    self.assertSequenceEqual(res.class_vars, [D1, D2, D3])
    self.assertSequenceEqual(res.metas, [S1, S2])

    # The union of a domain with itself has the same parts.
    res = domain_union(domain1, domain1)
    for part in ("attributes", "class_vars", "metas"):
        self.assertSequenceEqual(getattr(res, part),
                                 getattr(domain1, part))
def test_vizrank_class_nan(self):
    """
    When class values are nan, vizrank should be disabled. It should
    behave like the class column is missing.

    GH-2757
    """
    def check_button(data, expected):
        # Send the data and compare the state of the vizrank button.
        self.send_signal(self.widget.Inputs.data, data)
        self.assertEqual(expected, self.widget.vizrank_button.isEnabled())

    # Regular data.
    data1 = Table("iris")[::30]

    # All class values missing and a class variable without values.
    data2 = Table("iris")[::30].copy()
    with data2.unlocked():
        data2.Y[:] = np.nan
    empty_class_domain = Domain(
        attributes=data2.domain.attributes[:4],
        class_vars=DiscreteVariable("iris", values=()))
    data2 = Table(empty_class_domain, data2.X, Y=data2.Y)

    # All class values missing, class variable left as it was.
    data3 = Table("iris")[::30].copy()
    with data3.unlocked():
        data3.Y[:] = np.nan

    # Alternate good and bad data so the button has to switch each time.
    scenarios = [(data1, True), (data2, False), (data1, True),
                 (data3, False), (data1, True)]
    for data, is_enabled in scenarios:
        check_button(data, is_enabled)
def _create_corpus(self):
    """Build a Corpus from the documents in ``self._text_data``.

    Each document contributes one row of string metas (name, path,
    content), NFC-normalized.  When the documents span more than one
    category, the category becomes a discrete class variable; otherwise
    the corpus has no class.

    Returns:
        Corpus or None: the corpus, or None when there are no documents.
    """
    names = ["name", "path", "content"]

    # Sort the distinct categories: iteration order of a set of strings
    # changes between interpreter runs, which made the order of the class
    # values (and hence the numeric codes in Y) irreproducible.
    # key=str keeps the ordering total even for a non-string category.
    text_categories = sorted(
        {t.category for t in self._text_data}, key=str)
    category_var = DiscreteVariable.make("category", values=text_categories)

    data = []
    category_data = []
    for textdata in self._text_data:
        data.append([
            # some characters are written as decomposed (č is char c
            # and separate char for caron), with NFC normalization we
            # normalize them to be written as precomposed (č is one
            # unicode char - 0x10D)
            # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
            normalize('NFC', textdata.name),
            normalize('NFC', textdata.path),
            normalize('NFC', textdata.content)
        ])
        category_data.append(category_var.to_val(textdata.category))

    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        # A single category carries no information: drop the class.
        category_var = []
        category_data = np.empty((len(data), 0))

    domain = Domain([], category_var,
                    [StringVariable.make(name) for name in names])
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)

    corpus = None
    if len(data):
        corpus = Corpus(domain,
                        Y=category_data,
                        metas=data,
                        text_features=[domain.metas[2]])
    return corpus
def _create_variable(self):
    """Return the discrete variable described by the active rules.

    A rule is used when it has a label, a pattern or a non-zero match
    count.  Variables are cached under a key made of the source
    attribute, the class name, the used names and patterns, and the two
    matching options.
    """
    rules = self.active_rules
    # Transposition + stripping
    keep = [label or pattern or n_matches
            for (label, pattern), n_matches in zip(rules, self.match_counts)]
    patterns = tuple(
        pattern for (_, pattern), used in zip(rules, keep) if used)
    names = tuple(
        name for name, used in zip(self.class_labels(), keep) if used)

    transformer = self.TRANSFORMERS[type(self.attribute)]

    var_key = (self.attribute, self.class_name, names, patterns,
               self.case_sensitive, self.match_beginning)
    if var_key not in self.cached_variables:
        compute_value = transformer(
            self.attribute, patterns,
            self.case_sensitive, self.match_beginning)
        self.cached_variables[var_key] = DiscreteVariable(
            self.class_name, names, compute_value=compute_value)
    return self.cached_variables[var_key]
def stratify_data(self, data: Table, state: TaskState) -> Optional[Table]:
    """Return the data with the cohort variables added as metas.

    When stratifying on the Cox risk score, a learner is fitted on `data`
    and two variables are added: the continuous risk score and the
    discrete 'Cohorts' group ('Low risk' / 'High risk').  For any other
    option no variable is added.

    NOTE(review): the model is fitted on the `data` argument, while the
    domain and the returned table come from `self.data` — confirm that
    callers always pass `self.data`.
    """
    progress_steps = iter(np.linspace(0, 100, len(data)))

    def callback():
        # Advance the progress by one step; calls past the last step are
        # ignored.
        try:
            state.set_progress_value(next(progress_steps))
        except StopIteration:
            pass

    cohort_vars = ()
    if self.stratify_on == StratifyOn.CoxRiskScore:
        cox_model = self.learner(data)
        _, risk_score_label = self.stratify_on_options[self.stratify_on]
        compute_score = partial(cox_risk_score, cox_model, data.domain)
        risk_score_var = ContinuousVariable(
            risk_score_label, compute_value=compute_score)
        compute_group = partial(
            stratify, risk_score_var, self.splitting_criteria,
            data.domain, callback)
        risk_group_var = DiscreteVariable(
            'Cohorts',
            values=['Low risk', 'High risk'],
            compute_value=compute_group,
        )
        cohort_vars = (risk_score_var, risk_group_var)

    source_domain = self.data.domain
    domain = Domain(
        source_domain.attributes,
        source_domain.class_vars,
        source_domain.metas + cohort_vars,
    )
    return self.data.transform(domain)
def _send_data(self):
    """Send the data annotated with a cluster column, and the graph.

    Cluster indices are renumbered by decreasing cluster size, so the
    most frequent cluster is labelled "C1".
    """
    if self.partition is None or self.data is None:
        return
    domain = self.data.domain

    # Compute the frequency of each cluster index
    counts = np.bincount(self.partition)
    by_size = np.argsort(counts)[::-1]
    # Original cluster index -> its rank by size.
    index_map = {old: rank for rank, old in enumerate(by_size)}
    new_partition = [index_map.get(label) for label in self.partition]

    n_clusters = len(np.unique(new_partition))
    cluster_var = DiscreteVariable(
        get_unique_names(domain, "Cluster"),
        values=["C%d" % (i + 1) for i in range(n_clusters)]
    )

    new_domain = add_columns(domain, metas=[cluster_var])
    new_table = self.data.transform(new_domain)
    # Write the cluster codes into the new meta column in place.
    new_table.get_column_view(cluster_var)[0][:] = new_partition
    self.Outputs.annotated_data.send(new_table)

    # Graph is None when graph support is not available.
    if Graph is not None:
        graph = Graph(self.graph)
        graph.set_items(new_table)
        self.Outputs.graph.send(graph)
def test_SimpleTree_to_string_regression(self):
    """Compare to_string() of a regression SimpleTree with a fixed dump.

    The tree is fitted with min_instances=1 on six rows having one
    discrete ('d1') and one continuous ('c1') feature.
    """
    domain = Domain([DiscreteVariable(name='d1', values='ef'),
                     ContinuousVariable(name='c1')],
                    ContinuousVariable(name='cls'))
    data = Table.from_list(domain, [['e', 1, 10],
                                    ['e', 1, 20],
                                    ['e', 2, 20],
                                    ['f', 2, 30],
                                    ["e", 3, 10],
                                    ['f', 3, 30]])
    lrn = SimpleTreeReg(min_instances=1)
    reg = lrn(data)
    reg_str = reg.to_string()
    # NOTE(review): nested nodes carry a single leading space here; the
    # indentation inside these literals may have been collapsed — verify
    # against the actual to_string() output.
    res = '\n' \
          'd1 (20: 6.0)\n' \
          ': e\n' \
          ' c1 (15: 4.0)\n' \
          ' : <=2.5\n' \
          ' c1 (16.6667: 3.0)\n' \
          ' : <=1.5 --> (15: 2.0)\n' \
          ' : >1.5 --> (20: 1.0)\n' \
          ' : >2.5 --> (10: 1.0)\n' \
          ': f --> (30: 2.0)'
    self.assertEqual(reg_str, res)
def send_corpus(self):
    """Send the corpus extended with a 'Duplicates Cluster' variable.

    The variable is placed among attributes, class variables or metas,
    depending on `self.cluster_role`, and its column is filled with
    `self.clustering_mask`.  Without a clustering mask, None is sent.
    """
    if self.clustering_mask is None:
        self.send(IO.corpus, None)
        return

    # The mask is written into the column as value indices, so the labels
    # must be listed in ascending order of cluster id.  Iteration order
    # of a set does not guarantee that (e.g. {1, 8} iterates as 8, 1).
    cluster_ids = sorted(set(self.clustering_mask.flatten()))
    cluster_var = DiscreteVariable(
        'Duplicates Cluster',
        values=[str(Cluster(v)) for v in cluster_ids]
    )

    corpus, domain = self.corpus, self.corpus.domain
    attrs = domain.attributes
    class_ = domain.class_vars
    metas = domain.metas

    if self.cluster_role == self.AttributeRole:
        attrs = attrs + (cluster_var,)
    elif self.cluster_role == self.ClassRole:
        class_ = class_ + (cluster_var,)
    elif self.cluster_role == self.MetaRole:
        metas = metas + (cluster_var,)

    domain = Domain(attrs, class_, metas)
    corpus = corpus.from_table(domain, corpus)
    # Fill the new column in place.
    corpus.get_column_view(cluster_var)[0][:] = self.clustering_mask
    self.send(IO.corpus, corpus)
def test_mapper_inplace(self):
    """Mappers translate one column in place, in dense and sparse data."""
    s = list(range(7))

    def fresh():
        # Column 0 holds the codes to be mapped, column 1 is a bystander.
        return np.array([[0, 0, 2, 1, 0, 1, np.nan], s]).T

    abc = DiscreteVariable("a", values=tuple("abc"))

    # abc -> dca: a becomes 2, c becomes 1, b has no counterpart.
    dca = DiscreteVariable("a", values=tuple("dca"))
    mapper = dca.get_mapper_from(abc)
    arr = fresh()
    mapper(arr, 0)
    np.testing.assert_array_equal(
        arr, np.array([[2, 2, 1, np.nan, 2, np.nan, np.nan], s]).T)

    # These inputs are rejected for in-place mapping.
    self.assertRaises(ValueError, mapper, sp.csr_matrix(arr), 0)
    self.assertRaises(ValueError, mapper, [1, 2, 3], 0)
    self.assertRaises(ValueError, mapper, 1, 0)

    # abc -> acd: a stays 0, c becomes 1, b has no counterpart.
    acd = DiscreteVariable("a", values=tuple("acd"))
    mapper = acd.get_mapper_from(abc)
    expected = np.array([[0, 0, 1, np.nan, 0, np.nan, np.nan], s]).T

    arr = fresh()
    mapper(arr, 0)
    np.testing.assert_array_equal(arr, expected)

    arr = sp.csr_matrix(fresh())
    mapper(arr, 0)
    np.testing.assert_array_equal(arr.todense(), expected)

    arr = sp.csc_matrix(fresh())
    mapper(arr, 0)
    np.testing.assert_array_equal(arr.todense(), expected)
def test_str(self):
    """str(instance) prints "[attributes | classes] {metas}"."""
    def gender():
        return DiscreteVariable("g", values="MF")

    def target():
        return DiscreteVariable("y", values="ABC")

    # Attributes only.
    domain = self.create_domain(["x", gender()])
    inst = Instance(domain, [42, 0])
    self.assertEqual(str(inst), "[42.000, M]")

    # Attributes and a class.
    domain = self.create_domain(["x", gender()], [target()])
    inst = Instance(domain, [42, "M", "B"])
    self.assertEqual(str(inst), "[42.000, M | B]")

    # Attributes, a class and metas.
    domain = self.create_domain(["x", gender()], [target()], self.metas)
    inst = Instance(domain, [42, "M", "B", "X", 43, "Foo"])
    self.assertEqual(str(inst), "[42.000, M | B] {X, 43.000, Foo}")

    # A class and metas, no attributes.
    domain = self.create_domain([], [target()], self.metas)
    inst = Instance(domain, ["B", "X", 43, "Foo"])
    self.assertEqual(str(inst), "[ | B] {X, 43.000, Foo}")

    # Metas only.
    domain = self.create_domain([], [], self.metas)
    inst = Instance(domain, ["X", 43, "Foo"])
    self.assertEqual(str(inst), "[] {X, 43.000, Foo}")

    # Continuous values are printed with three decimals ...
    n_attrs = len(self.attributes)
    domain = self.create_domain(self.attributes)
    inst = Instance(domain, range(n_attrs))
    three_decimals = ", ".join("{:.3f}".format(x) for x in range(n_attrs))
    self.assertEqual(str(inst), "[{}]".format(three_decimals))

    # ... and without any once number_of_decimals is set to 0.
    for attr in domain:
        attr.number_of_decimals = 0
    no_decimals = ", ".join("{}".format(x) for x in range(n_attrs))
    self.assertEqual(str(inst), "[{}]".format(no_decimals))
item_summary_df[item_summary_df.total_perc <= 0.5].shape # In[13]: item_summary_df[item_summary_df.total_perc <= 0.5] # # Construct Orange Table # In[16]: input_assoc_rules = grocery_df domain_grocery = Domain([DiscreteVariable.make(name=item,values=['0', '1']) for item in input_assoc_rules.columns]) data_gro_1 = Orange.data.Table.from_numpy(domain=domain_grocery, X=input_assoc_rules.as_matrix(),Y= None) # # Prune Dataset for frequently purchased items # In[2]: def prune_dataset(input_df, length_trans = 2, total_sales_perc = 0.5, start_item = None, end_item = None): if 'total_items' in input_df.columns: del(input_df['total_items']) item_count = input_df.sum().sort_values(ascending = False).reset_index() total_items = sum(input_df.sum().sort_values(ascending = False)) item_count.rename(columns={item_count.columns[0]:'item_name',item_count.columns[1]:'item_count'}, inplace=True) if not start_item and not end_item:
def test_val_from_str(self):
    """to_val maps None to NaN and passes a valid index through."""
    gender = DiscreteVariable.make("a", values=["F", "M"])
    missing = gender.to_val(None)
    self.assertTrue(math.isnan(missing))
    self.assertEqual(gender.to_val(1), 1)
def test_make(self):
    """make() returns a DiscreteVariable with the given name and values."""
    labels = ["F", "M"]
    made = DiscreteVariable.make("a", values=["F", "M"])
    self.assertIsInstance(made, DiscreteVariable)
    self.assertEqual(made.name, "a")
    self.assertEqual(made.values, labels)
def setUp(self):
    """Clear the DiscreteVariable cache before each test."""
    DiscreteVariable._clear_cache()
def test_no_duplicated_values(self):
    """Adding a value that is already present leaves the variable as is."""
    expected = ["a", "b", "c"]
    var = DiscreteVariable("foo", values=["a", "b", "c"])
    var.add_value("b")
    # Neither the values nor the value index gain an entry.
    self.assertEqual(list(var.values), expected)
    self.assertEqual(list(var._value_index), expected)
VarDataPair = namedtuple('VarDataPair', ['variable', 'data']) # Continuous variable variations continuous_full = VarDataPair( ContinuousVariable('continuous_full'), np.array([0, 1, 2, 3, 4], dtype=float), ) continuous_missing = VarDataPair( ContinuousVariable('continuous_missing'), np.array([0, 1, 2, np.nan, 4], dtype=float), ) # Unordered discrete variable variations rgb_full = VarDataPair( DiscreteVariable('rgb_full', values=('r', 'g', 'b')), np.array([0, 1, 1, 1, 2], dtype=float), ) rgb_missing = VarDataPair( DiscreteVariable('rgb_missing', values=('r', 'g', 'b')), np.array([0, 1, 1, np.nan, 2], dtype=float), ) # Ordered discrete variable variations ints_full = VarDataPair( DiscreteVariable('ints_full', values=('2', '3', '4'), ordered=True), np.array([0, 1, 1, 1, 2], dtype=float), ) ints_missing = VarDataPair( DiscreteVariable('ints_missing', values=('2', '3', '4'), ordered=True), np.array([0, 1, 1, np.nan, 2], dtype=float),
def test_no_nonstringvalues(self):
    """Non-string values are rejected by the constructor and add_value."""
    with self.assertRaises(TypeError):
        DiscreteVariable("foo", values=["a", 42])

    var = DiscreteVariable("foo", values=["a", "b", "c"])
    with self.assertRaises(TypeError):
        var.add_value(42)
def test_have_date_have_time_in_construct(self): """Test if have_time and have_date is correctly set""" var = TimeVariable('time', have_date=1) self.assertTrue(var.have_date) self.assertFalse(var.have_time) PickleContinuousVariable = create_pickling_tests( "PickleContinuousVariable", ("with_name", lambda: ContinuousVariable(name="Feature 0")), ) PickleDiscreteVariable = create_pickling_tests( "PickleDiscreteVariable", ("with_name", lambda: DiscreteVariable(name="Feature 0")), ("with_str_value", lambda: DiscreteVariable(name="Feature 0", values=["F", "M"])), ("ordered", lambda: DiscreteVariable( name="Feature 0", values=["F", "M"], ordered=True)), ("with_base_value", lambda: DiscreteVariable( name="Feature 0", values=["F", "M"], base_value=0))) PickleStringVariable = create_pickling_tests( "PickleStringVariable", ("with_name", lambda: StringVariable(name="Feature 0"))) class VariableTestMakeProxy(unittest.TestCase): def setUp(self): Variable._clear_all_caches()
def test_value_from_discrete_substring(self):
    """The lookup table holds the matching pattern index for each value."""
    source = DiscreteVariable("x", values=self.arr)
    trans = ValueFromDiscreteSubstring(source, self.patterns)
    expected_lookup = [0, 1, 2, 0, 3]
    np.testing.assert_equal(trans.lookup_table, expected_lookup)
import warnings from unittest import TestCase from unittest.mock import Mock from Orange.data import Domain, DiscreteVariable from Orange.data import ContinuousVariable from Orange.util import OrangeDeprecationWarning from Orange.widgets.settings import DomainContextHandler, ContextSetting from Orange.widgets.utils import vartype Continuous = vartype(ContinuousVariable()) Discrete = vartype(DiscreteVariable()) class TestDomainContextHandler(TestCase): def setUp(self): self.domain = Domain( attributes=[ ContinuousVariable("c1"), DiscreteVariable("d1", values="abc"), DiscreteVariable("d2", values="def"), ], class_vars=[DiscreteVariable("d3", values="ghi")], metas=[ ContinuousVariable("c2"), DiscreteVariable("d4", values="jkl") ], ) self.args = ( self.domain, { "c1": Continuous,
def setUp(self):
    """Reset the DiscreteVariable cache and build a one-row table fixture."""
    DiscreteVariable._clear_cache()
    single_row = [[1, 2, 3]]
    self.data = Table(single_row)
def test_set_data(self):
    """
    Test widget behavior when data set
    """
    widget = self.widget
    domain = self.iris.domain
    n_continuous = sum(
        isinstance(attr, ContinuousVariable) for attr in domain.attributes)

    def assert_combos_filled():
        # Both axis combos hold one entry per continuous attribute and
        # the target combo one entry per class value.
        self.assertEqual(widget.cbx.count(), n_continuous)
        self.assertEqual(widget.cby.count(), n_continuous)
        self.assertEqual(widget.target_class_combobox.count(),
                         len(domain.class_var.values))

    def assert_selection(x_index, y_index, class_index):
        # The combo boxes and the widget attributes must agree on the
        # selected x attribute, y attribute and target class.
        x_name = domain[x_index].name
        y_name = domain[y_index].name
        target = domain.class_var.values[class_index]
        self.assertEqual(widget.cbx.currentText(), x_name)
        self.assertEqual(widget.cby.currentText(), y_name)
        self.assertEqual(widget.target_class_combobox.currentText(), target)
        self.assertEqual(widget.attr_x, x_name)
        self.assertEqual(widget.attr_y, y_name)
        self.assertEqual(widget.target_class, target)

    def assert_combos_empty():
        for combo in (widget.cbx, widget.cby, widget.target_class_combobox):
            self.assertEqual(combo.count(), 0)

    # iris raises no problems in the widget, so all controls are populated
    self.send_signal(widget.Inputs.data, self.iris)
    assert_combos_filled()
    assert_selection(0, 1, 0)

    # change the shown attributes and the target class
    widget.attr_x = domain[1].name
    widget.attr_y = domain[2].name
    widget.target_class = domain.class_var.values[1]
    assert_selection(1, 2, 1)

    # remove the data set
    self.send_signal(widget.Inputs.data, None)
    assert_combos_empty()

    # send the data set again; controls are back at their initial state
    self.send_signal(widget.Inputs.data, self.iris)
    assert_combos_filled()
    assert_selection(0, 1, 0)

    # data set with no class
    table_no_class = Table(
        Domain([ContinuousVariable("x"), ContinuousVariable("y")]),
        [[1, 2], [2, 3]])
    self.send_signal(widget.Inputs.data, table_no_class)
    assert_combos_empty()
    self.assertTrue(widget.Error.no_class.is_shown())

    # data set whose class variable has a single value
    table_one_class = Table(
        Domain([ContinuousVariable("x"), ContinuousVariable("y")],
               DiscreteVariable("a", values=["k"])),
        [[1, 2], [2, 3]], [0, 0])
    self.send_signal(widget.Inputs.data, table_one_class)
    assert_combos_empty()
    self.assertTrue(widget.Error.no_class.is_shown())

    # data set with not enough continuous variables
    table_no_enough_cont = Table(
        Domain(
            [ContinuousVariable("x"),
             DiscreteVariable("y", values=["a", "b"])],
            ContinuousVariable("a")),
        [[1, 0], [2, 1]], [0, 0])
    self.send_signal(widget.Inputs.data, table_no_enough_cont)
    assert_combos_empty()
    self.assertTrue(widget.Error.to_few_features.is_shown())
def test_colors_diff_domain(self):
    """
    Test whether the color selection for values is correct.
    """
    # pylint: disable=protected-access
    widget = self.widget
    self.send_signal(widget.Inputs.data, self.iris)
    idom = self.iris.domain

    def class_domain(values):
        # iris attributes plus a fresh class variable with the given values
        return Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, values))

    def colors_for(first_data, second_data):
        # Fit one constant model per table, send both to the widget
        # (the second under id 1) and return the colors it selects.
        first_model = ConstantLearner()(first_data)
        second_model = ConstantLearner()(second_data)
        self.send_signal(widget.Inputs.predictors, first_model)
        self.send_signal(widget.Inputs.predictors, second_model, 1)
        return widget._get_colors()

    # case 1: the second domain's values are a subset of the first's
    dom1 = class_domain(idom.class_var.values)
    dom2 = class_domain(idom.class_var.values[:2])
    iris1 = self.iris[:100].transform(dom1)
    iris2 = self.iris[:100].transform(dom2)
    np.testing.assert_array_equal(
        colors_for(iris1, iris2), iris1.domain.class_var.colors)

    # case 2: subset again, but with the colors in reversed order
    reversed_colors = idom.class_var.colors[::-1]
    dom1 = class_domain(idom.class_var.values)
    dom2 = class_domain(idom.class_var.values[:2])
    dom1.class_var.colors = reversed_colors
    dom2.class_var.colors = reversed_colors[:2]
    iris1 = self.iris[:100].transform(dom1)
    iris2 = self.iris[:100].transform(dom2)
    np.testing.assert_array_equal(
        colors_for(iris1, iris2), iris1.domain.class_var.colors)

    # case 3: same values but mismatching colors - default colors expected
    dom1 = class_domain(idom.class_var.values)
    dom2 = class_domain(idom.class_var.values)
    dom1.class_var.colors = dom1.class_var.colors[::-1]
    iris1 = self.iris.transform(dom1)
    iris2 = self.iris.transform(dom2)
    np.testing.assert_array_equal(
        colors_for(iris1, iris2), ColorPaletteGenerator.palette(3))

    # case 4: values in different order, colors reordered to match
    # taking a slice of a larger palette ensures these are not the defaults
    custom_colors = ColorPaletteGenerator.palette(5)[2:]
    dom1 = class_domain(idom.class_var.values)
    dom2 = class_domain(idom.class_var.values[::-1])
    dom1.class_var.colors = custom_colors
    dom2.class_var.colors = custom_colors[::-1]  # reversed like the values
    iris1 = self.iris[:100].transform(dom1)
    iris2 = self.iris[:100].transform(dom2)
    np.testing.assert_array_equal(
        colors_for(iris1, iris2), iris1.domain.class_var.colors)