def open_data(name, flags=0):
    """Open the named data set and return it.

    ``flags`` is tested bitwise: when ``CONTINUIZE_DOMAIN`` is set the table
    is passed through ``preprocess.Continuize``; otherwise, when
    ``DISCRETIZE_DOMAIN`` is set, it is passed through ``preprocess.Discretize``
    using ``orange.EquiNDiscretization`` with ``discretize_class=False``.
    The returned table always has its ``name`` attribute set to ``name``.
    """
    table = orange.ExampleTable(name)

    # Continuization takes precedence when both flags are present.
    transform = None
    if flags & CONTINUIZE_DOMAIN:
        transform = preprocess.Continuize()
    elif flags & DISCRETIZE_DOMAIN:
        transform = preprocess.Discretize(
            method=orange.EquiNDiscretization(), discretize_class=False)

    if transform is not None:
        table = transform(table)

    table.name = name
    return table
def discretizeDomain(data, removeUnusedValues=1, numberOfIntervals=2):
    """Return a table derived from ``data`` whose attributes are discretized.

    A continuous class variable is first discretized with
    ``orange.EquiNDiscretization(numberOfIntervals=numberOfIntervals)``.
    Continuous attributes are then discretized with
    ``orange.EntropyDiscretization`` when a discrete class is available and
    with the equal-frequency discretizer otherwise. Attributes that cannot be
    processed are skipped with a warning.

    Parameters:
        data: the table to discretize; ``None`` or an empty table yields
            ``None``.
        removeUnusedValues: when true, each resulting attribute is passed
            through ``orange.RemoveUnusedValues``.
        numberOfIntervals: interval count for the equal-frequency discretizer.

    Returns:
        ``data.translate(discAttrs, True)`` for the collected attributes, or
        ``None`` for missing/empty input.
    """
    # Nothing to do for missing/empty input; bail out before building any
    # discretizer objects.
    if not data or len(data) == 0:
        return None

    entroDisc = orange.EntropyDiscretization()
    equiDisc = orange.EquiNDiscretization(numberOfIntervals=numberOfIntervals)
    discAttrs = []

    classVar = data.domain.classVar
    className = (classVar.name or None) if classVar else None

    # A continuous class has to be discretized before the attributes can be.
    if className and classVar.varType == orange.VarTypes.Continuous:
        try:
            newClass = equiDisc(classVar.name, data)
            newClass.name = className
        except orange.KernelException as ex:
            warnings.warn("Could not discretize class variable '%s'. %s"
                          % (classVar.name, ex))
            # Continue without a class.
            newClass = None
            className = None
        newDomain = orange.Domain(data.domain.attributes, newClass)
        data = orange.ExampleTable(newDomain, data)

    for attr in data.domain.attributes:
        try:
            name = attr.name
            if attr.varType == orange.VarTypes.Continuous:
                # Entropy discretization needs a discrete class.
                if (data.domain.classVar and
                        data.domain.classVar.varType == orange.VarTypes.Discrete):
                    new_attr = entroDisc(attr, data)
                else:
                    new_attr = equiDisc(attr, data)
            else:
                new_attr = attr

            if removeUnusedValues:
                new_attr = orange.RemoveUnusedValues(new_attr, data)
                if new_attr is None:
                    raise orange.KernelException("No values")

            new_attr.name = name
            discAttrs.append(new_attr)
        except orange.KernelException as ex:
            # If all values are missing, entropy discretization throws an
            # exception; in such cases the attribute is ignored.
            warnings.warn("Could not discretize %s attribute. %s"
                          % (attr.name, ex))

    if className:
        discAttrs.append(data.domain.classVar)
    return data.translate(discAttrs, True)
def cforange_attribute_distance(input_dict):
    """Fill a symmetric matrix with pairwise attribute dissimilarities.

    Reads ``input_dict['dataset']`` (the data table) and
    ``input_dict['classInteractions']`` (converted with ``int``):

    * ``0`` uses ``InteractionMatrix.exportChi2Matrix``
    * ``1`` uses ``depExportDissimilarityMatrix(jaccard=1)`` (2-interactions)
    * ``2`` uses ``exportDissimilarityMatrix(jaccard=1)`` (3-interactions)

    Returns ``None`` when the data has fewer than two attributes, when
    discretization fails, or when a class is required but missing.

    NOTE(review): the visible body never returns ``matrix`` and does nothing
    for ``classInteractions >= 3``; the function may be truncated in this
    view. Confirm the intended return value against the callers.
    """
    import orange
    import orngInteract

    inputdata = input_dict['dataset']
    classInteractions = int(input_dict['classInteractions'])
    atts = inputdata.domain.attributes
    if len(atts) < 2:
        return None

    matrix = orange.SymMatrix(len(atts))
    matrix.setattr('items', atts)

    if classInteractions < 3:
        if inputdata.domain.hasContinuousAttributes():
            # The interaction code below is fed a discretized table.
            try:
                data = orange.Preprocessor_discretize(
                    inputdata,
                    method=orange.EquiNDiscretization(numberOfIntervals=4))
            except orange.KernelException:
                return None
        else:
            data = inputdata

        # InteractionMatrix is given a table with a class variable; for the
        # chi-square mode a placeholder class is attached when none exists.
        if not data.domain.classVar:
            if classInteractions == 0:
                classedDomain = orange.Domain(
                    data.domain.attributes,
                    orange.EnumVariable("foo", values=["0", "1"]))
                data = orange.ExampleTable(classedDomain, data)
            else:
                return None

        im = orngInteract.InteractionMatrix(data, dependencies_too=1)
        off = 1
        if classInteractions == 0:
            diss, labels = im.exportChi2Matrix()
            off = 0
        elif classInteractions == 1:
            # 2-interactions
            diss, labels = im.depExportDissimilarityMatrix(jaccard=1)
        else:
            # 3-interactions
            diss, labels = im.exportDissimilarityMatrix(jaccard=1)

        # Copy the exported rows into the symmetric matrix, shifted by
        # ``off`` rows.
        for i in range(len(atts) - off):
            for j in range(i + 1):
                matrix[i + off, j] = diss[i][j]
def discretizeClass(self):
    """Discretize the class variable of ``self.originalData``.

    ``self.classDiscretization`` selects the method: ``0`` uses
    ``orange.EquiNDiscretization``, ``1`` uses
    ``orange.EquiDistDiscretization`` (both with ``self.classIntervals``
    intervals), and ``2`` uses the cut-off points typed into
    ``self.customClassSplits``. If no valid custom points can be parsed the
    method falls back to ``0``.

    On success ``self.discClassData`` (and ``self.data`` when it is set) is
    replaced, the splits label is updated and ``True`` is returned. On
    failure a warning or error is shown and ``False`` is returned. When there
    is no ``self.originalData`` nothing happens and ``None`` is returned.
    """
    if not self.originalData:
        return

    discType = self.classDiscretization
    classVar = self.originalData.domain.classVar

    if discType == 2:
        try:
            # ':', ',' and '-' all act as separators between split points.
            content = (self.customClassSplits.replace(":", " ")
                       .replace(",", " ").replace("-", " ").split())
            # Remove numeric duplicates (e.g. "8.0" and "8.000") and sort.
            # sorted(set(...)) works on both Python 2 and 3, unlike calling
            # .sort() on dict.keys().
            customs = sorted(set(float(x) for x in content))
        except Exception:
            customs = []

        if not customs:
            discType = 0

    try:
        if discType == 0:
            discretizer = orange.EquiNDiscretization(classVar, self.originalData, numberOfIntervals = self.classIntervals)
        elif discType == 1:
            discretizer = orange.EquiDistDiscretization(classVar, self.originalData, numberOfIntervals = self.classIntervals)
        else:
            # ``customs`` only exists when discType started out as 2.
            discretizer = orange.IntervalDiscretizer(points = customs).constructVariable(classVar)

        self.discClassData = orange.ExampleTable(orange.Domain(self.originalData.domain.attributes, discretizer), self.originalData)
        if self.data:
            self.data = self.discClassData
        # else, the data has no continuous attributes other than the class

        self.classIntervalsLabel.setText("Current splits: " + ", ".join([str(classVar(x)) for x in discretizer.getValueFrom.transformer.points]))
        self.error(0)
        self.warning(0)
        return True
    except Exception:
        # Best effort: report the failure in the widget instead of raising.
        if self.data:
            self.warning(0, "Cannot discretize the class; using previous class")
        else:
            self.error(0, "Cannot discretize the class")
        self.classIntervalsLabel.setText("")
        return False
def create_dataset(file_base, num_bins):
    """Bin the train and test CSV files of ``file_base`` into ``num_bins`` bins.

    Reads ``data/<file_base>_train.csv`` and ``data/<file_base>_test.csv``,
    derives per-feature cut-off points from the training data with
    ``orange.EquiNDiscretization``, applies them to both splits with
    ``np.digitize`` and writes the integer bin indices to
    ``data/<file_base>_bin_<num_bins>_train.csv`` and
    ``data/<file_base>_bin_<num_bins>_test.csv`` (``;``-separated).
    """
    file_prefix = "data/"
    file_suffix = ".csv"
    train_in_file = file_prefix + file_base + "_train" + file_suffix
    train_out_file = file_prefix + file_base + "_bin_%s_train" % (
        num_bins) + file_suffix
    test_in_file = file_prefix + file_base + "_test" + file_suffix
    test_out_file = file_prefix + file_base + "_bin_%s_test" % (
        num_bins) + file_suffix

    train_data = np.genfromtxt(train_in_file, delimiter=',', skip_header=0)
    # Fix: the test split was previously loaded from ``train_in_file``, so the
    # "test" output was just a second copy of the binned training data.
    test_data = np.genfromtxt(test_in_file, delimiter=',', skip_header=0)

    num_features = train_data.shape[1]
    # NOTE(review): np.ndarray(..., buffer=...) reinterprets the buffer's raw
    # bytes with the default dtype; confirm this header construction is what
    # Preprocessor_discretize expects.
    attributes = np.ndarray((1, num_features),
                            buffer=np.array(range(1, num_features + 1)))
    classes = np.ndarray(
        (1, num_features),
        buffer=np.array(["continuous" for i in range(num_features)]))
    orange_data = np.concatenate((attributes, classes, train_data))
    data_binned = orange.Preprocessor_discretize(
        orange_data,
        method=orange.EquiNDiscretization(numberOfIntervals=num_bins))

    # Find the cut-offs chosen by orange (from the training data) and apply
    # them to both splits.
    for i in range(num_features):
        cutoffs_string = str(
            data_binned.domain.attributes[i].getValueFrom.transformer.points
        ).lstrip('<').rstrip('>')
        bins = [float(ele) for ele in cutoffs_string.split(", ")]
        train_digitized = np.digitize(train_data[:, i], bins)
        train_data[:, i] = train_digitized
        test_digitized = np.digitize(test_data[:, i], bins)
        test_data[:, i] = test_digitized

    np.savetxt(train_out_file, train_data, fmt="%d", delimiter=";")
    np.savetxt(test_out_file, test_data, fmt="%d", delimiter=";")
def discretizeDomain(data, removeUnusedValues = 1, numberOfIntervals = 2):
    """Return ``data`` restricted to discretized versions of its attributes.

    A continuous class variable is first discretized with
    ``orange.EquiNDiscretization(numberOfIntervals=numberOfIntervals)``.
    Continuous attributes are discretized with
    ``orange.EntropyDiscretization`` when a discrete class is available and
    with the equal-frequency discretizer otherwise. An attribute whose
    processing fails is left out and a warning is issued.

    Parameters:
        data: the table to discretize; ``None`` or an empty table yields
            ``None``.
        removeUnusedValues: when true, each resulting attribute is passed
            through ``orange.RemoveUnusedValues``.
        numberOfIntervals: interval count for the equal-frequency discretizer.

    Returns:
        ``data.select(discAttrs)`` for the collected attributes, or ``None``
        for missing/empty input.
    """
    import warnings

    # Nothing to do for missing/empty input; bail out before building any
    # discretizer objects.
    if not data or len(data) == 0:
        return None

    entroDisc = orange.EntropyDiscretization()
    equiDisc = orange.EquiNDiscretization(numberOfIntervals = numberOfIntervals)
    discAttrs = []

    classVar = data.domain.classVar
    className = (classVar.name or None) if classVar else None

    # A continuous class has to be discretized before the attributes can be.
    if className and classVar.varType == orange.VarTypes.Continuous:
        newClass = equiDisc(classVar.name, data)
        newClass.name = className
        newDomain = orange.Domain(data.domain.attributes, newClass)
        data = orange.ExampleTable(newDomain, data)

    for attr in data.domain.attributes:
        try:
            newAttr = attr
            if attr.varType == orange.VarTypes.Continuous:
                # Entropy discretization needs a discrete class.
                if (data.domain.classVar and
                        data.domain.classVar.varType == orange.VarTypes.Discrete):
                    newAttr = entroDisc(attr, data)
                else:
                    newAttr = equiDisc(attr, data)
            if removeUnusedValues:
                newAttr = orange.RemoveUnusedValues(newAttr, data)
            newAttr.name = attr.name
            discAttrs.append(newAttr)
        except Exception as ex:
            # If all values are missing, entropy discretization throws an
            # exception. Such attributes are still ignored (best effort), but
            # the failure is now reported instead of being silently dropped.
            warnings.warn("Could not discretize %s attribute. %s"
                          % (attr.name, ex))

    if className:
        discAttrs.append(data.domain.classVar)
    return data.select(discAttrs)
def computeMatrix(self):
    """Build the symmetric attribute-distance matrix for ``self.data``.

    ``self.classInteractions`` selects the measure: values below 3 go through
    ``orngInteract.InteractionMatrix`` (0: chi-square export, 1:
    2-interactions, 2: 3-interactions), 3 stores the ``p`` of
    ``orange.PearsonCorrelation`` and anything else stores element ``[1]`` of
    ``statc.spearmanr``.

    Returns the ``orange.SymMatrix``, or ``None`` when there is no data or
    when the chosen measure needs a class attribute that is missing.
    """
    self.error()
    if not self.data:
        return None

    atts = self.data.domain.attributes
    n_atts = len(atts)
    matrix = orange.SymMatrix(n_atts)
    matrix.setattr('items', atts)
    mode = self.classInteractions

    if mode >= 3:
        # Correlation-based measures work on the raw data.
        if mode == 3:
            for a1 in range(n_atts):
                for a2 in range(a1):
                    matrix[a1, a2] = orange.PearsonCorrelation(
                        a1, a2, self.data, 0).p
        else:
            import numpy
            import statc
            m = self.data.toNumpyMA("A")[0]
            averages = numpy.ma.average(m, axis=0)
            # Masked entries of each column are filled with the column average.
            filleds = [
                list(numpy.ma.filled(m[:, i], averages[i]))
                for i in range(n_atts)
            ]
            for a1, f1 in enumerate(filleds):
                for a2 in range(a1):
                    matrix[a1, a2] = statc.spearmanr(f1, filleds[a2])[1]
        return matrix

    # Interaction-based measures are given discretized data; the discretized
    # table is cached on the widget.
    if not self.data.domain.hasContinuousAttributes():
        data = self.data
    else:
        if self.discretizedData is None:
            self.discretizedData = orange.Preprocessor_discretize(
                self.data,
                method=orange.EquiNDiscretization(numberOfIntervals=4))
        data = self.discretizedData

    # This is ugly, but: Aleks' code which computes Chi2 requires the class
    # attribute because it prepares some common stuff for all measures. If we
    # want to use his code, we need the class variable, so we prepare a fake
    # one.
    if not data.domain.classVar:
        if mode != 0:
            self.error(
                "The selected distance measure requires a data set with a class attribute"
            )
            return None
        classedDomain = orange.Domain(
            data.domain.attributes,
            orange.EnumVariable("foo", values=["0", "1"]))
        data = orange.ExampleTable(classedDomain, data)

    im = orngInteract.InteractionMatrix(data, dependencies_too=1)
    if mode == 0:
        diss, _ = im.exportChi2Matrix()
        off = 0
    else:
        off = 1
        if mode == 1:
            diss, _ = im.depExportDissimilarityMatrix(jaccard=1)  # 2-interactions
        else:
            diss, _ = im.exportDissimilarityMatrix(jaccard=1)  # 3-interactions

    for i in range(n_atts - off):
        row = diss[i]
        for j in range(i + 1):
            matrix[i + off, j] = row[j]
    return matrix
def _prepare(self, t):
    """Prepare an Orange table so that it contains neither continuous
    attributes nor missing values, and return the prepared table.

    Attributes whose ``varType`` is 2 are discretized (entropy-based, falling
    back to two equal-frequency intervals). Every column that contains a
    special value is then replaced by an ``M_``-prefixed enum variable with
    an extra ``'.'`` value standing in for the special values.
    """
    ### DISCRETIZE VARIABLES ###
    entropy_disc = orange.EntropyDiscretization()
    equal_freq_disc = orange.EquiNDiscretization(numberOfIntervals=2)
    discretized = []
    untouched = []
    for var in t.domain.attributes:
        # NOTE(review): 2 is presumably orange.VarTypes.Continuous - confirm.
        if var.varType != 2:
            untouched.append(var)
            continue
        d = entropy_disc(var, t)
        if len(d.values) < 2:
            # prevent discretization into a single value
            d = equal_freq_disc(var, t)
            d.name = 'E' + d.name
        warnings.warn('Discretizing %s into %s with %d values.' %
                      (var.name, d.name, len(d.values)))
        discretized.append(d)
    if discretized:
        t = t.select(untouched + discretized + [t.domain.classVar])

    ### FIX MISSING VALUES ###
    # 2006-08-23: fixed by PJ: append classVar only if it exists
    all_attributes = list(t.domain.attributes)
    if t.domain.classVar:
        all_attributes.append(t.domain.classVar)

    # Indices of the columns holding at least one special value.
    special_attributes = [
        col for col in range(len(all_attributes))
        if any(ex[col].isSpecial() for ex in t)
    ]
    if not special_attributes:
        return t

    # prepare attributes
    newatts = []
    for col, old in enumerate(all_attributes):
        if col not in special_attributes:
            newatts.append(old)
            continue
        oldv = list(old.values)
        assert ('.' not in oldv)
        new = orange.EnumVariable(name='M_' + old.name, values=oldv + ['.'])
        warnings.warn('Removing special values from %s into %s.' %
                      (old.name, new.name))
        newatts.append(new)

    # convert table
    exs = []
    # 2006-08-23: added by PJ: add a class variable (if not already existing)
    if not t.domain.classVar:
        newatts.append(orange.EnumVariable("class", values=["."]))
        t = orange.ExampleTable(
            orange.Domain(t.domain.attributes, newatts[-1]), t)

    newd = orange.Domain(newatts)
    for ex in t:
        nex = []
        for col, target in enumerate(newatts):
            if ex[col].isSpecial():
                nex.append(target('.'))
            else:
                nex.append(target(int(ex[col])))
        exs.append(orange.Example(newd, nex))
    return orange.ExampleTable(exs)
# Description: Entropy based discretization compared to discretization with equal-frequency
#              of instances in intervals
# Category:    preprocessing
# Uses:        iris.tab
# Classes:     Preprocessor_discretize, EntropyDiscretization
# Referenced:  o_categorization.htm

import orange


def show_values(data, heading):
    """Print ``heading`` followed by one ``name: v1, v2, ...`` line for each
    attribute of ``data``."""
    print(heading)
    for a in data.domain.attributes:
        # str.join also copes with an attribute that has no values, where
        # reduce() over an empty sequence would raise TypeError.
        print("%s: %s" % (a.name, ", ".join(a.values)))


data = orange.ExampleTable("iris")

data_ent = orange.Preprocessor_discretize(
    data, method=orange.EntropyDiscretization())
show_values(data_ent, "Entropy based discretization")
print("")

data_n = orange.Preprocessor_discretize(
    data, method=orange.EquiNDiscretization(numberOfIntervals=3))
show_values(data_n, "Equal-frequency intervals")
# Description: Attribute-based discretization. Shows how different attributes may be discretized with different categorization methods and how the default attribute values names used by these methods may be simply replaced by the list of user-defined names.
# Category:    preprocessing
# Uses:        iris
# Classes:     EquiNDiscretization, EntropyDiscretization
# Referenced:  o_categorization.htm

import orange


def printexamples(data, inxs, msg="%i examples"):
    """Print ``msg`` formatted with ``len(inxs)``, then each index in ``inxs``
    with the corresponding example of ``data``, then a blank line."""
    print(msg % len(inxs))
    for i in inxs:
        print("%s %s" % (i, data[i]))
    print("")


iris = orange.ExampleTable("iris")
equiN = orange.EquiNDiscretization(numberOfIntervals=4)
entropy = orange.EntropyDiscretization()

# Equal-frequency discretization with user-defined value names.
pl = equiN("petal length", iris)
sl = equiN("sepal length", iris)
pl.values = sl.values = ["very low", "low", "high", "very high"]
sl_ent = entropy("sepal length", iris)

inxs = [0, 15, 35, 50, 98]
d_iris = iris.select(
    ["sepal width", pl, "sepal length", sl, sl_ent, iris.domain.classVar])

printexamples(iris, inxs, "%i examples before discretization")
# Fix: this second report shows the discretized table, so it is labelled
# "after" (it previously repeated "before").
printexamples(d_iris, inxs, "%i examples after discretization")
class TestDiscretizeEquiN(testing.PreprocessorTestCase):
    """Preprocessor test case for ``Preprocessor_discretize`` configured with
    ``orange.EquiNDiscretization`` using its default settings.

    NOTE(review): the checks themselves presumably come from
    ``testing.PreprocessorTestCase``, which reads ``PREPROCESSOR`` - confirm
    against that base class.
    """
    # Preprocessor instance under test.
    PREPROCESSOR = Preprocessor_discretize(method=orange.EquiNDiscretization())
# Script: bin every column of data/features.csv into ``num_bins`` intervals
# using cut-offs computed by orange's equal-frequency discretization, and
# write the integer bin indices to data/features_bin_<num_bins>.csv.
num_bins = 5
file_prefix = "data/"
file_suffix = ".csv"
file_base = "features"
in_file = "".join([file_prefix, file_base, file_suffix])
out_file = "%s%s_bin_%s%s" % (file_prefix, file_base, num_bins, file_suffix)

# The first line of the input file is skipped.
data = np.genfromtxt(in_file, delimiter=',', skip_header=1)
num_features = data.shape[1]

# Two extra rows are stacked on top of the data before it is handed to orange.
attributes = np.ndarray(
    (1, num_features), buffer=np.array(range(1, num_features + 1)))
classes = np.ndarray(
    (1, num_features), buffer=np.array(["continuous"] * num_features))
orange_data = np.concatenate((attributes, classes, data))
data_binned = orange.Preprocessor_discretize(
    orange_data,
    method=orange.EquiNDiscretization(numberOfIntervals=num_bins))

# find cutoffs from orange
for i in range(num_features):
    points = data_binned.domain.attributes[i].getValueFrom.transformer.points
    # The cut-offs are parsed from the string form of ``points`` after
    # stripping the surrounding '<' and '>'.
    cutoffs_string = str(points).lstrip('<').rstrip('>')
    bins = [float(ele) for ele in cutoffs_string.split(", ")]
    digitized = np.digitize(data[:, i], bins)
    data[:, i] = digitized

np.savetxt(out_file, data, fmt="%d", delimiter=",")
def computeDiscretizer(self, i, idx, onlyDefaults=False):
    """(Re)compute the discretizer for the attribute at domain index ``idx``.

    The per-attribute settings are read from ``self.indiData[idx]``; its
    first two items are the discretization type and the number of intervals.
    Type 0 means "use the widget default" (``self.discretization`` /
    ``self.intervals``). Types above ``self.D_N_METHODS`` select one of the
    custom split lists stored in the same settings record; an empty or
    unparsable list falls back to the default.

    The result is stored in ``self.discretizers[idx]`` (``None`` = leave
    continuous, ``False`` = removed or cannot be discretized), the label
    ``self.indiLabels[i]`` is updated, ``self.attrList`` is reset and, when
    ``i`` is the selected attribute, the graph splits are refreshed.

    With ``onlyDefaults`` true, attributes that do not use the default
    method are left untouched.
    """
    attr = self.data.domain[idx]
    indiData = self.indiData[idx]

    discType, intervals = indiData[:2]
    discName = self.shortDiscNames[discType]

    defaultUsed = not discType
    if defaultUsed:
        discType = self.discretization + 1
        intervals = self.intervals

    if discType >= self.D_N_METHODS + 1:
        # Custom split points.
        try:
            customs = [float(r) for r in indiData[discType - self.D_N_METHODS + 1]]
        except Exception:
            customs = []

        if not customs:
            # No usable custom points: fall back to the default method and
            # say so in the label.
            discType = self.discretization + 1
            intervals = self.intervals
            discName = "%s ->%s)" % (self.shortDiscNames[indiData[0]][:-1], self.shortDiscNames[discType][2:-1])
            defaultUsed = True

    if onlyDefaults and not defaultUsed:
        return

    discType -= 1
    try:
        if discType == self.D_LEAVE:  # leave continuous
            discretizer = None
        elif discType == self.D_ENTROPY:
            discretizer = orange.EntropyDiscretization(attr, self.data)
        elif discType == self.D_FREQUENCY:
            discretizer = orange.EquiNDiscretization(attr, self.data, numberOfIntervals = intervals)
        elif discType == self.D_WIDTH:
            discretizer = orange.EquiDistDiscretization(attr, self.data, numberOfIntervals = intervals)
        elif discType == self.D_REMOVE:
            discretizer = False
        else:
            discretizer = orange.IntervalDiscretizer(points = customs).constructVariable(attr)
    except Exception:
        # Best effort: a failed discretization is reported in the label below.
        discretizer = False

    self.discretizers[idx] = discretizer

    if discType in (self.D_LEAVE, self.D_REMOVE):
        discInts = ""
    elif not discretizer:
        discInts = ": " + "<can't discretize>"
    else:
        points = discretizer.getValueFrom.transformer.points
        if points:
            discInts = ": " + ", ".join([str(attr(x)) for x in points])
        else:
            discInts = ": " + "<removed>"
    self.indiLabels[i] = discInts + discName
    self.attrList.reset()

    if i == self.selectedAttr:
        splits = discretizer.getValueFrom.transformer.points if discretizer else []
        self.graph.setSplits(splits or [])