def test_continuizer_iris(self):
    """Exercise DomainContinuizer's continuous-attribute treatments on iris."""
    iris = orange.ExampleTable("iris")
    continuizer = orange.DomainContinuizer()
    continuizer.class_treatment = continuizer.ClassTreatment.LeaveUnlessTarget

    # Leave: the resulting domain is expected to keep the same variables.
    continuizer.continuous_treatment = continuizer.ContinuousTreatment.Leave
    cdomain = continuizer(iris.domain)
    self.assertEqual(cdomain.variables, iris.domain.variables)

    # NormalizeBySpan: a bare domain must be rejected with ValueError; given
    # the data, every value is expected to become (x - min) / (max - min).
    continuizer.continuous_treatment = \
        continuizer.ContinuousTreatment.NormalizeBySpan
    self.assertRaises(ValueError, continuizer, iris.domain)
    cdomain = continuizer(iris)
    converted = orange.ExampleTable(cdomain, iris)
    stats = orange.DomainBasicAttrStat(iris)
    for raw, scaled in zip(iris[:10], converted):
        for col in range(4):
            expected = ((raw[col] - stats[col].min) /
                        (stats[col].max - stats[col].min))
            self.assertEqual(expected, scaled[col])

    # NormalizeByVariance: same contract, expected value (x - avg) / dev.
    continuizer.continuous_treatment = \
        continuizer.ContinuousTreatment.NormalizeByVariance
    self.assertRaises(ValueError, continuizer, iris.domain)
    cdomain = continuizer(iris)
    converted = orange.ExampleTable(cdomain, iris)
    stats = orange.DomainBasicAttrStat(iris)
    for raw, scaled in zip(iris[:10], converted):
        for col in range(4):
            expected = (raw[col] - stats[col].avg) / stats[col].dev
            self.assertEqual(expected, scaled[col])
def __call__(self, attr, data):
    """Return the cached class-separation score of ``attr`` on ``data``.

    Scores for all attributes are recomputed when ``data`` differs from the
    data set stored by the previous call (or when the cache is empty).  The
    score of an attribute is the sum, over every pair of class values
    ``(j, k)``, of ``abs((avg_j - avg_k) / (dev_j + dev_k))``, where the
    averages and deviations come from ``orange.DomainBasicAttrStat`` computed
    on the examples of each class value.  Pairs whose deviations do not sum
    to a positive number contribute nothing.

    :param attr: anything accepted by ``data.domain[...]``.
    :param data: data set with a class variable that has ``values``.
    :return: the score, or ``-1`` if no score was computed for ``attr``.
    """
    # if the data changed clear the attribute values
    if data != self.data:
        self.attrInfo = {}
        self.data = data

    if not self.attrInfo:
        class_var = data.domain.classVar
        subsets = [data.select({class_var.name: [value]})
                   for value in class_var.values]
        stats = [orange.DomainBasicAttrStat(subset) for subset in subsets]
        class_count = len(stats)

        for i in range(len(stats[0])):
            # Entries without statistics are None and are skipped.
            if stats[0][i] is None:
                continue
            score = 0.0
            for j in range(class_count):
                for k in range(j + 1, class_count):
                    dev_sum = stats[j][i].dev + stats[k][i].dev
                    if dev_sum > 0:
                        score += abs(
                            (stats[j][i].avg - stats[k][i].avg) / dev_sum)
            self.attrInfo[data.domain.attributes[i].name] = score

    # dict.get replaces has_key(), which no longer exists in Python 3.
    return self.attrInfo.get(data.domain[attr].name, -1)
def test_pickle(self):
    """Check DomainBasicAttrStat values, indexing, iteration and purge()."""
    import pickle

    iris = orange.ExampleTable("iris")
    sepal_lengths = [float(example[0]) for example in iris]

    stats = orange.DomainBasicAttrStat(iris)
    first = stats[0]

    # NOTE(review): the unpickled copy is never inspected below, so this
    # only verifies that the dumps/loads round trip does not raise.
    pickled = pickle.dumps(stats)
    restored = pickle.loads(pickled)

    # Statistics of the first attribute match values computed by hand.
    self.assertEqual(first.variable, iris.domain[0])
    self.assertAlmostEqual(first.min, min(sepal_lengths))
    self.assertAlmostEqual(first.max, max(sepal_lengths))
    self.assertAlmostEqual(first.avg,
                           sum(sepal_lengths) / len(sepal_lengths))

    # Index, name and variable lookups must all yield the very same object.
    self.assertEqual(id(first), id(stats["sepal length"]))
    self.assertEqual(id(first), id(stats[iris.domain[0]]))
    as_list = list(stats)
    self.assertEqual(id(first), id(as_list[0]))

    # The last slot is expected to be None until purge() removes it.
    self.assertEqual(as_list[-1], None)
    self.assertTrue(stats.has_class_var)
    self.assertEqual(len(stats), 5)
    self.assertEqual(len(as_list), 5)
    stats.purge()
    self.assertEqual(len(stats), 4)
def cforange_hierarchical_clustering_finished(postdata, input_dict, output_dict):
    """Split the clustered examples by the user's selection and build centroids.

    :param postdata: mapping of posted form fields; ``selected_nodes`` must
        hold a one-element list with a JSON array of
        ``[example_index, selected, cluster]`` triples.
    :param input_dict: must provide ``'dm'`` (a distance matrix with an
        ``items`` attribute) and ``'linkage'`` (convertible to ``int``).
    :param output_dict: not used by this function.
    :return: dict with keys ``'centroids'``, ``'selected_examples'`` and
        ``'unselected_examples'``; all three are ``None`` when
        ``matrix.items`` is not an ``orange.ExampleTable``.
    :raises Exception: if ``selected_nodes`` is missing or is not valid JSON.
    """
    import json

    matrix = input_dict['dm']
    linkage = int(input_dict['linkage'])

    # Validate the posted selection first so that bad input fails fast with
    # the user-facing message.  Only the errors that a missing or malformed
    # field can produce are translated; anything else propagates unchanged.
    try:
        selected_nodes = json.loads(postdata.get('selected_nodes')[0])
    except (TypeError, ValueError, IndexError, KeyError):
        raise Exception('Please select a threshold for determining clusters.')

    import orange

    if isinstance(matrix.items, orange.ExampleTable):
        # NOTE(review): the clustering result is not used below; the call is
        # kept in case it validates its inputs - confirm before removing.
        Clustering.hierarchical_clustering(linkage, matrix)

        cluster_ids = set(cluster for _, _, cluster in selected_nodes)
        selected_clusters = set(
            cluster for _, selected, cluster in selected_nodes if selected)

        # Sorted so that the order of the enum values is deterministic.
        clustVar = orange.EnumVariable(
            str('Cluster'),
            values=["Cluster %d" % i for i in sorted(cluster_ids)] + ["Other"])

        origDomain = matrix.items.domain
        domain = orange.Domain(origDomain.attributes, origDomain.classVar)
        domain.addmeta(orange.newmetaid(), clustVar)
        domain.addmetas(origDomain.getmetas())

        # Build the tables of selected and unselected examples.
        selected_table = orange.ExampleTable(domain)
        unselected_table = orange.ExampleTable(domain)
        for index, selected, cluster in selected_nodes:
            new_ex = orange.Example(domain, matrix.items[index])
            if selected:
                new_ex[clustVar] = clustVar("Cluster %d" % cluster)
                selected_table.append(new_ex)
            else:
                new_ex[clustVar] = clustVar("Other")
                unselected_table.append(new_ex)

        # Build the table of centroids, one row per selected cluster.
        centroids = orange.ExampleTable(selected_table.domain)
        if len(selected_table) > 0:
            for cluster in sorted(selected_clusters):
                label = "Cluster %d" % cluster
                clusterEx = orange.ExampleTable(
                    [ex for ex in selected_table if ex[clustVar] == label])

                # Average where basic statistics exist, otherwise the modus
                # of the distribution, otherwise unknown ("?").
                contstat = orange.DomainBasicAttrStat(clusterEx)
                discstat = orange.DomainDistributions(clusterEx, 0, 0, 1)
                values = [
                    cs.avg if cs else (ds.modus() if ds else "?")
                    for cs, ds in zip(contstat, discstat)
                ]
                example = orange.Example(centroids.domain, values)
                example[clustVar] = clustVar(label)
                centroids.append(example)
    else:
        # Attribute distance: there are no examples to split.
        centroids, selected_table, unselected_table = None, None, None

    return {
        'centroids': centroids,
        'selected_examples': selected_table,
        'unselected_examples': unselected_table
    }
def __call__(self, trainingData=None, weight=None, allowMetas=False):
    """Validate the training data and record statistics and parameters.

    Prints an error and returns ``False`` when the training data is missing,
    has duplicated names, has no class variable, no examples or no
    attributes.  Raises ``Exception`` when attributes that look like
    meta-attributes are detected and ``allowMetas`` is false.  Otherwise
    fills ``self.basicStat`` (per variable name: ``None`` for discrete and
    string variables, else a dict with ``dev``/``min``/``max``/``avg``) and
    ``self.parameters``, and returns ``True``.  ``weight`` is not used here.
    """
    self.basicStat = None

    # Sanity checks, evaluated in this order; the first failure wins.
    if not trainingData:
        print("AZBaseClasses ERROR: Missing training data!")
        return False
    if dataUtilities.findDuplicatedNames(trainingData.domain):
        print("AZBaseClasses ERROR: Duplicated names found in the training data. Please use the method dataUtilities.DataTable() when loading a dataset in order to fix the duplicated names and avoid this error.")
        return False
    if not trainingData.domain.classVar:
        print("AZBaseClasses ERROR: No class attribute found in training data!")
        return False
    if not len(trainingData):
        print("AZBaseClasses ERROR: No examples in training data!")
        return False
    if not len(trainingData.domain.attributes):
        print("AZBaseClasses ERROR: No attributes in training data!")
        return False

    possibleMetas = dataUtilities.getPossibleMetas(trainingData,
                                                   checkIndividuality=True)
    if possibleMetas and not allowMetas:
        header = "\nAZBaseClasses ERROR: Detected attributes that should be considered meta-attributes:"
        raise Exception(
            header + "".join("\n " + name for name in possibleMetas))

    # Get the Domain basic statistics and keep only the desired info.
    domainStat = orange.DomainBasicAttrStat(trainingData)
    nonNumeric = [orange.VarTypes.Discrete, orange.VarTypes.String]
    self.basicStat = {}
    for var in trainingData.domain:
        if var.varType in nonNumeric:
            self.basicStat[var.name] = None
            continue
        self.basicStat[var.name] = {
            "dev": domainStat[var].dev,
            "min": domainStat[var].min,
            "max": domainStat[var].max,
            "avg": domainStat[var].avg
        }

    # Gather all the learner parameters to be stored with the classifier.
    # The learner name is the last dotted component of str(self.__class__),
    # cut at the last single quote.
    classRepr = str(self.__class__)
    learnerName = classRepr[:classRepr.rfind("'")].split(".")[-1]

    self.parameters = {}
    if learnerName != "ConsensusLearner":
        # Load the AZLearnersParamsConfig.py from the AZORANGEHOME!
        configPath = os.path.join(os.environ["AZORANGEHOME"], 'azorange',
                                  "AZLearnersParamsConfig.py")
        AZOLearnersConfig = imp.load_source("AZLearnersParamsConfig",
                                            configPath)
        pars = AZOLearnersConfig.API(learnerName)
        if pars:
            for par in pars.getParameterNames():
                self.parameters[par] = getattr(self, par)
    return True
def data_center(data):
    """Return an example holding the per-attribute centre of ``data``.

    Continuous attributes get their average, discrete attributes the index
    of the largest entry of their distribution, anything else ``None``.  If
    the domain has a class variable, 0 is appended as the class value.
    """
    atts = data.domain.attributes
    astats = orange.DomainBasicAttrStat(data)

    center = []
    for att in atts:
        if att.varType == orange.VarTypes.Continuous:
            value = astats[att].avg
        elif att.varType == orange.VarTypes.Discrete:
            distribution = orange.Distribution(att, data)
            # max() over (index, count) pairs keeps the first index on ties.
            value = max(enumerate(distribution),
                        key=lambda pair: pair[1])[0]
        else:
            value = None
        center.append(value)

    if data.domain.classVar:
        center.append(0)
    return orange.Example(data.domain, center)
def data_center(data):
    """
    Returns a center of the instances in the data set (average across data
    instances for continuous attributes, ``_modus`` of the distribution for
    discrete attributes, ``None`` for other attribute types). If the domain
    has a class variable, 0 is appended as the class value.
    """
    atts = data.domain.attributes
    astats = orange.DomainBasicAttrStat(data)

    def center_value(att):
        # One coordinate of the centre, chosen by the attribute's type.
        if att.varType == orange.VarTypes.Continuous:
            return astats[att].avg
        if att.varType == orange.VarTypes.Discrete:
            return _modus(orange.Distribution(att, data))
        return None

    center = [center_value(att) for att in atts]
    if data.domain.classVar:
        center.append(0)
    return orange.Example(data.domain, center)
def test_equalWidth(self):
    """Check equal-width discretization of the four iris attributes.

    Covers the discretizer as constructed (expected to give 4 intervals),
    a run with ``n_intervals = 5``, the values of examples converted to the
    discretized domain, and the same conversion through a pickled copy of
    that domain.
    """
    d = orange.ExampleTable("iris")
    ba = orange.DomainBasicAttrStat(d)
    ddisc = orange.DomainDiscretization(orange.EqualWidthDiscretization())

    dd = ddisc(d)
    for i in range(4):
        self.assertEqual(len(dd[i].values), 4)
        mi, ma = ba[i].min, ba[i].max
        di = ma - mi
        trans = dd[i].get_value_from.transformer
        self.assertAlmostEqual(trans.first_cut, mi + di / 4, 1)
        self.assertAlmostEqual(trans.step, di / 4, 1)
        self.assertEqual(trans.n_intervals, 4)

    ddisc.discretization.n_intervals = 5
    dd = ddisc(d)
    for i in range(4):
        self.assertEqual(len(dd[i].values), 5)
        mi, ma = ba[i].min, ba[i].max
        di = ma - mi
        trans = dd[i].get_value_from.transformer
        self.assertAlmostEqual(trans.first_cut, mi + di / 5, 1)
        self.assertAlmostEqual(trans.step, di / 5, 1)
        self.assertEqual(trans.n_intervals, 5)
        points = trans.points
        # Walk the cut points with j: indexing with the attribute index i
        # would re-check a single point four times.
        for j in range(4):
            self.assertAlmostEqual(points[j], trans.first_cut + j * di / 5)

    d2 = orange.ExampleTable(dd, d)
    for e, e2 in zip(d[:5], d2):
        for i in range(4):
            trans = dd[i].get_value_from.transformer
            self.assertEqual(
                e2[i],
                math.floor((e[i] - trans.first_cut) / trans.step) + 1)

    s = pickle.dumps(dd)
    dd2 = pickle.loads(s)
    d3 = orange.ExampleTable(dd2, d)
    for e, e2 in zip(d[:5], d3):
        for i in range(4):
            # Deliberately the transformer of the original domain: values
            # converted through the unpickled domain must match it.
            trans = dd[i].get_value_from.transformer
            self.assertEqual(
                e2[i],
                math.floor((e[i] - trans.first_cut) / trans.step) + 1)
def MeasureAttribute_info(self, attr, data):
    """Return the normalized class-separation score of ``attr`` on ``data``.

    Scores are recomputed when no statistics are cached or ``data`` differs
    from the cached data set.  For every pair of class values and every
    continuous attribute the contribution is
    ``abs(avg_i - avg_j) * (n_i + n_j) / (n_i * dev_i + n_j * dev_j)``
    (a zero denominator is replaced by 0.001).  The summed scores are then
    divided by the largest one, so the best attribute scores 1; attributes
    that are not continuous keep a score of 0.

    :param attr: anything accepted by ``data.domain[...]``.
    :param data: data set whose class variable has ``values``.
    :return: the score stored for the attribute's name.
    :raises KeyError: if no score is stored for the attribute's name.
    """
    # if basic statistics is not computed for this dataset -> compute it
    if not (self.stats and self.dataset == data):
        self.stats = {}
        self.dataset = data
        attributes = data.domain.attributes
        scores = [0] * len(attributes)

        for class_value in data.domain.classVar.values:
            subset = data.select({data.domain.classVar: class_value})
            self.stats[class_value] = orange.DomainBasicAttrStat(subset)

        # Snapshot the statistics once instead of indexing dict.keys()
        # repeatedly, which is not possible in Python 3.  Classes with
        # empty statistics take no part in any pair.
        per_class = [stat for stat in self.stats.values() if len(stat) != 0]
        for position, stat_i in enumerate(per_class):
            for stat_j in per_class[position + 1:]:
                for index, attribute in enumerate(attributes):
                    if attribute.varType != orange.VarTypes.Continuous:
                        continue
                    first, second = stat_i[index], stat_j[index]
                    bottom = first.n * first.dev + second.n * second.dev
                    if bottom == 0.0:
                        bottom = 0.001  # avoid division by zero
                    scores[index] += (abs(first.avg - second.avg) *
                                      (first.n + second.n) / bottom)

        # normalize so that the largest value will be 1 and the others
        # proportionally smaller; guard against a domain without attributes
        largest = max(scores) if scores else 0
        if largest != 0:
            scores = [score / largest for score in scores]

        for attribute, score in zip(attributes, scores):
            self.attrInfo[attribute.name] = score

    return self.attrInfo[data.domain[attr].name]
def tubedRegression(cache, dimensions, progressCallback=None, **args):
    """Fit a locally weighted univariate regression around every example.

    For each index ``d`` in ``dimensions`` and each example ``exi`` in
    ``cache.data`` the class is regressed on the attribute
    ``cache.contIndices[d]`` over the example's neighbours, weighted by
    ``exp(kw * (x - ref_x)**2)``.  The slope is stored in
    ``cache.deltas[exi][d]`` ("?" when it cannot be computed) and the result
    of ``statc.fprob`` for the fit in ``cache.errors[exi][d]``.

    ``cache.findNearest`` and ``cache.attrStat`` are created on first use.
    While a dimension is processed, its entry in the distance normalizers is
    set to 0 and restored afterwards.  ``progressCallback``, if given, is
    called with a running value after each example that reaches the fit.
    ``args`` is accepted but not used.
    """
    if not cache.findNearest:
        cache.findNearest = orange.FindNearestConstructor_BruteForce(
            cache.data,
            distanceConstructor=orange.ExamplesDistanceConstructor_Euclidean(),
            includeSame=True)
    if not cache.attrStat:
        cache.attrStat = orange.DomainBasicAttrStat(cache.data)

    normalizers = cache.findNearest.distance.normalizers

    if progressCallback:
        nExamples = len(cache.data)
        nPoints = 100.0 / nExamples / len(dimensions)

    # Falls back to len(cache.deltas) when there is a single continuous
    # attribute or when cache.nNeighbours is falsy.
    effNeighbours = ((len(cache.contAttributes) > 1 and cache.nNeighbours)
                     or len(cache.deltas))

    for di, d in enumerate(dimensions):
        contIdx = cache.contIndices[d]
        dimStat = cache.attrStat[contIdx]
        if dimStat.min == dimStat.max:
            continue  # constant attribute, nothing to regress on

        oldNormalizer = normalizers[contIdx]
        normalizers[contIdx] = 0

        for exi, ref_example in enumerate(cache.data):
            if ref_example[contIdx].isSpecial():
                cache.deltas[exi][d] = "?"
                continue

            ref_x = float(ref_example[contIdx])
            candidates = cache.findNearest(ref_example, 0, True)
            neighbours = [ex for ex in candidates
                          if not ex[contIdx].isSpecial()][:effNeighbours]
            spreads = [abs(ex[contIdx] - ref_x) for ex in neighbours]
            if not spreads:
                cache.deltas[exi][d] = "?"
                continue

            # Kernel width chosen so that the farthest neighbour gets
            # weight .001 (unless all neighbours coincide with ref_x).
            widest = max(spreads)
            if widest < 1e-10:
                kw = math.log(.001)
            else:
                kw = math.log(.001) / widest**2

            # Weighted sums for the least-squares fit.
            Sx = Sy = Sxx = Syy = Sxy = n = 0.0
            for ex in neighbours:
                ex_x = float(ex[contIdx])
                ex_y = float(ex.getclass())
                w = math.exp(kw * (ex_x - ref_x)**2)
                Sx += w * ex_x
                Sy += w * ex_y
                Sxx += w * ex_x**2
                Syy += w * ex_y**2
                Sxy += w * ex_x * ex_y
                n += w

            div = n * Sxx - Sx**2
            if not div:
                cache.deltas[exi][d] = "?"
            else:
                b = (Sxy * n - Sx * Sy) / div
                a = (Sy - b * Sx) / n
                err = (n * a**2 + b**2 * Sxx + Syy + 2 * a * b * Sx -
                       2 * a * Sy - 2 * b * Sxy)
                tot = Syy - Sy**2 / n
                mod = tot - err
                merr = err / (n - 2)
                if merr < 1e-10:
                    Fprob = 1
                else:
                    F = mod / merr
                    Fprob = statc.fprob(F, 1, int(n - 2))
                cache.errors[exi][d] = Fprob
                cache.deltas[exi][d] = b

            if progressCallback:
                progressCallback((nExamples * di + exi) * nPoints)

        normalizers[contIdx] = oldNormalizer
def commit_data(self):
    """Send the selected and unselected items, and the cluster centroids.

    Does nothing when the matrix has no items.  When nothing is selected,
    ``None`` is sent on "Selected Data", "Other Data" and "Centroids".
    For ``ExampleTable`` items the examples are split by selection and,
    if ``self.AppendClusters`` is set, labelled with a cluster variable
    placed as class (``addIdAs == 0``), attribute (``addIdAs == 1``) or
    meta attribute (otherwise).  One centroid per selected cluster is built
    from attribute averages and distribution modes.  For other items with
    ``matrixSource == "Data Distance"`` the selection is grouped by
    ``strain`` and sent on "Structured Data Files".
    """
    items = getattr(self.matrix, "items", None)
    if not items:
        return  # nothing to commit

    self.selectionChanged = False
    self.selectedExamples = None

    selection = sorted(self.selected_clusters, key=lambda c: c.first)
    maps = [
        list(self.root_cluster.mapping[c.first:c.last]) for c in selection
    ]
    # Flatten in one pass; reduce(add, ...) re-copies the growing list for
    # every cluster and is not a builtin in Python 3.
    selected_indices = [index for cluster_map in maps
                        for index in cluster_map]
    unselected_indices = sorted(
        set(self.root_cluster.mapping) - set(selected_indices))

    self.selection = selected = [items[k] for k in selected_indices]
    unselected = [items[k] for k in unselected_indices]

    if not selected:
        self.send("Selected Data", None)
        self.send("Other Data", None)
        self.send("Centroids", None)
        return

    if isinstance(items, ExampleTable):
        # c[k] is the index of the cluster that the k-th selected item
        # belongs to.
        c = [i for i, cluster_map in enumerate(maps) for _ in cluster_map]
        clustVar = None
        if self.AppendClusters:
            clustVar = orange.EnumVariable(
                str(self.ClassifyName),
                values=["Cluster " + str(i) for i in range(len(maps))] +
                ["Other"])
            origDomain = items.domain
            if self.addIdAs == 0:
                # Cluster becomes the class; the old class moves to metas.
                domain = orange.Domain(origDomain.attributes, clustVar)
                if origDomain.classVar:
                    domain.addmeta(orange.newmetaid(), origDomain.classVar)
            elif self.addIdAs == 1:
                domain = orange.Domain(origDomain.attributes + [clustVar],
                                       origDomain.classVar)
            else:
                domain = orange.Domain(origDomain.attributes,
                                       origDomain.classVar)
                domain.addmeta(orange.newmetaid(), clustVar)
            domain.addmetas(origDomain.getmetas())

            # `selected` is known to be non-empty at this point.
            table1 = orange.ExampleTable(domain, selected)
            for i in range(len(selected)):
                table1[i][clustVar] = clustVar("Cluster " + str(c[i]))
            table2 = None
            if unselected:
                table2 = orange.ExampleTable(domain, unselected)
                for ex in table2:
                    ex[clustVar] = clustVar("Other")

            self.selectedExamples = table1
            self.unselectedExamples = table2
        else:
            self.selectedExamples = orange.ExampleTable(selected)
            self.unselectedExamples = orange.ExampleTable(
                unselected) if unselected else None

        self.send("Selected Data", self.selectedExamples)
        self.send("Other Data", self.unselectedExamples)

        self.centroids = None
        if self.selectedExamples:
            self.centroids = orange.ExampleTable(
                self.selectedExamples.domain)
            for i in range(len(maps)):
                clusterEx = orange.ExampleTable([
                    ex for cluster, ex in zip(c, self.selectedExamples)
                    if cluster == i
                ])
                # Average where basic statistics exist, otherwise the modus
                # of the distribution, otherwise unknown ("?").
                contstat = orange.DomainBasicAttrStat(clusterEx)
                discstat = orange.DomainDistributions(clusterEx, 0, 0, 1)
                values = [
                    cs.avg if cs else (ds.modus() if ds else "?")
                    for cs, ds in zip(contstat, discstat)
                ]
                example = orange.Example(self.centroids.domain, values)
                if clustVar is not None:
                    example[clustVar] = clustVar(i)
                # Append the labelled example; appending the raw value list
                # would drop the cluster label assigned just above.
                self.centroids.append(example)
        self.send("Centroids", self.centroids)

    elif self.matrixSource == "Data Distance":
        names = list(set(d.strain for d in self.selection))
        data = [(name, [d for d in self.selection if d.strain == name])
                for name in names]
        self.send("Structured Data Files", data)
# Description: Shows how to compute and print out the basic attribute statistics
# Category: statistics
# Classes: DomainBasicAttrStat, BasicAttrStat
# Uses: iris
# Referenced: basicstat.htm

import orange

data = orange.ExampleTable("iris")
bas = orange.DomainBasicAttrStat(data)

# Header row, then one row per entry that has statistics (falsy entries
# are skipped).
print("%20s %5s %5s %5s" % ("attribute", "min", "max", "avg"))
for stat in bas:
    if not stat:
        continue
    print("%20s %5.3f %5.3f %5.3f" % (stat.variable.name, stat.min,
                                      stat.max, stat.avg))

# Statistics can also be looked up by attribute name.
print(bas["sepal length"].avg)
def __call__(self, attribute, data):
    """Return the rank-based score of ``attribute`` on ``data``.

    When ``data`` differs from the data set stored by the previous call, the
    caches are cleared and a ranking is rebuilt: for every class value the
    non-discrete attributes are ordered by the score returned by
    ``S2NMeasure.__call__`` on ``mergeClassValues(data, value)``.  The score
    is negated when that class value does not have the highest attribute
    average, and replaced by 0 when no statistics exist.  The per-class
    orderings are then interleaved round-robin, best first, adding each
    attribute once.  An attribute at position ``i`` of the resulting list of
    ``count`` attributes gets the score ``count - i``.

    :param attribute: anything accepted by ``data.domain[...]``.
    :param data: data set whose class variable has ``values``.
    :return: the score, or ``-1`` if the attribute was not ranked.
    """
    # if the data changed clear the attribute values
    if data != self.dataMix:
        self.attrInfoMix = {}
        self.attrInfo = {}
        self.dataMix = data

    if not self.attrInfoMix:
        attrs = range(len(data.domain.attributes))
        classVar = data.domain.classVar
        datas = [
            data.select({classVar.name: [val]}) for val in classVar.values
        ]
        statistics = [orange.DomainBasicAttrStat(d) for d in datas]

        # cls[k] lists attribute indices, best first, for class value k.
        cls = []
        for classVarIndex, c in enumerate(classVar.values):
            # for each class value compute how good is each attribute for
            # discriminating this class value against all other
            attrValsList = []
            newData = mergeClassValues(data, c)
            for attrIndex in attrs:
                if data.domain[attrIndex].varType == orange.VarTypes.Discrete:
                    continue  # ignore discrete attributes
                # NOTE(review): presumably this call re-caches its scores
                # for newData on self - confirm against S2NMeasure.
                val = S2NMeasure.__call__(self, attrIndex, newData)
                if statistics[0][attrIndex] is None:
                    attrValsList.append((0, attrIndex))
                else:
                    aves = [stat[attrIndex].avg for stat in statistics]
                    if max(aves) != aves[classVarIndex]:
                        val = -val
                    attrValsList.append((val, attrIndex))
            # Ascending sort followed by reverse, exactly as before, so that
            # ties keep their original relative order.
            attrValsList.sort()
            attrValsList.reverse()
            cls.append([index for _, index in attrValsList])

        # For each attribute, its position in every per-class ordering.
        attrPositionsDict = dict((attr, []) for attr in cls[0])
        for arr in cls:
            for i in range(len(arr)):
                attrPositionsDict[arr[i]].append(i)

        numClasses = len(classVar.values)
        currPos = [0] * numClasses
        self.sortedAttrList = []
        ableToAdd = 1
        while ableToAdd:
            # sometimes some attributes are duplicated. in such cases we
            # will add only one instance of such attribute to the list
            ableToAdd = 0
            for i in range(numClasses):
                pos = currPos[i]
                # Skip entries that were already taken (marked None).
                while pos < len(cls[i]) and cls[i][pos] is None:
                    pos += 1
                currPos[i] = pos + 1
                if pos >= len(cls[i]):
                    continue
                ableToAdd = 1
                attr = cls[i][pos]
                self.sortedAttrList.append(attr)
                # get indices in cls where attribute attr is placed
                attrPositions = attrPositionsDict[attr]
                for j in range(numClasses):
                    cls[j][attrPositions[j]] = None

        count = len(self.sortedAttrList)
        for (i, attr) in enumerate(self.sortedAttrList):
            self.attrInfoMix[data.domain[attr].name] = count - i

    # dict.get replaces has_key(), which no longer exists in Python 3.
    return self.attrInfoMix.get(data.domain[attribute].name, -1)