def __call__(self, examples):
    """Fit one single-attribute logistic model per attribute plus an
    intercept-only (majority) model, and bundle their statistics into a
    Univariate_LogRegClassifier."""
    examples = createFullNoDiscTable(examples)
    class_var = examples.domain.classVar

    # One univariate model per attribute, fitted on complete rows only.
    per_attr = []
    for attribute in examples.domain.attributes:
        subset = examples.select(orange.Domain(attribute, class_var))
        per_attr.append(
            LogRegLearner(orange.Preprocessor_dropMissing(subset)))

    # Intercept-only model fitted on the class column alone.
    majority = LogRegLearner(orange.Preprocessor_dropMissing(
        examples.select(orange.Domain(class_var))))

    # Entry 0 of each statistic vector comes from the majority model;
    # the remaining entries take coefficient index 1 (the attribute's
    # own coefficient) from each univariate model.
    beta = [majority.beta[0]] + [c.beta[1] for c in per_attr]
    beta_se = [majority.beta_se[0]] + [c.beta_se[1] for c in per_attr]
    P = [majority.P[0]] + [c.P[1] for c in per_attr]
    wald_Z = [majority.wald_Z[0]] + [c.wald_Z[1] for c in per_attr]

    return Univariate_LogRegClassifier(beta=beta, beta_se=beta_se, P=P,
                                       wald_Z=wald_Z,
                                       domain=examples.domain)
def __call__(self, examples, weight=0):
    """Fit a logistic regression classifier on `examples`.

    Optionally imputes/drops missing values, optionally runs stepwise
    attribute selection first, and — when `removeSingular` is set —
    iteratively removes attributes the fitter reports as problematic
    until a proper classifier is obtained.
    """
    # Imputer constructor configured on the learner, or None if unset.
    imputer = getattr(self, "imputer", None) or None
    if getattr(self, "removeMissing", 0):
        examples = orange.Preprocessor_dropMissing(examples)
##        if hasDiscreteValues(examples.domain):
##            examples = createNoDiscTable(examples)
    if not len(examples):
        # Nothing to learn from.
        return None
    if getattr(self, "stepwiseLR", 0):
        # Stepwise attribute subset selection before fitting.
        addCrit = getattr(self, "addCrit", 0.2)
        removeCrit = getattr(self, "removeCrit", 0.3)
        numAttr = getattr(self, "numAttr", -1)
        attributes = StepWiseFSS(examples, addCrit=addCrit,
                                 deleteCrit=removeCrit, imputer=imputer,
                                 numAttr=numAttr)
        tmpDomain = orange.Domain(attributes, examples.domain.classVar)
        tmpDomain.addmetas(examples.domain.getmetas())
        examples = examples.select(tmpDomain)
    learner = orange.LogRegLearner()
    learner.imputerConstructor = imputer
    if imputer:
        # self.imputer(examples) constructs an imputer from the data;
        # applying it fills the missing values before fitting.
        examples = self.imputer(examples)(examples)
    examples = orange.Preprocessor_dropMissing(examples)
    if self.fitter:
        learner.fitter = self.fitter
    if self.removeSingular:
        lr = learner.fitModel(examples, weight)
    else:
        lr = learner(examples, weight)
    while isinstance(lr, orange.Variable):
        # fitModel returned a variable instead of a classifier —
        # presumably the attribute that made the fit singular (matches
        # the `removeSingular` option); drop it from the domain and
        # refit. TODO confirm against orange.LogRegFitter docs.
        if isinstance(lr.getValueFrom, orange.ClassifierFromVar) and isinstance(lr.getValueFrom.transformer, orange.Discrete2Continuous):
            # A continuized dummy variable — map back to the original
            # discrete attribute before removing it.
            lr = lr.getValueFrom.variable
        attributes = examples.domain.attributes[:]
        if lr in attributes:
            attributes.remove(lr)
        else:
            attributes.remove(lr.getValueFrom.variable)
        newDomain = orange.Domain(attributes, examples.domain.classVar)
        newDomain.addmetas(examples.domain.getmetas())
        examples = examples.select(newDomain)
        lr = learner.fitModel(examples, weight)
    return lr
def getImputer(self):
    """Build the imputation preprocessor selected by ``self.methodInd``.

    Indices 0-2 select a learner-based imputer (learner class taken from
    ``self.IMPUTERS[methodInd][1]``); index 3 drops examples with missing
    values.

    Returns:
        A Preprocessor_imputeByLearner or orange.Preprocessor_dropMissing
        instance.

    Raises:
        ValueError: if ``methodInd`` is not one of the supported indices.
            (The original code left ``imputer`` unbound in that case and
            crashed with an UnboundLocalError instead.)
    """
    if self.methodInd in (0, 1, 2):
        learner = self.IMPUTERS[self.methodInd][1]()
        return Preprocessor_imputeByLearner(learner=learner)
    elif self.methodInd == 3:
        return orange.Preprocessor_dropMissing()
    raise ValueError(
        "unsupported imputation method index: %r" % (self.methodInd,))
def test_dropMissingValues(self):
    """Preprocessor_dropMissing keeps only fully specified rows and
    links the result to the source table via ``.base``."""
    rows = [[0, 0, 0],
            [0, 0, "?"],
            [0, 1, 0],
            [1, 0, 1],
            [0, "?", 0],
            [1, 0, 1]]
    self.data += rows
    cleaned = orange.Preprocessor_dropMissing(self.data)
    # Two of the six appended rows contain "?", leaving four complete ones.
    self.assertEqual(len(cleaned), 4)
    self.assertIs(self.data, cleaned.base)
def MeasureAttribute_info(self, attr, data):
    """Pearson correlation between `attr` and the class, computed over
    the rows with neither value missing."""
    table = orange.Preprocessor_dropMissing(
        data.select([attr, data.domain.classVar]))
    attr_vals = []
    class_vals = []
    for row in table:
        attr_vals.append(row[0].value)
        class_vals.append(row[1].value)
    val, prob = statc.pearsonr(attr_vals, class_vals)
    return val
def computeCorrelation(data, attr1, attr2):
    """Pearson correlation between two continuous attributes.

    Rows with a missing value in either attribute are dropped first.

    Returns:
        The correlation coefficient, ``None`` when either attribute is
        not continuous, or ``0.0`` when the correlation cannot be
        computed (e.g. a constant or otherwise invalid column).
    """
    if data.domain[attr1].varType != orange.VarTypes.Continuous:
        return None
    if data.domain[attr2].varType != orange.VarTypes.Continuous:
        return None

    table = orange.Preprocessor_dropMissing(data.select([attr1, attr2]))
    a1 = [row[attr1].value for row in table]
    a2 = [row[attr2].value for row in table]
    try:
        val, prob = statc.pearsonr(a1, a2)
    # BUG FIX: was a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit; keep the best-effort fallback but
    # catch only real errors.
    except Exception:
        val = 0.0  # possibly invalid a1 or a2
    return val
def computeCorrelationInsideClasses(data, attr1, attr2):
    """Class-conditional correlation of two continuous attributes.

    Computes the Pearson correlation of `attr1` and `attr2` separately
    within each class value, then averages the absolute correlations
    weighted by the number of examples in each class.

    Returns:
        ``(weighted_avg_abs_corr, per_class_corrs, per_class_lengths)``,
        or ``None`` when either attribute is not continuous. Raises
        ZeroDivisionError if every class subset is empty (unchanged from
        the original behavior).
    """
    if data.domain[attr1].varType != orange.VarTypes.Continuous:
        return None
    if data.domain[attr2].varType != orange.VarTypes.Continuous:
        return None

    table = orange.Preprocessor_dropMissing(
        data.select([attr1, attr2, data.domain.classVar]))
    lengths = []
    corrs = []
    for cls_value in table.domain.classVar.values:
        tab = table.filter({table.domain.classVar: cls_value})
        a1 = [tab[k][attr1].value for k in range(len(tab))]
        a2 = [tab[k][attr2].value for k in range(len(tab))]
        if len(a1) == 0:
            continue
        # BUG FIX: the original rebound the loop variable `val` here,
        # shadowing the class value it was iterating over; renamed to
        # avoid the trap.
        corr_val, prob = statc.pearsonr(a1, a2)
        lengths.append(len(a1))
        corrs.append(corr_val)

    # Weighted average of |r| across classes.
    corr = sum(abs(c) * n for c, n in zip(corrs, lengths))
    corr /= sum(lengths)
    return corr, corrs, lengths
def __call__(self, examples):
    """Stepwise (forward/backward) attribute selection for logistic
    regression.

    Repeatedly tries to delete the least useful attribute already chosen
    (backward step) and to add the most useful remaining attribute
    (forward step), judging each move by a likelihood-ratio statistic G
    tested against a chi-square distribution. Returns the list of
    selected attributes.
    """
    if getattr(self, "imputer", 0):
        # Build an imputer from the data, then apply it.
        examples = self.imputer(examples)(examples)
    if getattr(self, "removeMissing", 0):
        examples = orange.Preprocessor_dropMissing(examples)
    continuizer = orange.DomainContinuizer(
        zeroBased=1,
        continuousTreatment=orange.DomainContinuizer.Leave,
        multinomialTreatment=orange.DomainContinuizer.FrequentIsBase,
        classTreatment=orange.DomainContinuizer.Ignore)
    attr = []                                   # attributes selected so far
    remain_attr = examples.domain.attributes[:] # candidates not yet selected
    # get LL for Majority Learner (empty attribute set = intercept-only model)
    tempDomain = orange.Domain(attr, examples.domain.classVar)
    #tempData  = orange.Preprocessor_dropMissing(examples.select(tempDomain))
    tempData = orange.Preprocessor_dropMissing(examples.select(tempDomain))
    ll_Old = getLikelihood(orange.LogRegFitter_Cholesky(), tempData)
    ll_Best = -1000000
    length_Old = float(len(tempData))
    stop = 0
    while not stop:
        # LOOP until all variables are added or no further deletion nor
        # addition of attribute is possible
        worstAt = None
        # if there are more than 1 attribute then perform backward elimination
        if len(attr) >= 2:
            minG = 1000
            worstAt = attr[0]
            ll_Best = ll_Old
            length_Best = length_Old
            for at in attr:
                # check all attributes: does the presence of `at` increase
                # the log-likelihood enough to keep it?
                tempAttr = filter(lambda x: x != at, attr)
                tempDomain = orange.Domain(tempAttr, examples.domain.classVar)
                tempDomain.addmetas(examples.domain.getmetas())
                # continuize the candidate domain, then compute the LL of
                # the model without `at`
                tempDomain = continuizer(
                    orange.Preprocessor_dropMissing(
                        examples.select(tempDomain)))
                tempData = orange.Preprocessor_dropMissing(
                    examples.select(tempDomain))
                ll_Delete = getLikelihood(orange.LogRegFitter_Cholesky(),
                                          tempData)
                length_Delete = float(len(tempData))
                # likelihoods are normalized by example count because the
                # two models may be fitted on different numbers of
                # complete rows
                length_Avg = (length_Delete + length_Old) / 2.0
                G = -2 * length_Avg * (ll_Delete / length_Delete -
                                       ll_Old / length_Old)
                # set new worst attribute (smallest G = least LL loss
                # when removed)
                if G < minG:
                    worstAt = at
                    minG = G
                    ll_Best = ll_Delete
                    length_Best = length_Delete
            # deletion of attribute: degrees of freedom are 1 for a
            # continuous attribute, (#values - 1) for a discrete one
            if worstAt.varType == orange.VarTypes.Continuous:
                P = lchisqprob(minG, 1)
            else:
                P = lchisqprob(minG, len(worstAt.values) - 1)
            if P >= self.deleteCrit:
                attr.remove(worstAt)
                remain_attr.append(worstAt)
                nodeletion = 0
                ll_Old = ll_Best
                length_Old = length_Best
            else:
                nodeletion = 1
        else:
            nodeletion = 1
        # END OF DELETION PART

        # if enough attributes have been chosen, stop the procedure
        # (emptying remain_attr skips the forward step below)
        if self.numAttr > -1 and len(attr) >= self.numAttr:
            remain_attr = []
        # forward step: for each attribute in the remaining candidates
        maxG = -1
        ll_Best = ll_Old
        length_Best = length_Old
        bestAt = None
        for at in remain_attr:
            tempAttr = attr + [at]
            tempDomain = orange.Domain(tempAttr, examples.domain.classVar)
            tempDomain.addmetas(examples.domain.getmetas())
            # continuize the candidate domain, then compute the LL of the
            # model extended with `at`
            tempDomain = continuizer(
                orange.Preprocessor_dropMissing(
                    examples.select(tempDomain)))
            tempData = orange.Preprocessor_dropMissing(
                examples.select(tempDomain))
            ll_New = getLikelihood(orange.LogRegFitter_Cholesky(), tempData)
            # get number of examples in tempData to normalize likelihood
            length_New = float(len(tempData))
            # P=PR(CHI^2>G), G=-2(L(0)-L(1))=2(E(0)-E(1))
            length_avg = (length_New + length_Old) / 2
            G = -2 * length_avg * (ll_Old / length_Old -
                                   ll_New / length_New)
            if G > maxG:
                bestAt = at
                maxG = G
                ll_Best = ll_New
                length_Best = length_New
        if not bestAt:
            # no candidate left (or none improved); stop
            stop = 1
            continue
        if bestAt.varType == orange.VarTypes.Continuous:
            P = lchisqprob(maxG, 1)
        else:
            P = lchisqprob(maxG, len(bestAt.values) - 1)
        # Add attribute with smallest P to attributes(attr)
        if P <= self.addCrit:
            attr.append(bestAt)
            remain_attr.remove(bestAt)
            ll_Old = ll_Best
            length_Old = length_Best
        # stop when neither adding nor deleting changed anything, or when
        # the procedure would just re-add the attribute it deleted
        if (P > self.addCrit and nodeletion) or (bestAt == worstAt):
            stop = 1
    return attr
# Demo script (Python 2): shows the effect of the missing-value
# preprocessors. `pp`, `data`, `age` and `astigm` are defined earlier
# in the file.
data2 = pp(data)
print "Removing 50% of class values:",
for ex in data2:
    print ex.getclass(),
print

# Drop examples whose class value is unknown.
data2 = orange.Preprocessor_dropMissingClasses(data2)
print "Removing examples with unknown class values:",
for ex in data2:
    print ex.getclass(),
print

# Inject missing values into two attributes, marked as "don't care".
print "\n\nRemoving 20% of values of 'age' and 50% of astigmatism:"
pp = orange.Preprocessor_addMissing()
pp.proportions = {age: 0.2, astigm: 0.5}
pp.specialType = orange.ValueTypes.DC
data2 = pp(data)
for ex in data2:
    print ex

# Keep only fully specified examples.
print "\n\nRemoving examples with unknown values"
data3 = orange.Preprocessor_dropMissing(data2)
for ex in data3:
    print ex

# Keep only examples that DO contain a missing value.
print "\n\nSelecting examples with unknown values"
data3 = orange.Preprocessor_takeMissing(data2)
for ex in data3:
    print ex
def getFunctionalList(data):
    """Greedily order discrete attributes by minimum-complexity merging.

    Starts from the single attribute that best explains the class (via
    orngCI.FeatureByMinComplexity), then repeatedly merges in the next
    best attribute, returning the attribute names in the order they were
    chosen. Returns [] when no usable discrete attribute exists.
    """
    import orngCI
    bestQual = -10000000
    bestAttr = -1
    testAttrs = []

    dataShort = orange.Preprocessor_dropMissing(data)
    # keep only discrete attributes that have more than one value
    disc = []
    for attribute in dataShort.domain.attributes:
        if (attribute.varType == orange.VarTypes.Discrete
                and len(attribute.values) > 1):
            disc.append(attribute.name)
    if not disc:
        return []
    discData = dataShort.select(disc + [dataShort.domain.classVar.name])
    remover = orngCI.AttributeRedundanciesRemover(noMinimization=1)
    newData = remover(discData, weight=0)
    for attr in newData.domain.attributes:
        testAttrs.append(attr.name)

    # compute the best starting attribute
    for i in range(len(newData.domain.attributes)):
        vals, qual = orngCI.FeatureByMinComplexity(
            newData, [newData.domain.attributes[i], newData.domain.classVar])
        if qual > bestQual:
            bestQual = qual
            bestAttr = newData.domain.attributes[i].name
            mergedVals = vals
            mergedVals.name = newData.domain.classVar.name
    if bestAttr == -1:
        return []
    outList = [bestAttr]
    newData = replaceAttributes(bestAttr, newData.domain.classVar,
                                mergedVals, newData)
    testAttrs.remove(bestAttr)

    while testAttrs != []:
        bestQual = -10000000
        for attrName in testAttrs:
            vals, qual = orngCI.FeatureByMinComplexity(newData,
                                                       [mergedVals, attrName])
            if qual > bestQual:
                # BUG FIX: was `bestqual = qual` (lowercase typo), so
                # bestQual never updated and bestAttr ended up as the
                # LAST candidate instead of the best-scoring one.
                bestQual = qual
                bestAttr = attrName
        vals, qual = orngCI.FeatureByMinComplexity(newData,
                                                   [mergedVals, bestAttr])
        mergedVals = vals
        mergedVals.name = newData.domain.classVar.name
        newData = replaceAttributes(bestAttr, newData.domain.classVar,
                                    mergedVals, newData)
        outList.append(bestAttr)
        testAttrs.remove(bestAttr)

    # new attributes have "'" at the end of their names; strip it so they
    # can be identified in the old domain
    for index in range(len(outList)):
        if outList[index][-1] == "'":
            outList[index] = outList[index][:-1]
    return outList