Exemple #1
0
    def __call__(self, data, weight_id=0, **kwds):
        import Orange.evaluation.testing, Orange.evaluation.scoring, statc

        self.__dict__.update(kwds)

        if self.remove_threshold < self.add_threshold:
            raise ValueError(
                "'remove_threshold' should be larger or equal to 'add_threshold'"
            )

        classVar = data.domain.classVar

        indices = Orange.core.MakeRandomIndicesCV(data,
                                                  folds=getattr(
                                                      self, "folds", 10))
        domain = Orange.data.Domain([], classVar)

        res = Orange.evaluation.testing.test_with_indices([self.learner],
                                                          Orange.data.Table(
                                                              domain, data),
                                                          indices)

        oldStat = self.stat(res)[0]
        oldStats = [
            self.stat(x)[0]
            for x in Orange.evaluation.scoring.split_by_iterations(res)
        ]
        print ".", oldStat, domain
        stop = False
        while not stop:
            stop = True
            if len(domain.attributes) >= 2:
                bestStat = None
                for attr in domain.attributes:
                    newdomain = Orange.data.Domain(
                        filter(lambda x: x != attr, domain.attributes),
                        classVar)
                    res = Orange.evaluation.testing.test_with_indices(
                        [self.learner],
                        (Orange.data.Table(newdomain, data), weight_id),
                        indices)

                    newStat = self.stat(res)[0]
                    newStats = [
                        self.stat(x)[0] for x in
                        Orange.evaluation.scoring.split_by_iterations(res)
                    ]
                    print "-", newStat, newdomain
                    ## If stat has increased (ie newStat is better than bestStat)
                    if not bestStat or cmp(newStat, bestStat) == self.statsign:
                        if cmp(newStat, oldStat) == self.statsign:
                            bestStat, bestStats, bestAttr = newStat, newStats, attr
                        elif statc.wilcoxont(
                                oldStats, newStats)[1] > self.remove_threshold:
                            bestStat, bestAttr, bestStats = newStat, newStats, attr
                if bestStat:
                    domain = Orange.data.Domain(
                        filter(lambda x: x != bestAttr, domain.attributes),
                        classVar)
                    oldStat, oldStats = bestStat, bestStats
                    stop = False
                    print "removed", bestAttr.name

            bestStat, bestAttr = oldStat, None
            for attr in data.domain.attributes:
                if not attr in domain.attributes:
                    newdomain = Orange.data.Domain(domain.attributes + [attr],
                                                   classVar)
                    res = Orange.evaluation.testing.test_with_indices(
                        [self.learner],
                        (Orange.data.Table(newdomain, data), weight_id),
                        indices)

                    newStat = self.stat(res)[0]
                    newStats = [
                        self.stat(x)[0] for x in
                        Orange.evaluation.scoring.split_by_iterations(res)
                    ]
                    print "+", newStat, newdomain

                    ## If stat has increased (ie newStat is better than bestStat)
                    if cmp(newStat,
                           bestStat) == self.statsign and statc.wilcoxont(
                               oldStats, newStats)[1] < self.add_threshold:
                        bestStat, bestStats, bestAttr = newStat, newStats, attr
            if bestAttr:
                domain = Orange.data.Domain(domain.attributes + [bestAttr],
                                            classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print "added", bestAttr.name

        return self.learner(Orange.data.Table(domain, data), weight_id)
  def __call__(self, data, weight_id = 0, **kwds):
    import Orange.evaluation.testing, Orange.evaluation.scoring, statc
    
    self.__dict__.update(kwds)

    if self.remove_threshold < self.add_threshold:
        raise ValueError("'remove_threshold' should be larger or equal to 'add_threshold'")

    classVar = data.domain.classVar
    
    indices = Orange.core.MakeRandomIndicesCV(data, folds = getattr(self, "folds", 10))
    domain = Orange.data.Domain([], classVar)

    res = Orange.evaluation.testing.test_with_indices([self.learner], Orange.data.Table(domain, data), indices)
    
    oldStat = self.stat(res)[0]
    oldStats = [self.stat(x)[0] for x in Orange.evaluation.scoring.split_by_iterations(res)]
    print ".", oldStat, domain
    stop = False
    while not stop:
        stop = True
        if len(domain.attributes)>=2:
            bestStat = None
            for attr in domain.attributes:
                newdomain = Orange.data.Domain(filter(lambda x: x!=attr, domain.attributes), classVar)
                res = Orange.evaluation.testing.test_with_indices([self.learner], (Orange.data.Table(newdomain, data), weight_id), indices)
                
                newStat = self.stat(res)[0]
                newStats = [self.stat(x)[0] for x in Orange.evaluation.scoring.split_by_iterations(res)] 
                print "-", newStat, newdomain
                ## If stat has increased (ie newStat is better than bestStat)
                if not bestStat or cmp(newStat, bestStat) == self.statsign:
                    if cmp(newStat, oldStat) == self.statsign:
                        bestStat, bestStats, bestAttr = newStat, newStats, attr
                    elif statc.wilcoxont(oldStats, newStats)[1] > self.remove_threshold:
                            bestStat, bestAttr, bestStats = newStat, newStats, attr
            if bestStat:
                domain = Orange.data.Domain(filter(lambda x: x!=bestAttr, domain.attributes), classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print "removed", bestAttr.name

        bestStat, bestAttr = oldStat, None
        for attr in data.domain.attributes:
            if not attr in domain.attributes:
                newdomain = Orange.data.Domain(domain.attributes + [attr], classVar)
                res = Orange.evaluation.testing.test_with_indices([self.learner], (Orange.data.Table(newdomain, data), weight_id), indices)
                
                newStat = self.stat(res)[0]
                newStats = [self.stat(x)[0] for x in Orange.evaluation.scoring.split_by_iterations(res)] 
                print "+", newStat, newdomain

                ## If stat has increased (ie newStat is better than bestStat)
                if cmp(newStat, bestStat) == self.statsign and statc.wilcoxont(oldStats, newStats)[1] < self.add_threshold:
                    bestStat, bestStats, bestAttr = newStat, newStats, attr
        if bestAttr:
            domain = Orange.data.Domain(domain.attributes + [bestAttr], classVar)
            oldStat, oldStats = bestStat, bestStats
            stop = False
            print "added", bestAttr.name

    return self.learner(Orange.data.Table(domain, data), weight_id)
Exemple #3
0
def WilcoxonRankTest(accLearner1, accLearner2):
    """
    Compare two learners with a Wilcoxon signed-rank test on paired scores.

    The inputs are two equally long lists holding the value pairs to be
    compared (element i of each list belongs to the same data set).

    The pairs are ranked by the absolute value of their difference (rank 1 is
    the smallest difference; tied differences share the average of their
    ranks). Ranks of pairs where learner 1 scored higher are summed into
    Rplus, ranks where learner 2 scored higher into Rminus, and the rank of
    a zero difference is split evenly between the two sums.

    The test statistic T reported in the info text is taken from
    statc.wilcoxont; the rank sums computed here only decide which learner
    is hypothesised to be the better one.

    Returns a tuple (best, info): best is 1 if Rplus < Rminus (learner 2),
    otherwise 0 (learner 1); info is the explanatory text, which is also
    printed together with the rank sums.

    Raises ValueError if the two lists differ in length.

    See critical values: http://www.euronet.nl/users/warnar/demostatistiek/tables/WILCOXONTABEL.htm
    http://web.anglia.ac.uk/numbers/biostatistics/wilcoxon/local_folder/critical_values.html
    """
    if len(accLearner1) != len(accLearner2):
        raise ValueError("accLearner1 and accLearner2 must have the same length")

    # Positive difference: learner 1 is the most accurate on that pair.
    diffs = [acc1 - acc2 for acc1, acc2 in zip(accLearner1, accLearner2)]
    N = len(diffs)

    # Pair indices ordered by |difference| (small difference -> small rank).
    order = sorted(range(N), key=lambda i: abs(diffs[i]))

    # Rank sums for the two learners - the greater the sum, the more
    # accurate the learner.
    Rplus = 0.0
    Rminus = 0.0
    start = 0
    while start < N:
        # Find the end of the run of pairs tied on |difference|.
        tied = abs(diffs[order[start]])
        stop = start
        while stop < N and abs(diffs[order[stop]]) == tied:
            stop += 1
        # 0-based positions start..stop-1 carry ranks start+1..stop; every
        # pair in the run gets the average of those ranks.
        rank = (start + 1 + stop) / 2.0
        for i in order[start:stop]:
            if diffs[i] > 0:
                Rplus += rank
            elif diffs[i] < 0:
                Rminus += rank
            else:
                Rplus += rank / 2.0
                Rminus += rank / 2.0
        start = stop

    # The test statistic itself is already available in Orange.
    T = statc.wilcoxont(accLearner1, accLearner2)[0]

    print("Rank sum of learner 1")
    print(Rplus)
    print("Rank sum of learner 2")
    print(Rminus)
    if Rplus < Rminus:
        print("The hypothesis is that learner 2 is the most accurate")
    else:
        print("The hypothesis is that learner 1 is the most accurate")

    info = "If the number of data sets (N) is equal to 16 (our regression suite):\n"
    info += "N " + str(N) + "\n"
    info += "If T < 35 there is a 10% chance that the hypothesis is not true\n"
    info += "If T < 29 there is a 5% chance that the hypothesis is not true\n"
    info += "T " + str(T) + "\n"

    info += "If the number of data sets (N) is equal to 17 (our classification suite):\n"
    info += "N " + str(N) + "\n"
    info += "If T < 41 there is a 10% chance that the hypothesis is not true\n"
    info += "If T < 34 there is a 5% chance that the hypothesis is not true\n"
    info += "T " + str(T) + "\n"
    # For N > 20 a normal approximation could be used instead of the tables:
    # z = (T - N*(N+1)/4) / sqrt(N*(N+1)*(2*N+1)/24)
    print(info)

    # Return the index of the best learner
    if Rplus < Rminus:
        return (1, info)
    else:
        return (0, info)
Exemple #4
0
    def __call__(self, examples, weightID=0, **kwds):
        import orngTest, orngStat, statc

        self.__dict__.update(kwds)

        if self.removeThreshold < self.addThreshold:
            raise "'removeThreshold' should be larger or equal to 'addThreshold'"

        classVar = examples.domain.classVar

        indices = orange.MakeRandomIndicesCV(examples,
                                             folds=getattr(self, "folds", 10))
        domain = orange.Domain([], classVar)

        res = orngTest.testWithIndices([self.learner],
                                       orange.ExampleTable(domain, examples),
                                       indices)

        oldStat = self.stat(res)[0]
        oldStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)]
        print ".", oldStat, domain
        stop = False
        while not stop:
            stop = True
            if len(domain.attributes) >= 2:
                bestStat = None
                for attr in domain.attributes:
                    newdomain = orange.Domain(
                        filter(lambda x: x != attr, domain.attributes),
                        classVar)
                    res = orngTest.testWithIndices(
                        [self.learner],
                        (orange.ExampleTable(newdomain, examples), weightID),
                        indices)

                    newStat = self.stat(res)[0]
                    newStats = [
                        self.stat(x)[0]
                        for x in orngStat.splitByIterations(res)
                    ]
                    print "-", newStat, newdomain
                    ## If stat has increased (ie newStat is better than bestStat)
                    if not bestStat or cmp(newStat, bestStat) == self.statsign:
                        if cmp(newStat, oldStat) == self.statsign:
                            bestStat, bestStats, bestAttr = newStat, newStats, attr
                        elif statc.wilcoxont(
                                oldStats, newStats)[1] > self.removeThreshold:
                            bestStat, bestAttr, bestStats = newStat, newStats, attr
                if bestStat:
                    domain = orange.Domain(
                        filter(lambda x: x != bestAttr, domain.attributes),
                        classVar)
                    oldStat, oldStats = bestStat, bestStats
                    stop = False
                    print "removed", bestAttr.name

            bestStat, bestAttr = oldStat, None
            for attr in examples.domain.attributes:
                if not attr in domain.attributes:
                    newdomain = orange.Domain(domain.attributes + [attr],
                                              classVar)
                    res = orngTest.testWithIndices(
                        [self.learner],
                        (orange.ExampleTable(newdomain, examples), weightID),
                        indices)

                    newStat = self.stat(res)[0]
                    newStats = [
                        self.stat(x)[0]
                        for x in orngStat.splitByIterations(res)
                    ]
                    print "+", newStat, newdomain

                    ## If stat has increased (ie newStat is better than bestStat)
                    if cmp(newStat,
                           bestStat) == self.statsign and statc.wilcoxont(
                               oldStats, newStats)[1] < self.addThreshold:
                        bestStat, bestStats, bestAttr = newStat, newStats, attr
            if bestAttr:
                domain = orange.Domain(domain.attributes + [bestAttr],
                                       classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print "added", bestAttr.name

        return self.learner(orange.ExampleTable(domain, examples), weightID)
def WilcoxonRankTest(accLearner1, accLearner2):
    """
    Compare two learners with a Wilcoxon signed-rank test on paired scores.

    The inputs are two equally long lists holding the value pairs to be
    compared (element i of each list belongs to the same data set).

    The pairs are ranked by the absolute value of their difference (rank 1 is
    the smallest difference; tied differences share the average of their
    ranks). Ranks of pairs where learner 1 scored higher are summed into
    Rplus, ranks where learner 2 scored higher into Rminus, and the rank of
    a zero difference is split evenly between the two sums.

    The test statistic T reported in the info text is taken from
    statc.wilcoxont; the rank sums computed here only decide which learner
    is hypothesised to be the better one.

    Returns a tuple (best, info): best is 1 if Rplus < Rminus (learner 2),
    otherwise 0 (learner 1); info is the explanatory text, which is also
    printed together with the rank sums.

    Raises ValueError if the two lists differ in length.

    See critical values: http://www.euronet.nl/users/warnar/demostatistiek/tables/WILCOXONTABEL.htm
    http://web.anglia.ac.uk/numbers/biostatistics/wilcoxon/local_folder/critical_values.html
    """
    if len(accLearner1) != len(accLearner2):
        raise ValueError("accLearner1 and accLearner2 must have the same length")

    # Positive difference: learner 1 is the most accurate on that pair.
    diffs = [acc1 - acc2 for acc1, acc2 in zip(accLearner1, accLearner2)]
    N = len(diffs)

    # Pair indices ordered by |difference| (small difference -> small rank).
    order = sorted(range(N), key=lambda i: abs(diffs[i]))

    # Rank sums for the two learners - the greater the sum, the more
    # accurate the learner.
    Rplus = 0.0
    Rminus = 0.0
    start = 0
    while start < N:
        # Find the end of the run of pairs tied on |difference|.
        tied = abs(diffs[order[start]])
        stop = start
        while stop < N and abs(diffs[order[stop]]) == tied:
            stop += 1
        # 0-based positions start..stop-1 carry ranks start+1..stop; every
        # pair in the run gets the average of those ranks.
        rank = (start + 1 + stop) / 2.0
        for i in order[start:stop]:
            if diffs[i] > 0:
                Rplus += rank
            elif diffs[i] < 0:
                Rminus += rank
            else:
                Rplus += rank / 2.0
                Rminus += rank / 2.0
        start = stop

    # The test statistic itself is already available in Orange.
    T = statc.wilcoxont(accLearner1, accLearner2)[0]

    print("Rank sum of learner 1")
    print(Rplus)
    print("Rank sum of learner 2")
    print(Rminus)
    if Rplus < Rminus:
        print("The hypothesis is that learner 2 is the most accurate")
    else:
        print("The hypothesis is that learner 1 is the most accurate")

    info = "If the number of data sets (N) is equal to 16 (our regression suite):\n"
    info += "N " + str(N) +"\n"
    info += "If T < 35 there is a 10% chance that the hypothesis is not true\n"
    info += "If T < 29 there is a 5% chance that the hypothesis is not true\n"
    info += "T " + str(T) + "\n"

    info += "If the number of data sets (N) is equal to 17 (our classification suite):\n"
    info += "N " + str(N) +"\n"
    info += "If T < 41 there is a 10% chance that the hypothesis is not true\n"
    info += "If T < 34 there is a 5% chance that the hypothesis is not true\n"
    info += "T " + str(T) + "\n"
    # For N > 20 a normal approximation could be used instead of the tables:
    # z = (T - N*(N+1)/4) / sqrt(N*(N+1)*(2*N+1)/24)
    print(info)

    # Return the index of the best learner
    if Rplus < Rminus:
        return (1, info)
    else:
        return (0, info)