Exemple #1
0
def learningCurveN(learners,
                   examples,
                   folds=10,
                   strat=orange.MakeRandomIndices.StratifiedIfPossible,
                   proportions=orange.frange(0.1),
                   pps=[],
                   **argkw):
    """Build a learning curve for *learners*: construct the CV and
    subsampling index generators here, then delegate to learningCurve().
    """
    # An explicit seed in the keyword args wins; otherwise fall back to a
    # caller-supplied random generator (or a default one).
    seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
    if not seed:
        randomGenerator = argkw.get("randomGenerator",
                                    orange.RandomGenerator())
    else:
        randomGenerator = orange.RandomGenerator(seed)

    # Pick the (possibly stratified) index-generator classes.
    if strat:
        cvClass, pickClass = (orange.MakeRandomIndicesCV,
                              orange.MakeRandomIndices2)
    else:
        cvClass, pickClass = (orange.RandomIndicesCV,
                              orange.RandomIndices2)
    cv = cvClass(folds=folds, stratified=strat,
                 randomGenerator=randomGenerator)
    pick = pickClass(stratified=strat, randomGenerator=randomGenerator)

    return learningCurve(learners, examples, cv, pick, proportions, pps,
                         **argkw)
Exemple #2
0
def cf(input_dict):
    """Classification filter: run 10-fold CV with the supplied learner and
    return the meta-ids of examples whose predicted class disagrees with
    their actual class (candidate noisy instances)."""
    table = orange.ExampleTable(input_dict['data'])
    addMetaID(table)
    foldCount = 10
    folds = orange.MakeRandomIndicesCV(table, folds=foldCount)
    noisy = []
    perFoldNoise = [0] * foldCount
    for fold in range(foldCount):
        # Train on everything outside the fold, test on the fold itself.
        model = input_dict['learner'](table.select(folds, fold, negate=1))
        for ex in table.select(folds, fold):
            if model(ex) != ex.getclass():
                noisy.append(int(ex["meta_id"].value))
                perFoldNoise[fold] += 1
    return noisy
Exemple #3
0
    def test_sample(self):
        """Exercise ExampleTable.sample with list indices and CV indices."""
        d = orange.ExampleTable("iris")

        # Select the first 10 rows by reference (1 marks a selected row).
        li = [1] * 10 + [0] * 140
        d1 = d.sample(li)
        self.assertEqual(len(d1), 10)
        for i in range(10):
            self.assertEqual(d1[i], d[i])
            self.assertEqual(d1[i].id, d[i].id)
        # References alias storage: writes through d are visible in d1.
        d[0, 0] = 42
        self.assertEqual(d1[0, 0], 42)

        # copy=True yields independent copies: fresh ids, no aliasing.
        d1 = d.sample(li, copy=True)
        self.assertEqual(len(d1), 10)
        self.assertEqual(d1[0], d[0])
        self.assertNotEqual(d1[0].id, d[0].id)
        d[0, 0] = 41
        self.assertEqual(d1[0, 0], 42)

        # With a fold vector, sample(li, 2) keeps rows labelled 2,
        # i.e. every fifth row starting at index 1.
        li = [1, 2, 3, 4, 5] * 30
        d1 = d.sample(li, 2)
        self.assertEqual(len(d1), 30)
        for i in range(30):
            self.assertEqual(d1[i].id, d[1 + 5 * i].id)

        # Stratified CV indices: each of the 10 folds holds 5 examples of
        # each of the three iris classes.
        ri = orange.MakeRandomIndicesCV(d)
        for fold in range(10):
            d1 = d.sample(ri, fold)
            self.assertEqual(orange.get_class_distribution(d1), [5, 5, 5])
Exemple #4
0
def CVByPairs(data, dimensions=None, method=None, **dic):
    """10-fold CV of a Pade-derived tree, scored with the pairwise direction
    accuracy measure; returns averaged (accuracy, ambiguity, unresolved).

    Extra keyword arguments are forwarded to pade().
    """
    import orngTree
    cv = orange.MakeRandomIndicesCV(data, 10)
    meter = orange.ExamplesDistanceConstructor_Euclidean(data)

    # Estimate the largest pairwise distance from 100 random pairs and use
    # it to scale the distance-weighting constant passed to the scorer.
    maxDist = 0
    for i in range(100):
        maxDist = max(maxDist, meter(data.randomexample(),
                                     data.randomexample()))
    weightK = 10.0 / maxDist

    acc = amb = unre = 0
    for fold in range(10):
        train = data.select(cv, fold, negate=1)
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train,
                                 dimensions,
                                 method,
                                 originalAsMeta=True,
                                 **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)

        # NOTE(review): scoring uses the full `data` table rather than the
        # held-out `test` fold, which is computed but never used — confirm
        # whether this is intentional.
        tacc, tamb, tunre = computeDirectionAccuracyForPairs(
            tree, data, meter, weightK, -1)
        acc += tacc
        amb += tamb
        unre += tunre

    return acc / 10, amb / 10, unre / 10
Exemple #5
0
def cfweka(input_dict, widget, name):
    """Classification filter backed by remote Weka web services.

    Runs k-fold CV over input_dict['data']: each training fold is converted
    to ARFF, shipped to the Weka service to build a classifier, and the
    held-out fold is classified remotely.  Indices of mispredicted examples
    are collected.  Progress is reported through *widget*.

    Returns {'inds': sorted noisy indices, 'name': display name}.
    """
    from services.webservice import WebService
    # Remote endpoints: evaluation (build/apply classifier) and utilities
    # (ARFF -> Weka instance conversion).
    wseval = WebService('http://vihar.ijs.si:8092/Evaluation?wsdl',
                        float(input_dict['timeout']))
    wsutil = WebService('http://vihar.ijs.si:8092/Utilities?wsdl',
                        float(input_dict['timeout']))
    somelearner = input_dict['learner']
    print somelearner
    data = input_dict['data']
    #    arffstr = toARFFstring(data).getvalue()
    #    #print arffstr
    #    wekaInstances = wsutil.client.arff_to_weka_instances(arff = arffstr, class_index = odt.domain.index(odt.domain.classVar))
    #    #print wekaInstances
    #    model = wseval.client.build_classifier(learner = somelearner, instances = wekaInstances['instances'])
    #    #return {}

    #    addMetaID(data)
    k = int(input_dict['k_folds'])
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    count_noisy = [0] * k
    for test_fold in range(k):
        # Convert the training fold to ARFF and upload it.
        train_arffstr = toARFFstring(
            data.select(selection, test_fold, negate=1)).getvalue()
        train_data = wsutil.client.arff_to_weka_instances(
            arff=train_arffstr,
            class_index=data.domain.index(data.domain.classVar))['instances']

        # Original-table positions of the examples in this test fold.
        test_inds = [
            i for i in range(len(selection)) if selection[i] == test_fold
        ]
        test_arffstr = toARFFstring(data.select(selection,
                                                test_fold)).getvalue()
        test_data = wsutil.client.arff_to_weka_instances(
            arff=test_arffstr,
            class_index=data.domain.index(data.domain.classVar))['instances']
        #print "\t\t", "Learned on", len(train_data), "examples"
        #file.flush()
        print "pred cl build"
        classifier = wseval.client.build_classifier(
            learner=somelearner, instances=train_data)['classifier']
        print "po cl build"
        eval_test_data = wseval.client.apply_classifier(classifier=classifier,
                                                        instances=test_data)
        print "po eval"
        # Compare each remote prediction with the local gold class.
        for i in range(len(eval_test_data)):
            #print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
            print i, "v for zanki", eval_test_data[i]['classes'], data[
                test_inds[i]].getclass()
            if eval_test_data[i]['classes'] != unicode(
                    data[test_inds[i]].getclass()):
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(test_inds[i])
                count_noisy[test_fold] += 1
        # END test_data
        widget.progress = int((test_fold + 1) * 1.0 / k * 100)
        widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': getWekaName(name)}
def cf_run(learner, data, k_folds, name, widget=None):
    """Runs a classification filter: k-fold CV where each fold is converted
    to scikit format, the learner is fit on the training part, and the
    indices of mispredicted test examples are collected.

    :param learner: WekaClassifier-style wrapper exposing
        build_classifier/apply_classifier
    :param data: Orange dataset
    :param k_folds: number of CV folds
    :param name: display name, mapped through get_weka_name for the result
    :param widget: optional progress widget (updated per fold)
    :return: {'inds': sorted indices of misclassified examples,
              'name': display name}
    """

    somelearner = learner
    print somelearner

    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k_folds)
    count_noisy = [0] * k_folds
    for test_fold in range(k_folds):
        # train_data = wsutil.client.arff_to_weka_instances(arff = train_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
        train_data = convert_dataset_from_orange_to_scikit(
            data.select(selection, test_fold, negate=1))

        # Original-table positions of the examples in this test fold.
        test_inds = [
            i for i in range(len(selection)) if selection[i] == test_fold
        ]
        # test_data = wsutil.client.arff_to_weka_instances(arff = test_arffstr, class_index = data.domain.index(data.domain.classVar))['instances']
        test_data = convert_dataset_from_orange_to_scikit(
            data.select(selection, test_fold))

        #print "\t\t", "Learned on", len(train_data), "examples"
        #file.flush()

        print "before cl build"
        # classifier = wseval.client.build_classifier(learner = somelearner, instances = train_data)['classifier']
        learner.build_classifier(train_data)
        print "after cl build"

        # eval_test_data = wseval.client.apply_classifier(classifier = classifier, instances = test_data)
        scikit_dataset_predicted = learner.apply_classifier(test_data)

        print "after apply"

        # An example is "noisy" when the prediction disagrees with its label.
        for i in range(len(scikit_dataset_predicted.target)):
            #print "Test data length:", len(test_data), "Test inds length:", len(test_inds), "Eval Test data length:", len(eval_test_data)
            # print i, "v for zanki", eval_test_data[i]['classes'], data[test_inds[i]].getclass()
            # if eval_test_data[i]['classes'] != unicode(data[test_inds[i]].getclass()):

            if scikit_dataset_predicted.target[
                    i] != scikit_dataset_predicted.targetPredicted[i]:
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(test_inds[i])
                count_noisy[test_fold] += 1
        # END test_data
        if not (widget is None):
            widget.progress = int((test_fold + 1) * 1.0 / k_folds * 100)
            widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': get_weka_name(name)}
Exemple #7
0
    def test_MakeRandomIndicesCV(self):
        """Check fold sizes and stratification of MakeRandomIndicesCV."""
        d = orange.ExampleTable("iris")

        # 100 items over the default 10 folds: exactly 10 per fold.
        inds = orange.MakeRandomIndicesCV(100)
        for j in range(10):
            self.assertEqual(len([i for i in inds if i == j]), 10)

        # 103 items over 10 folds: the first three folds absorb the
        # remainder and get 11 items each.
        inds = orange.MakeRandomIndicesCV(103)
        for j in range(3):
            self.assertEqual(len([i for i in inds if i == j]), 11)

        # Leave-one-out: exactly one item lands in fold 0.
        inds = orange.MakeRandomIndicesCV(100, folds=100)
        self.assertEqual(len([i for i in inds if not i]), 1)

        # Check that five of each iris types get into each fold
        mr = orange.MakeRandomIndicesCV()
        inds = mr(d)
        for j in range(10):
            self.assertEqual(len([i for i in inds if i == j]), 15)
            sel = [d[i].getclass() for i, fold in enumerate(inds) if fold == j]
            # NOTE(review): only classes 0 and 1 are checked here although
            # iris has three classes — presumably the third is implied by
            # the fold size of 15; confirm.
            for k in range(2):
                self.assertEqual(len([i for i in sel if i == k]), 5)
Exemple #8
0
def crossValidation(learners,
                    examples,
                    folds=10,
                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                    pps=[],
                    indicesrandseed="*",
                    **argkw):
    """Cross-validation evaluation of *learners*.

    Builds fold indices (seeded when indicesrandseed is given, otherwise
    driven by a random generator from the keyword args) and delegates to
    testWithIndices().
    """
    examples, weight = demangleExamples(examples)
    if indicesrandseed == "*":
        # No explicit seed: use a caller-supplied generator if present.
        rgen = argkw.get("randseed", 0) or argkw.get("randomGenerator", 0)
        indices = orange.MakeRandomIndicesCV(examples,
                                             folds,
                                             stratified=strat,
                                             randomGenerator=rgen)
    else:
        indices = orange.MakeRandomIndicesCV(examples,
                                             folds,
                                             randseed=indicesrandseed,
                                             stratified=strat)
    return testWithIndices(learners, (examples, weight), indices,
                           indicesrandseed, pps, **argkw)
Exemple #9
0
def cross_validation(data, learners, k=10):
    """Return each learner's AROC averaged over k folds of CV."""
    folds = orange.MakeRandomIndicesCV(data, folds=k)
    totals = [0.0] * len(learners)
    for fold in range(k):
        train = data.select(folds, fold, negate=1)
        test = data.select(folds, fold)
        models = [build(train) for build in learners]
        # aroc returns one score per classifier; accumulate per learner.
        for idx, score in enumerate(aroc(test, models)):
            totals[idx] += score
    return [total / k for total in totals]
def cross_validation(data, learners, k=10):
    """Average each learner's classification accuracy over k CV folds.

    NOTE(review): this redefines the cross_validation above (which averaged
    AROC); only this later definition survives at import time.
    """
    acc = [0.0] * len(learners)
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        # Per-fold accuracy, one value per learner.
        acc1 = accuracy(test_data, classifiers)
        print "%d: %s" % (test_fold + 1, ["%.6f" % a for a in acc1])
        for j in range(len(learners)):
            acc[j] += acc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / k
    return acc
Exemple #11
0
def CVByNodes(data, dimensions=None, method=None, **dic):
    """10-fold CV of a Pade-derived tree; returns the averaged
    (ambiguity, accuracy) pair from computeAmbiguityAccuracy.

    Extra keyword arguments are forwarded to pade().
    """
    import orngTree
    cv = orange.MakeRandomIndicesCV(data, 10)
    # BUG FIX: `amb` and `acc` were never initialized, so the first
    # `amb += mb` below raised NameError.  Start both accumulators at zero,
    # matching the pattern used by the sibling CVByPairs.
    amb = acc = 0.0
    for fold in range(10):
        train = data.select(cv, fold, negate=1)
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train,
                                 dimensions,
                                 method,
                                 originalAsMeta=True,
                                 **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)

        mb, cc = computeAmbiguityAccuracy(tree, test, -1)
        amb += mb
        acc += cc
    return amb / 10, acc / 10
def cf_run_harf(learner, data_orange, k_folds, widget=None):
    """Classification filter for HARF learner: k-fold CV collecting the
    meta-ids of examples the HARF classifier mispredicts.

    :param learner: HARF learner, possibly wrapped in an UnpicklableObject
        (in which case .generate() reconstructs it)
    :param data_orange: Orange dataset
    :param k_folds: number of CV folds
    :param widget: optional progress widget (updated per fold)
    :return: {'inds': sorted meta-ids of misclassified examples,
              'name': learner display name}
    """

    somelearner = learner
    print "Before generate"
    # Unwrap a pickled-around-the-web learner if necessary.
    learner = somelearner if not isinstance(
        somelearner, UnpicklableObject) else somelearner.generate()
    print "After generate"
    # data_orange = input_dict['data_orange']
    print len(data_orange)
    add_meta_id(data_orange)
    print 'Before for loop'
    k = k_folds
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data_orange, folds=k)
    count_noisy = [0] * k
    print 'Before for loop'
    for test_fold in range(k):
        train_data = data_orange.select(selection, test_fold, negate=1)
        test_data = data_orange.select(selection, test_fold)
        #print "\t\t", "Learned on", len(train_data), "examples"
        #file.flush()
        print 'Before classifier construction'
        #print learner.hovername if learner.hovername != None else "ni hovernamea"
        classifier = learner(train_data)
        print 'After classifier construction'
        for example in test_data:
            # HARF may abstain (return None); only disagreements count.
            exclassified = classifier(example)
            if exclassified != None and exclassified != example.getclass():
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(int(example["meta_id"].value))
                count_noisy[test_fold] += 1
        # END test_data
        if not (widget is None):
            widget.progress = int((test_fold + 1) * 1.0 / k * 100)
            widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': learner.name}
Exemple #13
0
    def __call__(self, table, weight=None, verbose=0):
        """Tune a single parameter (self.parameter) over self.values by
        cross-validated evaluation; set the winning value on the wrapped
        learner and return — depending on self.returnWhat — None, the best
        value, the tuned learner, or a classifier trained on the full table.
        """
        import orngTest, orngStat, orngMisc

        verbose = verbose or getattr(self, "verbose", 0)
        evaluate = getattr(self, "evaluate", orngStat.CA)
        folds = getattr(self, "folds", 5)
        compare = getattr(self, "compare", lambda x, y: (x > y) - (x < y))
        returnWhat = getattr(self, "returnWhat",
                             Tune1Parameter.returnClassifier)

        # self.parameter may name one attribute or several that all receive
        # the same candidate value; findobj resolves each to (obj, attrname).
        if (type(self.parameter) == list) or (type(self.parameter) == tuple):
            to_set = [self.findobj(ld) for ld in self.parameter]
        else:
            to_set = [self.findobj(self.parameter)]

        cvind = orange.MakeRandomIndicesCV(table, folds)
        findBest = orngMisc.BestOnTheFly(seed=table.checksum(),
                                         callCompareOn1st=True)
        tableAndWeight = weight and (table, weight) or table
        for par in self.values:
            # Apply the candidate value, evaluate with CV, record the score.
            for i in to_set:
                setattr(i[0], i[1], par)
            res = evaluate(
                orngTest.testWithIndices([self.object], tableAndWeight, cvind))
            findBest.candidate((res, par))
            if verbose == 2:
                print('*** orngWrap  %s: %s:' % (par, res))

        # Re-apply the winning value before returning.
        bestpar = findBest.winner()[1]
        for i in to_set:
            setattr(i[0], i[1], bestpar)

        if verbose:
            print("*** Optimal parameter: %s = %s" % (self.parameter, bestpar))

        if returnWhat == Tune1Parameter.returnNone:
            return None
        elif returnWhat == Tune1Parameter.returnParameters:
            return bestpar
        elif returnWhat == Tune1Parameter.returnLearner:
            return self.object
        else:
            classifier = self.object(table)
            classifier.setattr("fittedParameter", bestpar)
            return classifier
Exemple #14
0
def crossValidateWithSeparateTrainingAndTesting(training,
                                                testing,
                                                learner,
                                                folds=10):
    """Cross-validate with parallel tables: training folds come from
    *training* while each held-out fold is drawn from *testing* at the same
    positions.  Accumulates and returns a single confusion matrix.
    """
    assert len(training) == len(testing)
    indices = orange.MakeRandomIndicesCV(training, folds=folds)
    total = orngStat.ConfusionMatrix()
    for fold in range(folds):
        results = orngTest.learnAndTestOnTestData(
            [learner],
            training.select(indices, fold, negate=1),
            testing.select(indices, fold))
        foldCm = orngStat.confusionMatrices(results, classIndex=0)[0]
        total.TP += foldCm.TP
        total.FP += foldCm.FP
        total.FN += foldCm.FN
        total.TN += foldCm.TN
    return total
Exemple #15
0
File: cv.py — Project: sloria/usv
def testClassAccuracies(data, learner, k=5):
    """Print per-class accuracy of *learner*, averaged over k CV folds."""
    classes = data.domain.classVar.values
    classAccuracies = [0.0] * len(classes)
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    for testFold in range(k):
        trainData = data.select(selection, testFold, negate=1)
        testData = data.select(selection, testFold)
        hits = [0.] * len(classes)
        totals = [0.] * len(classes)
        classifier = learner(trainData)
        for ex in testData:
            totals[int(ex.getclass())] += 1
            if (classifier(ex) == ex.getclass()):
                hits[int(ex.getclass())] += 1
        # NOTE(review): raises ZeroDivisionError if a class is absent from a
        # test fold; MakeRandomIndicesCV stratifies when possible, which
        # usually prevents this — confirm for small or skewed datasets.
        for i in range(len(classes)):
            classAccuracies[i] += hits[i] / totals[i]

    for i in range(len(classes)):
        print "%s: %.4f" % (classes[i], classAccuracies[i] / k)
Exemple #16
0
def cv_split(context, folds=10, random_seed=None):
    '''
    Returns a list of pairs (train_context, test_context), one for each cross-validation fold.

    The split is stratified.

        :param context: DBContext to be split
        :param folds: number of folds
        :param random_seed: random seed to be used; None draws a fresh one

        :return: returns a list of (train_context, test_context) pairs
        :rtype: list

        :Example:

        >>> for train_context, test_context in cv_split(context, folds=10, random_seed=0):
        >>>     pass  # Your CV loop
    '''
    import orange
    # BUG FIX: the old `if not random_seed` test treated 0 as "unset", so
    # the documented cv_split(..., random_seed=0) call silently used a
    # random seed instead of a reproducible one.  Only None means "no seed".
    if random_seed is None:
        random_seed = random.randint(0, 10**6)
    input_list = context.orng_tables.get(context.target_table, None)
    indices = orange.MakeRandomIndicesCV(
        input_list,
        randseed=random_seed,
        folds=folds,
        stratified=orange.MakeRandomIndices.Stratified)

    fold_contexts = []
    for i in range(folds):
        # Train on everything outside fold i, test on fold i; both copies
        # keep the original table name and replace the target table in a
        # copied context.
        train = input_list.select(indices, i, negate=1)
        test = input_list.select(indices, i)
        train.name = input_list.name
        test.name = input_list.name
        train_context = context.copy()
        train_context.orng_tables[context.target_table] = train
        test_context = context.copy()
        test_context.orng_tables[context.target_table] = test
        fold_contexts.append((train_context, test_context))

    return fold_contexts
Exemple #17
0
def cforange(input_dict, widget):
    """Classification filter widget entry point: k-fold CV over
    input_dict['data'], collecting meta-ids of examples the learner
    mispredicts; progress is reported through *widget*.

    Returns {'inds': sorted noisy meta-ids, 'name': learner name}.
    """
    from workflows.helpers import UnpicklableObject
    somelearner = input_dict['learner']
    print "Before generate"
    # Unwrap a workflow-serialized learner if necessary.
    learner = somelearner if not isinstance(
        somelearner, UnpicklableObject) else somelearner.generate()
    print "After generate"
    data = input_dict['data']
    print len(data)
    addMetaID(data)
    print 'Before for loop'
    k = int(input_dict['k_folds'])
    noisyIndices = []
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    count_noisy = [0] * k
    print 'Before for loop'
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
        #print "\t\t", "Learned on", len(train_data), "examples"
        #file.flush()
        print 'Before classifier construction'
        #print learner.hovername if learner.hovername != None else "ni hovernamea"
        classifier = learner(train_data)
        print 'After classifier construction'
        for example in test_data:
            # The classifier may abstain (return None); count only
            # actual disagreements with the gold class.
            exclassified = classifier(example)
            if exclassified != None and exclassified != example.getclass():
                # selection_filter[int(example[meta_id].value)] = 0
                noisyIndices.append(int(example["meta_id"].value))
                count_noisy[test_fold] += 1
        # END test_data
        widget.progress = int((test_fold + 1) * 1.0 / k * 100)
        widget.save()
    # END test_fold
    return {'inds': sorted(noisyIndices), 'name': learner.name}
# Description: Constructs indices for cross-validation
# Category:    sampling
# Classes:     MakeRandomIndices, MakeRandomIndicesCV
# Uses:        lenses
# Referenced:  RandomIndices.htm

import orange

# Load the demo table and show the two calling conventions of
# MakeRandomIndicesCV.
data = orange.ExampleTable("lenses")

# Called with a table: returns one fold index per example (default 10 folds).
print orange.MakeRandomIndicesCV(data)

# Called with a plain count: distributes 10 indices over 5 folds.
print orange.MakeRandomIndicesCV(10, folds=5)
Exemple #19
0
def learningCurve(learners,
                  examples,
                  cv=None,
                  pick=None,
                  proportions=orange.frange(0.1),
                  pps=[],
                  **argkw):
    """Evaluate *learners* at increasing training-set proportions.

    For each proportion p, runs cross-validation where every training fold
    is first subsampled down to fraction p (via *pick*); returns a list with
    one ExperimentResults per proportion.  Results may be cached to files
    when fixed random seeds make the run reproducible.
    """
    verb = argkw.get("verbose", 0)
    cache = argkw.get("cache", 0)
    callback = argkw.get("callback", 0)

    # Only learn-set ("L") preprocessors are allowed: test examples must
    # never be preprocessed.
    for pp in pps:
        if pp[0] != "L":
            raise SystemError("cannot preprocess testing examples")

    # Build default index generators if the caller did not supply them.
    if not cv or not pick:
        seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
        if seed:
            randomGenerator = orange.RandomGenerator(seed)
        else:
            randomGenerator = argkw.get("randomGenerator",
                                        orange.RandomGenerator())
        if not cv:
            cv = orange.MakeRandomIndicesCV(
                folds=10,
                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                randomGenerator=randomGenerator)
        if not pick:
            pick = orange.MakeRandomIndices2(
                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                randomGenerator=randomGenerator)

    examples, weight = demangleExamples(examples)
    folds = cv(examples)
    ccsum = hex(examples.checksum())[2:]
    ppsp = encodePP(pps)
    nLrn = len(learners)

    allResults = []
    for p in proportions:
        printVerbose("Proportion: %5.3f" % p, verb)

        # Caching is only sound with fixed (non-negative) random seeds and a
        # cache filename that contains no wildcards.
        if (cv.randseed < 0) or (pick.randseed < 0):
            cache = 0
        else:
            fnstr = "{learningCurve}_%s_%s_%s_%s%s-%s" % (
                "%s", p, cv.randseed, pick.randseed, ppsp, ccsum)
            if "*" in fnstr:
                cache = 0

        # Class values convert through int for discrete class, float else.
        conv = examples.domain.classVar.varType == orange.VarTypes.Discrete and int or float
        testResults = ExperimentResults(
            cv.folds, [l.name for l in learners],
            examples.domain.classVar.values.native(), weight != 0,
            examples.domain.classVar.baseValue)
        testResults.results = [
            TestedExample(folds[i], conv(examples[i].getclass()), nLrn,
                          examples[i].getweight(weight))
            for i in range(len(examples))
        ]

        if cache and testResults.loadFromFiles(learners, fnstr):
            printVerbose("  loaded from cache", verb)
        else:
            for fold in range(cv.folds):
                printVerbose("  fold %d" % fold, verb)

                # learning: take everything outside the fold, then subsample
                # it down to proportion p.
                learnset = examples.selectref(folds, fold, negate=1)
                learnset = learnset.selectref(pick(learnset, p0=p), 0)
                if not len(learnset):
                    continue

                for pp in pps:
                    learnset = pp[1](learnset)

                classifiers = [None] * nLrn
                for i in range(nLrn):
                    if not cache or not testResults.loaded[i]:
                        classifiers[i] = learners[i](learnset, weight)

                # testing: classify every example that fell into this fold.
                for i in range(len(examples)):
                    if (folds[i] == fold):
                        # This is to prevent cheating: hide the true class
                        # before handing the example to the classifiers.
                        ex = orange.Example(examples[i])
                        ex.setclass("?")
                        for cl in range(nLrn):
                            if not cache or not testResults.loaded[cl]:
                                cls, pro = classifiers[cl](ex, orange.GetBoth)
                                testResults.results[i].setResult(cl, cls, pro)
                if callback: callback()
            if cache:
                testResults.saveToFiles(learners, fnstr)

        allResults.append(testResults)

    return allResults
Exemple #20
0
    def __call__(self, examples, weightID=0, **kwds):
        import orngTest, orngStat, statc

        self.__dict__.update(kwds)

        if self.removeThreshold < self.addThreshold:
            raise "'removeThreshold' should be larger or equal to 'addThreshold'"

        classVar = examples.domain.classVar

        indices = orange.MakeRandomIndicesCV(examples,
                                             folds=getattr(self, "folds", 10))
        domain = orange.Domain([], classVar)

        res = orngTest.testWithIndices([self.learner],
                                       orange.ExampleTable(domain, examples),
                                       indices)

        oldStat = self.stat(res)[0]
        oldStats = [self.stat(x)[0] for x in orngStat.splitByIterations(res)]
        print ".", oldStat, domain
        stop = False
        while not stop:
            stop = True
            if len(domain.attributes) >= 2:
                bestStat = None
                for attr in domain.attributes:
                    newdomain = orange.Domain(
                        filter(lambda x: x != attr, domain.attributes),
                        classVar)
                    res = orngTest.testWithIndices(
                        [self.learner],
                        (orange.ExampleTable(newdomain, examples), weightID),
                        indices)

                    newStat = self.stat(res)[0]
                    newStats = [
                        self.stat(x)[0]
                        for x in orngStat.splitByIterations(res)
                    ]
                    print "-", newStat, newdomain
                    ## If stat has increased (ie newStat is better than bestStat)
                    if not bestStat or cmp(newStat, bestStat) == self.statsign:
                        if cmp(newStat, oldStat) == self.statsign:
                            bestStat, bestStats, bestAttr = newStat, newStats, attr
                        elif statc.wilcoxont(
                                oldStats, newStats)[1] > self.removeThreshold:
                            bestStat, bestAttr, bestStats = newStat, newStats, attr
                if bestStat:
                    domain = orange.Domain(
                        filter(lambda x: x != bestAttr, domain.attributes),
                        classVar)
                    oldStat, oldStats = bestStat, bestStats
                    stop = False
                    print "removed", bestAttr.name

            bestStat, bestAttr = oldStat, None
            for attr in examples.domain.attributes:
                if not attr in domain.attributes:
                    newdomain = orange.Domain(domain.attributes + [attr],
                                              classVar)
                    res = orngTest.testWithIndices(
                        [self.learner],
                        (orange.ExampleTable(newdomain, examples), weightID),
                        indices)

                    newStat = self.stat(res)[0]
                    newStats = [
                        self.stat(x)[0]
                        for x in orngStat.splitByIterations(res)
                    ]
                    print "+", newStat, newdomain

                    ## If stat has increased (ie newStat is better than bestStat)
                    if cmp(newStat,
                           bestStat) == self.statsign and statc.wilcoxont(
                               oldStats, newStats)[1] < self.addThreshold:
                        bestStat, bestStats, bestAttr = newStat, newStats, attr
            if bestAttr:
                domain = orange.Domain(domain.attributes + [bestAttr],
                                       classVar)
                oldStat, oldStats = bestStat, bestStats
                stop = False
                print "added", bestAttr.name

        return self.learner(orange.ExampleTable(domain, examples), weightID)
Exemple #21
0
    def __call__(self, table, weight=None, verbose=0):
        """Grid-search tuning over self.parameters (a list of
        (attribute(s), candidate-values) pairs): evaluate every value
        combination with cross-validation, apply the best combination to the
        wrapped learner, and return — per self.returnWhat — None, the best
        values, the tuned learner, or a classifier trained on the table.
        """
        import orngTest, orngStat, orngMisc

        evaluate = getattr(self, "evaluate", orngStat.CA)
        folds = getattr(self, "folds", 5)
        compare = getattr(self, "compare", cmp)
        verbose = verbose or getattr(self, "verbose", 0)
        returnWhat = getattr(self, "returnWhat",
                             Tune1Parameter.returnClassifier)
        progressCallback = getattr(self, "progressCallback", lambda i: None)

        # Resolve each tuned parameter to its (object, attrname) targets;
        # a list/tuple of names means several attributes share one value.
        to_set = []
        parnames = []
        for par in self.parameters:
            if (type(par[0]) == list) or (type(par[0]) == tuple):
                to_set.append([self.findobj(ld) for ld in par[0]])
                parnames.append(par[0])
            else:
                to_set.append([self.findobj(par[0])])
                parnames.append([par[0]])

        cvind = orange.MakeRandomIndicesCV(table, folds)
        findBest = orngMisc.BestOnTheFly(seed=table.checksum(),
                                         callCompareOn1st=True)
        tableAndWeight = weight and (table, weight) or table
        # Iterate the Cartesian product of all candidate value lists,
        # reporting progress at ~1% granularity.
        numOfTests = sum([len(x[1]) for x in self.parameters])
        milestones = set(range(0, numOfTests, max(numOfTests / 100, 1)))
        for itercount, valueindices in enumerate(
                orngMisc.LimitedCounter([len(x[1]) for x in self.parameters])):
            values = [
                self.parameters[i][1][x] for i, x in enumerate(valueindices)
            ]
            for pi, value in enumerate(values):
                for i, par in enumerate(to_set[pi]):
                    setattr(par[0], par[1], value)
                    if verbose == 2:
                        print("%s: %s" % (parnames[pi][i], value))

            res = evaluate(
                orngTest.testWithIndices([self.object], tableAndWeight, cvind))
            if itercount in milestones:
                progressCallback(100.0 * itercount / numOfTests)

            findBest.candidate((res, values))
            if verbose == 2:
                print("===> Result: %s\n" % res)

        # Re-apply the winning value combination before returning.
        bestpar = findBest.winner()[1]
        if verbose:
            print("*** Optimal set of parameters: ", end=' ')
        for pi, value in enumerate(bestpar):
            for i, par in enumerate(to_set[pi]):
                setattr(par[0], par[1], value)
                if verbose:
                    print("%s: %s" % (parnames[pi][i], value), end=' ')
        if verbose:
            print()

        if returnWhat == Tune1Parameter.returnNone:
            return None
        elif returnWhat == Tune1Parameter.returnParameters:
            return bestpar
        elif returnWhat == Tune1Parameter.returnLearner:
            return self.object
        else:
            classifier = self.object(table)
            classifier.fittedParameters = bestpar
            return classifier
Exemple #22
0
    # --- Fragment: feature selection + cross-validated evaluation --------
    # NOTE(review): this looks like the interior of a script's main body;
    # its enclosing definition is not visible in this excerpt.
    # Alternative loading path kept for reference (would drop the first
    # four attributes before building the table) — TODO: delete if unused.
    #whole_table = proj_utils.load_data(sys.argv[1])
    #start_domain = Orange.data.Domain(whole_table.domain.attributes[4:])
    #start_data = Orange.data.Table(start_domain, whole_table)
    start_data = proj_utils.load_data(sys.argv[1])

    cv_folds = int(sys.argv[2])  # number of cross-validation folds
    features = int(sys.argv[3])  # number of top-scored features to keep

    # Rank every attribute with Orange's default scoring algorithm, then
    # keep only the `features` best-scoring ones.
    scores = Orange.feature.scoring.score_all(start_data)
    data = Orange.feature.selection.select(start_data, scores, features)

    # NOTE(review): this split is shadowed by the per-fold selects in the
    # loop below — looks like dead code; confirm before removing.
    train_data, test_data = proj_utils.partition_data(data)

    # Fold-membership indices for the manual cross-validation loop below.
    selection = orange.MakeRandomIndicesCV(data, cv_folds)

    # Accumulators for sensitivity / specificity / accuracy of two models.
    sen1 = 0.0
    spe1 = 0.0
    acc1 = 0.0
    sen2 = 0.0
    spe2 = 0.0
    acc2 = 0.0

    # NOTE(review): `learners` is not defined in this fragment, and the
    # fold count here is hard-coded to 5 rather than `cv_folds` — confirm.
    results = orngTest.crossValidation(learners, data, cv_folds=5)
    """
    based on http://orange.biolab.si/doc/ofb/c_performance.htm
    """
    for test_fold in range(cv_folds):
        # Train on every fold except `test_fold`; test on `test_fold`.
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)
Exemple #23
0
    def __call__(self, examples, weight=0):
        """Train a margin meta-classifier on `examples`.

        The base learner (`self.learner`) is cross-validated
        `self.replications` times; the margins it produces on held-out
        folds, paired with the true classes, form a secondary data set from
        which `self.metalearner` estimates class probabilities as a
        function of the margin.

        Returns a MarginMetaClassifier, or a plain wrapped/base classifier
        when the margin assumptions do not hold or no margins could be
        collected.
        """
        # The margin construction assumes a binary discrete class
        # (varType == 1 is orange's discrete type; exactly two values).
        if not (examples.domain.classVar.varType == 1
                and len(examples.domain.classVar.values) == 2):
            # failing the assumptions of margin-metalearner...
            return MarginMetaClassifierWrap(self.learner(examples))

        # Secondary data set: (margin, true class) pairs gathered from CV.
        mv = orange.FloatVariable(name="margin")
        estdomain = orange.Domain([mv, examples.domain.classVar])
        mistakes = orange.ExampleTable(estdomain)
        if weight != 0:
            mistakes.addMetaAttribute(1)

        for replication in range(self.replications):
            # perform `self.folds`-fold CV, and collect margins from the
            # held-out examples into `mistakes`
            try:
                # orange 2.2 API
                selection = orange.MakeRandomIndicesCV(
                    examples, self.folds, stratified=0,
                    randomGenerator=orange.globalRandom)
            except Exception:
                # orange 2.1 fallback (older class name); narrowed from a
                # bare `except:` so KeyboardInterrupt/SystemExit propagate
                selection = orange.RandomIndicesCVGen(examples, self.folds)
            for fold in range(self.folds):
                if self.folds != 1:
                    learn_data = examples.selectref(selection, fold, negate=1)
                    test_data = examples.selectref(selection, fold)
                else:
                    # degenerate single-"fold" case: no split at all
                    learn_data = examples
                    test_data = examples

                # fulldata removes the influence of scaling on the distance
                # dispersion.
                if weight != 0:
                    if self.fulldata:
                        classifier = self.learner(learn_data, weight=weight,
                                                  fulldata=examples)
                    else:
                        classifier = self.learner(learn_data, weight=weight)
                else:
                    if self.fulldata:
                        classifier = self.learner(learn_data,
                                                  fulldata=examples)
                    else:
                        classifier = self.learner(learn_data)

                # normalize the margin range using the spread observed on
                # the training fold
                if self.normalization:
                    mi = 1e100
                    ma = -1e100
                    for ex in learn_data:
                        margin = classifier.getmargin(ex)
                        mi = min(mi, margin)
                        ma = max(ma, margin)
                    # epsilon guards against division by a zero spread
                    coeff = 1.0 / max(ma - mi, 1e-16)
                else:
                    coeff = 1.0
                for ex in test_data:
                    margin = coeff * classifier.getmargin(ex)
                    # type() equality (not isinstance) is deliberate here:
                    # it excludes non-numeric margin results
                    if type(margin) == type(1.0) or type(margin) == type(1):
                        # ignore those examples which are handled with
                        # the actual probability distribution
                        mistake = orange.Example(
                            estdomain, [float(margin), ex.getclass()])
                        if weight != 0:
                            mistake.setmeta(ex.getMetaAttribute(weight), 1)
                        mistakes.append(mistake)

        if len(mistakes) < 1:
            # nothing to learn from: fall back to the plain base learner
            if weight == 0:
                return self.learner(examples)
            else:
                return self.learner(examples, weight)

        if weight != 0:
            # learn a classifier to estimate the probabilities from margins
            # and a final classifier for the whole training set
            estimate = self.metalearner(mistakes, weight=1)
            classifier = self.learner(examples, weight)
        else:
            estimate = self.metalearner(mistakes)
            classifier = self.learner(examples)

        # normalize the final classifier's margin range over all examples
        if self.normalization:
            mi = 1e100
            ma = -1e100
            for ex in examples:
                margin = classifier.getmargin(ex)
                mi = min(mi, margin)
                ma = max(ma, margin)
            coeff = 1.0 / max(ma - mi, 1e-16)
        else:
            coeff = 1.0

        return MarginMetaClassifier(classifier, estimate, examples.domain,
                                    estdomain, coeff)
Exemple #24
0
# Description: Adds two new numerical attributes to iris data set, and tests through cross validation if this helps in boosting classification accuracy
# Category:    modelling
# Uses:        iris
# Classes:     Domain, FloatVariable, MakeRandomIndicesCV, orngTest.testWithIndices
# Referenced:  domain.htm

import orange, orngTest, orngStat, orngTree

# Load the iris data set bundled with orange.
data = orange.ExampleTable('iris')

# Two derived attributes, computed on the fly from existing ones via
# getValueFrom: sepal/petal area = length * width.
sa = orange.FloatVariable("sepal area")
sa.getValueFrom = lambda e, getWhat: e['sepal length'] * e['sepal width']
pa = orange.FloatVariable("petal area")
pa.getValueFrom = lambda e, getWhat: e['petal length'] * e['petal width']

# New domain = all original attributes + the two area attributes + class.
newdomain = orange.Domain(data.domain.attributes +
                          [sa, pa, data.domain.classVar])
newdata = data.select(newdomain)

learners = [orngTree.TreeLearner(mForPruning=2.0)]

# Reuse the same 10-fold CV indices for both tables so the comparison of
# classification accuracies is paired (same splits).
indices = orange.MakeRandomIndicesCV(data, 10)
res1 = orngTest.testWithIndices(learners, data, indices)
res2 = orngTest.testWithIndices(learners, newdata, indices)

# Python 2 print statement — this example targets orange's Python 2 era.
print "original: %5.3f, new: %5.3f" % (orngStat.CA(res1)[0],
                                       orngStat.CA(res2)[0])
Exemple #25
0
def main():
    """Train and evaluate one classifier per verb-recognition engine.

    Command line: map_fn gtruth_tag_fn cluster_fn assignment_fn [...]
    For every engine a table of examples is built, split in two folds,
    the training half augmented with generated subset examples, classified
    with a random forest, and the ROC curve saved to roc_<verb>.png/.ps.
    """
    from sys import argv

    # Ground-truth map/tag inputs and clustering files from the CLI.
    map_fn = argv[1]
    gtruth_tag_fn = argv[2]
    cluster_fn = argv[3]
    assignment_fns = argv[4:]
    tagFile = tag_util.tag_file(gtruth_tag_fn, map_fn)
    tagFile.get_map()
    tagFile.get_tag_names()


    skeleton = carmen_map_skeletonizer.load(cluster_fn, map_fn)

    # One Assignment per file; all share the tag file and map skeleton.
    assignments = [Assignment.load(assignment_fn, tagFile, skeleton) for assignment_fn 
                   in assignment_fns]

    # Verb name -> engine instance; commented entries are disabled verbs.
    engineMap = dict((x.name, x) for x in
                     [bring.Engine(), 
                      follow.Engine(), 
                      meet.Engine(), 
                      avoid.Engine(), 
                      #wander.Engine(), 
                      #go.Engine(),
                      ])


    for engine in engineMap.values():
        verb = engine.name
        # NOTE(review): `and False` disables this filter, so every verb is
        # processed — presumably a leftover debug switch; confirm.
        if verb != "follow" and False:
            continue


        def run():
            return makeTable(engine, assignments)
        # Profiling hook kept for reference.
        #cProfile.runctx("run()", globals(), locals(), "profile.out")
        #return
        table = run()
        print "verb", verb, len(table)

        # Two-fold split: fold 0 -> training, fold 1 -> testing.
        cv_indices = orange.MakeRandomIndicesCV(table, 2)
        humanLabeledTraining = table.select(cv_indices, 0)
        training = orange.ExampleTable(humanLabeledTraining.domain)
        training.extend(humanLabeledTraining)
        # Augment training with machine-generated subset examples.
        generatedTraining = makeSubsetExamples(engine, humanLabeledTraining)
        training.extend(generatedTraining)

        print "Using", len(generatedTraining), "subset examples"

        testing = table.select(cv_indices, 1)

        #testFeatureSubsets(engine, training, testing)

        #classifier  = orngBayes.BayesLearner(training)
        classifier  = RandomForestLearner(training)
        results = orngTest.testOnData([classifier], testing)
        print "results", results
        # Report per-example outcomes, sorted by description for stable
        # output ordering.
        tuples = list(zip(testing, results.results))
        tuples.sort(key=lambda x: x[0]["description"])
        for e, r in tuples:
#            print e["description"], e["hasApproach"], e["hasFollow"],
            if r.actualClass == r.classes[0]:
                print "correct", e["description"], e["entry"].value.id 
            else:
                print "incorrect", e["description"], e["entry"].value.id 

        # Plot and save the ROC curve for this verb.
        mpl.figure(figsize=(6,6))
        mpl.subplots_adjust(bottom=0.13)
        line, = orangeGui.rocCurve(results, engine.name, stepSize=0.001,
                                   plotArgs={"color":"black"})

        orangeUtils.displayResults(results)
        mpl.xlabel("FP", fontsize=32)
        mpl.ylabel("TP", fontsize=32)
        mpl.xticks((0, 1), fontsize=20)
        mpl.yticks((0, 1), fontsize=20)
        line.set_label(engine.name)
        mpl.title(engine.name.capitalize(), fontsize=32)
        mpl.savefig("roc_%s.png" % engine.name)
        mpl.savefig("roc_%s.ps" % engine.name)
    mpl.show()
Exemple #26
0
    def run(self):
        """Execute this (sub)workflow's widgets.

        Three execution modes:
        * for-loop subprocess: runs the body once per element of the outer
          input list;
        * cross-validation subprocess: splits the outer input into
          train/test folds and runs the body once per fold;
        * plain workflow: runs every unfinished widget once.
        State is persisted via self.save() at the end.
        """
        self.cleanup()
        if self.is_for_loop():
            # Locate the loop's marker widgets: fi feeds values into the
            # loop body, fo collects results back to the parent.
            fi = None
            fo = None
            for w in self.widgets:
                if w.type == 'for_input':
                    fi = w
                if w.type == 'for_output':
                    fo = w
            outer_output = self.parent.outputs[fo.inputs.all()
                                               [0].outer_output_id]
            outer_output.value = []
            input_list = self.parent.inputs[fi.outputs.all()
                                            [0].outer_input_id].value
            # Run the loop body once per element of the outer input list.
            for i in input_list:
                self.cleanup()
                proper_output = fi.outputs.all()[0]
                proper_output.value = i
                fi.finished = True
                self.run_all_unfinished_widgets()
        elif self.is_cross_validation():
            import random as rand
            # Marker widgets: fi exposes (train, test, seed) to the fold
            # body, fo collects per-fold results.
            fi = None
            fo = None
            for w in self.widgets:
                if w.type == 'cv_input':
                    fi = w
                if w.type == 'cv_output':
                    fo = w
            outer_output = self.parent.outputs[fo.inputs.all()
                                               [0].outer_output_id]
            outer_output.value = []
            # Outer inputs: [0] the data, [1] fold count, [2] random seed.
            input_list = self.parent.inputs[fi.outputs.all()
                                            [0].outer_input_id].value
            input_fold = self.parent.inputs[fi.outputs.all()
                                            [1].outer_input_id].value
            input_seed = self.parent.inputs[fi.outputs.all()
                                            [2].outer_input_id].value
            if input_fold != None:
                input_fold = int(input_fold)
            else:
                input_fold = 10  # default: 10-fold CV

            if input_seed != None:
                input_seed = int(input_seed)
            else:
                # NOTE(review): uses module-level `random` here but the
                # local alias `rand` elsewhere — presumably a top-level
                # `import random` exists; confirm.
                input_seed = random.randint(0, 10**9)

            # The input may be a plain list, an Orange table wrapped in a
            # DBContext, or a DocumentCorpus; unwrap to the underlying
            # collection first.
            input_type = input_list.__class__.__name__
            context = None
            if input_type == 'DBContext':
                context = input_list
                input_list = context.orng_tables.get(context.target_table,
                                                     None)
            elif input_type == 'DocumentCorpus':
                document_corpus = input_list
                input_list = document_corpus.documents

            if not input_list:
                raise Exception('CrossValidation: Empty input list!')

            folds = []
            if hasattr(input_list, "get_items_ref"):
                # Orange ExampleTable: stratified fold indices via orange.
                import orange
                indices = orange.MakeRandomIndicesCV(
                    input_list,
                    randseed=input_seed,
                    folds=input_fold,
                    stratified=orange.MakeRandomIndices.Stratified)
                for i in range(input_fold):
                    output_train = input_list.select(indices, i, negate=1)
                    output_test = input_list.select(indices, i)
                    output_train.name = input_list.name
                    output_test.name = input_list.name
                    folds.append((output_train, output_test))
            elif input_type == 'DocumentCorpus':
                from sklearn.cross_validation import StratifiedKFold, KFold

                # Stratify on labels when available; else plain KFold.
                if 'Labels' in document_corpus.features:
                    labels = document_corpus.get_document_labels()
                    #print "Seed:"+str(input_seed)
                    stf = StratifiedKFold(labels,
                                          n_folds=input_fold,
                                          random_state=input_seed)
                else:
                    stf = KFold(len(document_corpus.documents),
                                n_folds=input_fold,
                                random_state=input_seed)

                folds = [(list(train_index), list(test_index))
                         for train_index, test_index in stf]
            else:
                # Plain list: shuffle in place, then deal the elements
                # round-robin into `input_fold` chunks.
                rand.seed(input_seed)
                rand.shuffle(input_list)
                folds = [input_list[i::input_fold] for i in range(input_fold)]

            # Expose the (possibly generated) seed on the third inner
            # output so downstream widgets can record it.
            proper_output = fi.outputs.all()[2]
            proper_output.value = input_seed

            for i in range(len(folds)):
                #import pdb; pdb.set_trace()
                if hasattr(input_list, "get_items_ref"):
                    output_test = folds[i][1]
                    output_train = folds[i][0]
                elif input_type == 'DocumentCorpus':
                    train_indices, test_indices = folds[i]
                    print "engine"
                    print("TRAIN:", train_indices, "TEST:", test_indices)

                    output_train, output_test = document_corpus.split(
                        train_indices, test_indices)
                else:
                    # Round-robin chunks: train = every chunk but the i-th.
                    output_train = folds[:i] + folds[i + 1:]
                    output_test = folds[i]
                # Re-wrap Orange tables into copies of the DBContext.
                if input_type == 'DBContext':
                    output_train_obj = context.copy()
                    output_train_obj.orng_tables[
                        context.target_table] = output_train
                    output_test_obj = context.copy()
                    output_test_obj.orng_tables[
                        context.target_table] = output_test
                    output_train = output_train_obj
                    output_test = output_test_obj

                self.cleanup()
                proper_output = fi.outputs.all()[0]  # inner output
                proper_output.value = output_train
                proper_output = fi.outputs.all()[1]  # inner output
                proper_output.value = output_test
                fi.finished = True  # set the input widget as finished
                self.run_all_unfinished_widgets()
        else:
            self.run_all_unfinished_widgets()
        self.save()