def prune_sf(data, minExmplsInLeaf, progress_steps, widget=None):
    """Prune Saturation Filter

    :param data:
    :param minExmplsInLeaf:
    :param progress_steps:
    :param widget:
    :return:
    """

    print "\t", "Pruning + Saturation Filter:"
    #file.flush()
    classifier = orngTree.TreeLearner(data,
                                      sameMajorityPruning=1,
                                      mForPruning=0,
                                      storeExamples=1)
    print "\t\t", "Classifier complexity:\t", orngTree.countNodes(
        classifier), "nodes."
    #file.flush()
    ##    [noisyA, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf)
    [noisePruned, dataset] = exclude_pruned(data, classifier, minExmplsInLeaf)
    print "\t\t", len(noisePruned), "example(s) were excluded by pruning."
    #file.flush()
    classifier2 = orngTree.TreeLearner(dataset,
                                       sameMajorityPruning=1,
                                       mForPruning=0,
                                       storeExamples=1)
    print "\t\t", "Pruned Classifier complexity:", orngTree.countNodes(
        classifier2), "nodes. "
    #file.flush()
    # Saturation filtering
    ##    [noisy_data, filtered_data] = saturation(dataset, "tree")

    n = len(data)
    #widget.progress = int(len(noisePruned)*1.0/len(data)*100)
    if widget is not None:
        widget.progress = int(
            sum([n - i for i in range(len(noisePruned))]) * 1.0 /
            progress_steps * 100)
        widget.save()
        print "progress:", widget.progress

    #[noiseSF, filtered_data] = saturation(dataset, widget)#, "tree")
    noiseSF = saturation(dataset, widget)  #, "tree")
    #print "\t\t", "Size of filtered dataset:", len(filtered_data)
    print "\t\t", "Noisy examples (", len(noiseSF["inds"])+len(noisePruned),"(",len(noisePruned),"pruned,",\
          len(noiseSF["inds"]), "SF ))\n"#: (class, id)"
    #file.flush()
    #noisy_data.sort(meta_id)
    #noiseSF.sort()
    # Merge both obtained sets of noisy examples
    #noisyA.extend(noisy_data)
    noisePruned.extend(noiseSF["inds"])
    #return noisyA
    return {"inds": sorted(noisePruned), "name": "PruneSF"}
def main(phase, make):

    if phase == 4:
        f = FeatureExtractor2.FeatureExtractor(createFile=make)
        ft = FeatureExtractor3.FeatureExtractor(createFile=make)
        idlist = f.IDs
        idlist2 = ft.IDs
        FeatureTable = orange.ExampleTable("table2")
        TestTable = orange.ExampleTable("table3")
        training, test = SplitDataInHalf(FeatureTable, f.size)
        learner = orngTree.TreeLearner(training)
        res = orngTest.testOnData([learner], test)
        if make:
            learner = orngTree.TreeLearner(FeatureTable)
            res = orngTest.testOnData([learner], TestTable)
            res2 = orngTest.testOnData([learner], FeatureTable)
            WriteToFile("dev_tonder_olsen.txt", res2, idlist)
            WriteToFile("test_tonder_olsen.txt", res, idlist2)
            printresult()
    else:
        f = featureExtractor.FeatureExtractor(createFile=True)
        FeatureTable = orange.ExampleTable("table")
        learner, res = CrossValidation(FeatureTable, f.size, 10)

        guessyes = 0
        guessno = 0
        correctyes = 0
        correctno = 0
        for r in res.results:
            if str(r.classes[0]) == "1":
                prtres = "Yes"
            else:
                prtres = "No"

            if str(r.actualClass) == "1":
                prttrue = "Yes"
                correctyes = correctyes + 1
            else:
                prttrue = "No"
                correctno = correctno + 1
            #print str(r.classes[0]) + " vs correct: " + str(r.actualClass)
            if prtres == "No" and prttrue == "No":
                guessno = guessno + 1
            elif prtres == "Yes" and prttrue == "Yes":
                guessyes = guessyes + 1
            print "Guessed " + prtres + " and the correct answer was: " + prttrue
        #res = orngTest.leaveOneOut([learner],FeatureTable)
        #printresult = orngStat.CA(res, orngStat.IS(res))
        #print "Yes Accuracy: " + str(float(guessyes)/float(correctyes))
        #print "No Accuracy: " + str(float(guessno)/float(correctno))
        accuracy = orngStat.CA(res)
        print "Accuracy: " + str(accuracy[0])
def CrossValidation(FeatureTable, n, p):
    """
    FeatureTable = an orange ExampeTable with training data
    n = the size of the test data
    p = the number of sections you will make of the training data
    """
    learner = None
    results = None
    best = 0
    for i in range(p):
        start = i * n / p
        end = start + (n / p)
        testData = FeatureTable.getItems(range(start, end))
        trainingData = FeatureTable.getItems(range(0, start))
        for x in range(end, n):
            trainingData.append(FeatureTable[x])
        l = orngTree.TreeLearner(trainingData)
        res = orngTest.testOnData([l], testData)
        c = 0
        for r in res.results:
            if r.classes[0] == r.actualClass:
                c = c + 1
        if c > best:
            best = c
            learner = l
            results = res

    return learner, results
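
# Hedged usage sketch (added for illustration): _cross_validation_demo is a
# hypothetical driver for CrossValidation above; "table" is the placeholder
# dataset name already used in main().
def _cross_validation_demo():
    import orange, orngStat
    table = orange.ExampleTable("table")
    learner, res = CrossValidation(table, len(table), 10)
    print "Best-fold accuracy:", orngStat.CA(res)[0]
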
    def constructLearner(self):
        rand = random.Random(self.rseed)

        attrs = None
        if self.attributes:
            attrs = self.attributesP

        smallLearner = orngTree.TreeLearner()

        if self.preNodeInst:
            smallLearner.stop.minExamples = self.preNodeInstP
        else:
            smallLearner.stop.minExamples = 0

        smallLearner.storeExamples = 1
        smallLearner.storeNodeClassifier = 1
        smallLearner.storeContingencies = 1
        smallLearner.storeDistributions = 1

        if self.limitDepth:
            smallLearner.maxDepth = self.limitDepthP

        learner = orngEnsemble.RandomForestLearner(base_learner=smallLearner,
                                                   trees=self.trees,
                                                   rand=rand,
                                                   attributes=attrs)

        if self.preprocessor:
            learner = self.preprocessor.wrapLearner(learner)
        learner.name = self.name
        return learner
Example 5
    def __init__(self,
                 learner=None,
                 trees=100,
                 attributes=None,
                 name='Random Forest',
                 rand=None,
                 callback=None):
        """random forest learner"""
        self.trees = trees
        self.name = name
        self.learner = learner
        self.attributes = attributes
        self.callback = callback
        if rand:
            self.rand = rand
        else:
            self.rand = random.Random()
            self.rand.seed(0)

        self.randstate = self.rand.getstate()  #original state

        if not learner:
            # tree learner assembled as suggested by Breiman (2001)
            smallTreeLearner = orngTree.TreeLearner(storeNodeClassifier=0,
                                                    storeContingencies=0,
                                                    storeDistributions=1,
                                                    minExamples=5).instance()
            smallTreeLearner.split.discreteSplitConstructor.measure = \
                smallTreeLearner.split.continuousSplitConstructor.measure = \
                orange.MeasureAttribute_gini()
            smallTreeLearner.split = SplitConstructor_AttributeSubset(
                smallTreeLearner.split, attributes, self.rand)
            self.learner = smallTreeLearner
Example 6
    def setLearner(self):
        if hasattr(self, "btnApply"):
            self.btnApply.setFocus()
        if not self.limitDepth:
            mDepth = {}
        else:
            mDepth = {'maxDepth': self.maxDepth}
        self.learner = orngTree.TreeLearner(
            measure=self.measures[self.estim][1],
            reliefK=self.relK,
            reliefM=self.limitRef and self.relM or -1,
            binarization=self.bin,
            minExamples=self.preNodeInst and self.preNodeInstP,
            minSubset=self.preLeafInst and self.preLeafInstP,
            maxMajority=self.preNodeMaj and self.preNodeMajP / 100.0 or 1.0,
            sameMajorityPruning=self.postMaj,
            mForPruning=self.postMPruning and self.postM,
            storeExamples=1,
            **mDepth)

        self.learner.name = self.name
        if self.preprocessor:
            self.learner = self.preprocessor.wrapLearner(self.learner)

        self.send("Learner", self.learner)

        self.error()
        if self.data:
            try:
                self.classifier = self.learner(self.data)
                self.classifier.name = self.name
            except Exception, (errValue):
                self.error(str(errValue))
                self.classifier = None
Example 7
def CVByPairs(data, dimensions=None, method=None, **dic):
    import orngTree
    cv = orange.MakeRandomIndicesCV(data, 10)
    meter = orange.ExamplesDistanceConstructor_Euclidean(data)

    maxDist = 0
    for i in range(100):
        maxDist = max(maxDist, meter(data.randomexample(),
                                     data.randomexample()))
    weightK = 10.0 / maxDist

    acc = amb = unre = 0
    for fold in range(10):
        train = data.select(cv, fold, negate=1)
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train,
                                 dimensions,
                                 method,
                                 originalAsMeta=True,
                                 **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)

        tacc, tamb, tunre = computeDirectionAccuracyForPairs(
            tree, data, meter, weightK, -1)
        acc += tacc
        amb += tamb
        unre += tunre

    return acc / 10, amb / 10, unre / 10
Example 8
    def __init__(self):
        self.last_tag_seen = '0'
        self.tag_visible = 'f'
        self.tag_x_coord = 180.0
        self.tag_distance = 2000.0
        self.bumping = 'f'
        self.data = orange.ExampleTable("analyzed_data/tag_data")
        self.tree = orngTree.TreeLearner(self.data, maxMajority=0.7)
        self.pub = rospy.Publisher('cmd_vel', Twist)
        self.picked_up = True
        self.generating_tree = False
Example 9
def classify():

    import orange, orngTree

    testData = orange.ExampleTable('data/audioTest.tab')
    trainData = orange.ExampleTable('data/audioTrain.tab')
    bayes = orange.BayesLearner(trainData)
    bayes.name = "bayes"
    tree = orngTree.TreeLearner(trainData)
    tree.name = "tree"
    classifiers = [bayes, tree]

    return classifiers, trainData, testData
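
# Hedged usage sketch (added for illustration): evaluates the classifiers
# returned by classify() on the held-out test table, following the
# orngTest/orngStat pattern used elsewhere in this file.
def _classify_demo():
    import orngTest, orngStat
    classifiers, trainData, testData = classify()
    res = orngTest.testOnData(classifiers, testData)
    for i, c in enumerate(classifiers):
        print "%-8s CA = %5.3f" % (c.name, orngStat.CA(res)[i])
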
Example 10
def CVByNodes(data, dimensions=None, method=None, **dic):
    import orngTree
    cv = orange.MakeRandomIndicesCV(data, 10)
    amb = acc = 0.0
    for fold in range(10):
        train = data.select(cv, fold, negate=1)
        test = data.select(cv, fold)
        pa, qid, did, cid = pade(train,
                                 dimensions,
                                 method,
                                 originalAsMeta=True,
                                 **dic)
        tree = orngTree.TreeLearner(pa, maxDepth=4)

        mb, cc = computeAmbiguityAccuracy(tree, test, -1)
        amb += mb
        acc += cc
    return amb / 10, acc / 10
Example 11
    def btnBuildClicked(self):
        node = self.findCurrentNode()
        if not node or not len(node.examples):
            return

        try:
            newtree = (self.treeLearner or orngTree.TreeLearner(storeExamples = 1))(node.examples)

        except:
            return
        
        if not hasattr(newtree, "tree"):
            QMessageBox.critical(None, "Invalid Learner",
                                 "The learner on the input built a classifier which is not a tree.",
                                 QMessageBox.Ok)
            return

        for k, v in newtree.tree.__dict__.items():
            node.setattr(k, v)
        self.updateTree()
Example 12
    def makeLearner(self):
        # for icmi
        #from orangeUtils import ThresholdProbabilityLearner
        #import orngBayes
        #learner = orngBayes.BayesLearner()
        #learner.adjustThreshold = True
        #return learner

        #return orngEnsemble.RandomForestLearner(data)
        #return orngTree.TreeLearner(data)
        #return treefss.TreeFSS(N=7)(data)
        treeLearner = orngTree.TreeLearner(storeExamples=True)
        treeLearner.stop = orange.TreeStopCriteria_common()
        #treeLearner.stop.minExamples = 1
        treeLearner.maxDepth = 5
        #treeLearner.stop.maxMajority = 0.8
        #return treeLearner
        return preposition.RejectInsaneExampleLearner(treeLearner)
Example 13
    def report_tree(self, name):
        filename = name + '.tsv'
        stream = file(filename, 'wt')

        # header
        stream.write('\t'.join(self.names) + '\n')
        stream.write('\t'.join(self.types) + '\n')
        stream.write('class\n')

        # rows
        for row in self.rows:
            if row[0] == 'skip':
                continue
            row += [''] * (len(self.names) - len(row))
            stream.write('\t'.join(row) + '\n')

        stream.close()

        # See http://www.ailab.si/orange/doc/ofb/c_otherclass.htm
        try:
            import orange
            import orngTree
        except ImportError:
            sys.stderr.write(
                'Install Orange from http://www.ailab.si/orange/ for a classification tree.\n'
            )
            return None

        data = orange.ExampleTable(filename)

        tree = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=2)

        orngTree.printTxt(tree, maxDepth=4)

        text_tree = orngTree.dumpTree(tree)

        file(name + '.txt', 'wt').write(text_tree)

        orngTree.printDot(tree,
                          fileName=name + '.dot',
                          nodeShape='ellipse',
                          leafShape='box')

        return text_tree
Example 14
    def orange_dt_rules(self):

        start = time.time()
        bad_cutoff = self.influence_cutoff(self.bad_tables)
        good_cutoff = self.influence_cutoff(self.good_tables)
        _logger.debug("cutoffs\t%f\t%f", bad_cutoff, good_cutoff)
        self.cost_cutoff = time.time() - start

        _logger.debug("creating training data")
        training = self.create_training(bad_cutoff, good_cutoff)

        #_logger.debug( "training on %d points" , len(training))
        tree = orngTree.TreeLearner(training)
        rules = tree_to_clauses(training, tree.tree)
        #_logger.debug('\n'.join(map(lambda r: '\t%s' % r, rules)))

        # tree = Orange.classification.tree.C45Learner(training, cf=0.001)
        # rules = c45_to_clauses(training, tree.tree)
        return training, rules
Example 15
def main():
    """Main script"""

    paper_table = build_papers_table()
    tree = orngTree.TreeLearner(minSubset=5, sameMajorityPruning=True)

    learners = [tree]

    FOLDS = 10

    results = Orange.evaluation.testing.cross_validation(learners,
                                                         paper_table,
                                                         folds=FOLDS,
                                                         storeClassifiers=1)

    confusions = []

    print "Learner  CA     Brier  AUC"
    for i in range(len(learners)):
        print "%-8s %5.3f  %5.3f  %5.3f" % (learners[i].name, \
        Orange.evaluation.scoring.CA(results)[i],
        Orange.evaluation.scoring.Brier_score(results)[i],
        Orange.evaluation.scoring.AUC(results)[i])

        for k in range(0, FOLDS):
            indices = [
                paper_table[x] for x in range(0, len(paper_table))
                if results.results[x].iteration_number == k
            ]

            confusions.append(
                buildConfusion(indices, results.classifiers[k][i],
                               TYPE_DIRS.keys()))

        confusion = buildTotalConfusions(confusions, TYPE_DIRS.keys())

        printConfusion(confusion, TYPE_DIRS.keys())
        printMeasures(confusion)

        orngTree.printTxt(results.classifiers[k][i],
                          leafStr="%V (%M / %N)",
                          nodeStr="(%M / %N)",
                          leafFields=['major', 'contingency'])
    def train(self, trainset):
        """
        Trains an ensemble of tree with Adaboost.M1.
        """

        self.n_classes = len(trainset.metadata['targets'])

        trainset_orange = make_orange_dataset(trainset)
        self.trainset_domain = trainset_orange.domain

        tree = orngTree.TreeLearner(max_majority=self.max_majority,
                                    max_depth=self.max_depth,
                                    min_instances=self.min_leaf_size,
                                    skip_prob=self.skip_prob)

        adaboost = orngEnsemble.BoostedLearner(learner=tree,
                                               t=self.n_trees,
                                               name="AdaBoost.M1")
        self.boosted_trees = adaboost(instances=trainset_orange)
Example 17
    def summary(self):
        sys.stdout.write("%u tests, %u passed, %u skipped, %u failed\n\n" %
                         (self.tests, self.passed, self.skipped, self.failed))
        sys.stdout.flush()

        name, ext = os.path.splitext(os.path.basename(sys.argv[0]))
        filename = name + '.tsv'
        stream = file(filename, 'wt')

        # header
        stream.write('\t'.join(self.names) + '\n')
        stream.write('\t'.join(self.types) + '\n')
        stream.write('class\n')

        # rows
        for row in self.rows:
            row += [''] * (len(self.names) - len(row))
            stream.write('\t'.join(row) + '\n')

        stream.close()

        # See http://www.ailab.si/orange/doc/ofb/c_otherclass.htm
        try:
            import orange
            import orngTree
        except ImportError:
            sys.stderr.write(
                'Install Orange from http://www.ailab.si/orange/ for a classification tree.\n'
            )
            return

        data = orange.ExampleTable(filename)

        tree = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=2)

        orngTree.printTxt(tree, maxDepth=4)

        file(name + '.txt', 'wt').write(orngTree.dumpTree(tree))

        orngTree.printDot(tree,
                          fileName=name + '.dot',
                          nodeShape='ellipse',
                          leafShape='box')
Example 18
    def test_author_classification_dummy_dataset(self):
        train_set = numpy.array([[0.2, 0.5, 0.2,  0.2, 0.1,  10.,  0],
                                 [0.2, 0.3, 0.12, 0.1, 0.1,  10.,  0],
                                 [0.2, 0.2, 0.08, 0.2, 0.01, 20.,  0],
                                 [0.2, 0.5, 0.1,  0.1, 0.2,  5.,   0],
                                 [0.2, 0.1, 0.2,  0.2, 0.3,  20.,  0],
                                 [0.7, 0.5, 0.2,  0.8, 0.3,  0.1,  1],
                                 [0.6, 0.8, 5.2,  0.2, 0.6,  0.3,  1],
                                 [0.2, 0.6, 8.2,  0.9, 0.9,  0.1,  1],
                                 [0.5, 0.9, 1.2,  0.1, 0.1,  0.2,  1],
                                 [0.9, 0.1, 0.9,  0.6, 0.3,  0.6,  1]])

        attributes = ["retweets", "links", "retweeted", "replies", "mentions",
                      "ff-ratio", "class"]

        table = construct_orange_table(attributes, train_set, classed=True)
        treeLearner = orngTree.TreeLearner()
        treeClassifier = treeLearner(table)
        example = Orange.data.Instance(table.domain,
                                       [0.2, 0.5, 0.2, 0.2, 0.1, 100, 0])
        prediction = treeClassifier(example)
        self.assertEquals(0, prediction.value)
Example 19
    def learnModel(self, X, y):
        if numpy.unique(y).shape[0] != 2:
            raise ValueError("Can only operate on binary data")

        classes = numpy.unique(y)
        self.worstResponse = classes[classes != self.bestResponse][0]

        #We need to convert y into indices
        newY = self.labelsToInds(y)

        XY = numpy.c_[X, newY]
        attrList = []
        for i in range(X.shape[1]):
            attrList.append(orange.FloatVariable("X" + str(i)))

        attrList.append(orange.EnumVariable("y"))
        attrList[-1].addValue(str(self.bestResponse))
        attrList[-1].addValue(str(self.worstResponse))

        self.domain = orange.Domain(attrList)
        eTable = orange.ExampleTable(self.domain, XY)

        #Weight examples
        preprocessor = orange.Preprocessor_addClassWeight(equalize=1)
        preprocessor.classWeights = [1 - self.weight, self.weight]
        eTable, weightID = preprocessor(eTable)
        eTable.domain.addmeta(weightID, orange.FloatVariable("w"))

        tree = orngTree.TreeLearner(mForPruning=self.m,
                                    measure="gainRatio",
                                    minExamples=self.minSplit,
                                    maxDepth=self.maxDepth).instance()

        self.learner = orngEnsemble.RandomForestLearner(
            learner=tree,
            trees=self.numTrees,
            attributes=numpy.round(X.shape[1] * self.featureSize))
        self.classifier = self.learner(eTable, weightID)
Example 20
    def setLearner(self):
        learner = orngTree.TreeLearner(measure="retis",
                                       binarization=self.Bin,
                                       mForPruning=self.PostMPCheck
                                       and self.PostMPVal,
                                       minExamples=self.MinNodeCheck
                                       and self.MinNodeVal,
                                       storeExamples=1)
        if self.preprocessor:
            learner = self.preprocessor.wrapLearner(learner)
        learner.name = self.Name
        self.send("Learner", learner)
        self.error()

        classifier = None

        if self.data:
            try:
                classifier = learner(self.data)
                classifier.name = self.Name
            except orange.KernelException, (errValue):
                self.error(str(errValue))
                classifier = None
Example 21
    def setLearner(self):
        learner = orngTree.TreeLearner(measure="retis",
                                       binarization=self.Bin,
                                       mForPruning=self.PostMPCheck
                                       and self.PostMPVal,
                                       minExamples=self.MinNodeCheck
                                       and self.MinNodeVal,
                                       storeExamples=1)
        learner.name = self.Name
        self.send("Learner", learner)
        self.error()

        if not self.data:
            return

        try:
            classifier = learner(self.data)
            classifier.name = self.Name
            self.send("Regressor", classifier)
            self.send("Regression Tree", classifier)
        except orange.KernelException, (errValue):
            self.error(str(errValue))
            self.send("Regressor", None)
            self.send("Regression Tree", None)
Example 22
    def learnModel(self, X, y):
        if numpy.unique(y).shape[0] != 2:
            raise ValueError("Can only operate on binary data")

        classes = numpy.unique(y)
        self.worstResponse = classes[classes != self.bestResponse][0]

        #We need to convert y into indices
        newY = self.labelsToInds(y)

        XY = numpy.c_[X, newY]
        attrList = []
        for i in range(X.shape[1]):
            attrList.append(orange.FloatVariable("X" + str(i)))

        attrList.append(orange.EnumVariable("y"))
        attrList[-1].addValue(str(self.bestResponse))
        attrList[-1].addValue(str(self.worstResponse))

        self.domain = orange.Domain(attrList)
        eTable = orange.ExampleTable(self.domain, XY)

        #Weight examples and equalise
        #Equalizing computes such weights that the weighted number of examples
        #in each class is equivalent.
        preprocessor = orange.Preprocessor_addClassWeight(equalize=1)
        preprocessor.classWeights = [1 - self.weight, self.weight]
        eTable, weightID = preprocessor(eTable)
        eTable.domain.addmeta(weightID, orange.FloatVariable("w"))

        self.learner = orngTree.TreeLearner(m_pruning=self.m,
                                            measure="gainRatio")
        self.learner.max_depth = self.maxDepth
        self.learner.stop = orange.TreeStopCriteria_common()
        self.learner.stop.min_instances = self.minSplit
        self.classifier = self.learner(eTable, weightID)
Example 23
# Category:    modelling, evaluation
# Uses:        housing
# Classes:     orngTest.crossValidation, orngTree.TreeLearner, orange.kNNLearner, orngRegression.LinearRegressionLearner
# Referenced:  regression.htm

import orange
import orngRegression
import orngTree
import orngStat, orngTest

data = orange.ExampleTable("housing")

# definition of learners (regressors)
lr = orngRegression.LinearRegressionLearner(name="lr")
rt = orngTree.TreeLearner(measure="retis",
                          mForPruning=2,
                          minExamples=20,
                          name="rt")
maj = orange.MajorityLearner(name="maj")
knn = orange.kNNLearner(k=10, name="knn")
learners = [maj, lr, rt, knn]

# evaluation and reporting of scores
results = orngTest.crossValidation(learners, data, folds=10)
scores = [("MSE", orngStat.MSE), ("RMSE", orngStat.RMSE),
          ("MAE", orngStat.MAE), ("RSE", orngStat.RSE),
          ("RRSE", orngStat.RRSE), ("RAE", orngStat.RAE), ("R2", orngStat.R2)]

print "Learner  " + "".join(["%-7s" % s[0] for s in scores])
for i in range(len(learners)):
    print "%-8s " % learners[i].name + "".join(
        ["%6.3f " % s[1](results)[i] for s in scores])
Example 24
# Description: Builds a regression tree and prints it out
# Category:    modelling
# Uses:        housing
# Classes:     orngTree.TreeLearner
# Referenced:  regression.htm

import orange, orngTree

data = orange.ExampleTable("../datasets/housing.tab")
rt = orngTree.TreeLearner(data, measure="retis", mForPruning=2, minExamples=20)
orngTree.printTxt(rt, leafStr="%V %I")
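
# Hedged follow-up sketch (added for illustration): the same regression-tree
# settings can also be scored with 10-fold cross-validation, mirroring the
# orngTest/orngStat pattern used in the housing example earlier in this file.
import orngTest, orngStat
rt_cv = orngTree.TreeLearner(measure="retis", mForPruning=2, minExamples=20,
                             name="rt")
cv_results = orngTest.crossValidation([rt_cv], data, folds=10)
print "rt 10-fold RMSE: %.3f" % orngStat.RMSE(cv_results)[0]
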
    appl = QApplication(sys.argv)
    ow = OWLearningCurveC()
    ow.show()

    l1 = orange.BayesLearner()
    l1.name = 'Naive Bayes'
    ow.learner(l1, 1)

    data = orange.ExampleTable('iris.tab')
    ow.dataset(data)

    l2 = orange.BayesLearner()
    l2.name = 'Naive Bayes (m=10)'
    l2.estimatorConstructor = orange.ProbabilityEstimatorConstructor_m(m=10)
    l2.conditionalEstimatorConstructor = orange.ConditionalProbabilityEstimatorConstructor_ByRows(
        estimatorConstructor=orange.ProbabilityEstimatorConstructor_m(m=10))

    l3 = orange.kNNLearner(name="k-NN")
    ow.learner(l3, 3)

    import orngTree
    l4 = orngTree.TreeLearner(minSubset=2)
    l4.name = "Decision Tree"
    ow.learner(l4, 4)

    #    ow.learner(None, 1)
    #    ow.learner(None, 2)
    #    ow.learner(None, 4)

    appl.exec_()
Example 26
#!/usr/bin/env python
#
# See also:
#  http://www.ailab.si/orange/doc/ofb/c_otherclass.htm

import os.path
import sys

import orange
import orngTree

for arg in sys.argv[1:]:
    name, ext = os.path.splitext(arg)

    data = orange.ExampleTable(arg)

    tree = orngTree.TreeLearner(data, sameMajorityPruning=1, mForPruning=2)

    orngTree.printTxt(tree)

    file(name + '.txt', 'wt').write(orngTree.dumpTree(tree) + '\n')

    orngTree.printDot(tree,
                      fileName=name + '.dot',
                      nodeShape='ellipse',
                      leafShape='box')
Example 27
File: sqk.py  Project: sloria/usv
def main():
    version = "%prog version 0.1"
    usage = "usage: %prog [options] [input] [options [classification]]"
    desc = "QUICK START: To extract data from a trial, 'cd' to the \
trial's directory and type: 'sqk --classify'. To extract data \
from one channel of the trial (ch 1 in this case), type: \
'sqk --classify --channel=1'."

    # Parse command line options.
    parser = optparse.OptionParser(usage, version=version, description=desc)
    parser.add_option("-C", "--classify",
                    dest="classify",
                    action="store_true",
                    default=False,
                    help="Classify the trial. IMPORTANT: Trial folder must " \
                         "be the current directory.")
    parser.add_option("-m", "--channel", metavar="<CH>",
                    dest="channel",
                    action="store",
                    type="int",
                    default=0,
                    help="Specify which channel to extract data from. " \
                         "Default (%default) extracts data from both " \
                         "channels. Must choose 0 (both channels), 1, or 2.")
    parser.add_option("-l", "--log",
                    dest="log", action="store_true", default=False,
                    help="Parses a log file if it exists and adds time and" \
                         " duration information to the data file.")
    parser.add_option("-T",
                      "--traindata",
                      metavar="<DATA_FILE>",
                      dest="trainData",
                      action="store",
                      default=os.path.join(TRAIN_PATH, 'traindata'),
                      help="Specify training data set. Default is %default")
    parser.add_option("-L", "--learner", metavar="<TYPE>",
                    dest="learner",
                    action="store",
                    default="svm",
                    help="Specify the classifier algorithm. Options include:" \
                         " 'bayes' (Naive Bayes), 'knn' (k-Nearest Neighbor)," \
                         " 'svm' (SVM), 'forest' (random forest). " \
                         "Default is %default.")
    parser.add_option("-f",
                      "--file",
                      metavar="<AUDIO_FILE>",
                      dest="audio",
                      action="store",
                      help="Extract features and classify audio file (wav)")
    parser.add_option("-p", "--path", metavar="<PATH>",
                    dest="path",
                    action="store",
                    help="Extract features and classify all files in a " \
                         "directory. To extract from current directory: " \
                         "'usv.py -p .' ")
    parser.add_option("-r", "--rate", metavar="<SAMPLE_RATE>",
                    dest="sampleRate",
                    action="store",
                    default="11025",
                    help="Specify the sample rate of input files. Default is " \
                         "%default (Hz).")
    parser.add_option("-t",
                      "--train",
                      metavar="<CLASS>",
                      dest="exampleClass",
                      action="store",
                      type='string',
                      help="Label the training example(s).")
    parser.add_option("-d", "--data", metavar="<DATA_FILE>",
                    dest="data",
                    action="store",
                    default="data.tab",
                    help="Write to data file (.tab format). Default is " \
                         "'%default' or 'traindata.tab' for training data.")
    parser.add_option("-S", "--seg-resamp",
                    dest="segment",
                    action="store_true",
                    default=False,
                    help="Resample to 11025 Hz and split into multiple files " \
                         "based on silence. IMPORTANT: Trial folder must " \
                         "be the current directory.")
    (opts, args) = parser.parse_args()
    if opts.channel and not (opts.classify or opts.segment):
        parser.error("'--channel' option requires '--classify' option'")
    if opts.log and not opts.classify:
        parser.error("'--log' option requires '--classify' option'")

    # Open train data file or create it if it doesn't exist.
    if opts.exampleClass and opts.data == "data.tab":
        opts.data = os.path.join(TRAIN_PATH, 'traindata.tab')

    if opts.audio or opts.path:
        if not opts.segment:
            print 'Opening %r. . .' % (opts.data)
            data = open(opts.data, "a+")
    elif opts.segment:
        print "Resampling and segmenting trial. . ."
    elif opts.classify:
        print "Classifying trial. . ."
    else:
        parser.error('No input file or path specified.')

    # If user specifies an audio file (-f AUDIO_FILE)
    if opts.audio:
        file_name, ext = os.path.splitext(opts.audio)
        # Add MFCC 1-12 to data.
        if not opts.segment:
            write_features(opts.audio, opts.sampleRate, data)
        # If classification is specified, write to data.
        if opts.exampleClass:
            data.write(opts.exampleClass.lower() + "\n")
            print "Classified %r as %r." % (opts.audio,
                                            opts.exampleClass.lower())
        # Else if user chooses to segment file (-S)
        elif opts.segment:
            print "Resampling and segmenting %s. . ." % (opts.audio)
            if opts.channel == 0:
                run_commands(
                    seg_resamp(opts.audio,
                               int(opts.sampleRate),
                               outfile=file_name + '_call.wav',
                               directory=file_name + "_ch1_2",
                               ch1=True,
                               ch2=True))
            elif opts.channel == 1:
                run_commands(
                    seg_resamp(opts.audio,
                               int(opts.sampleRate),
                               outfile=file_name + '_ch1_.wav',
                               directory=file_name + "_ch1",
                               ch1=True,
                               ch2=False))
            elif opts.channel == 2:
                run_commands(
                    seg_resamp(opts.audio,
                               int(opts.sampleRate),
                               outfile=file_name + '_ch2_.wav',
                               directory=file_name + "_ch2",
                               ch1=False,
                               ch2=True))
            print "Wrote to './%s'." % (file_name + "_calls")
        else:
            print "Invalid data for %r. Skipping. . ." % opts.audio
            data.write('\n')
    # Else if user specifies path (-p PATH)
    elif opts.path:
        # Read all wav files in specified path
        try:
            for root, dirs, files in os.walk(opts.path):
                for basename in files:
                    if fnmatch.fnmatch(basename, "*.[wW][aA][vV]"):
                        audiofile = os.path.join(root, basename)
                        # Skip small files
                        if os.path.getsize(audiofile) < 100:
                            continue
                        file_name, ext = os.path.splitext(audiofile)
                        # Add MFCC 1-12 to data.
                        if not opts.segment:
                            write_features(audiofile, opts.sampleRate, data)
                        # Write filename
                        data.write(str(os.path.basename(audiofile)) + "\t")
                        # If classification is specified, write to file.
                        if opts.exampleClass:
                            data.write(opts.exampleClass.lower() + "\n")
                            print "Classified %r as %r." % (
                                audiofile, opts.exampleClass.lower())
                        # If user specifies resample and segment
                        elif opts.segment:
                            print "Resampling and segmenting %r. . ." % (
                                audiofile)
                            if opts.channel == 0:
                                run_commands(
                                    seg_resamp(
                                        audiofile,
                                        int(opts.sampleRate),
                                        outfile=os.path.basename(file_name) +
                                        '_call.wav',
                                        directory=os.path.basename(file_name) +
                                        "_ch1_2",
                                        ch1=True,
                                        ch2=True))
                            elif opts.channel == 1:
                                run_commands(
                                    seg_resamp(
                                        audiofile,
                                        int(opts.sampleRate),
                                        outfile=os.path.basename(file_name) +
                                        '_ch1_.wav',
                                        directory=os.path.basename(file_name) +
                                        "_ch1",
                                        ch1=True,
                                        ch2=False))
                            elif opts.channel == 2:
                                run_commands(
                                    seg_resamp(
                                        audiofile,
                                        int(opts.sampleRate),
                                        outfile=os.path.basename(file_name) +
                                        '_ch2_.wav',
                                        directory=os.path.basename(file_name) +
                                        "_ch2",
                                        ch1=False,
                                        ch2=True))
                        else:
                            data.write('\n')
        except (FloatingPointError, IOError):
            print "An error occurred. Skipping %. . .r" % audiofile
    # Else if user chooses to segment and resample the trial (current dir)
    elif opts.segment:
        for audiofile in glob(os.path.join('./', "*.[wW][aA][vV]")):
            file_name, ext = os.path.splitext(audiofile)
            print "Resampling and segmenting %r. . ." % (file_name)
            if opts.channel == 0:
                run_commands(
                    seg_resamp(audiofile,
                               int(opts.sampleRate),
                               outfile=file_name + '_call.wav',
                               directory=file_name + "_ch1_2",
                               ch1=True,
                               ch2=True))
            elif opts.channel == 1:
                run_commands(
                    seg_resamp(audiofile,
                               int(opts.sampleRate),
                               outfile=file_name + '_ch1_.wav',
                               directory=file_name + "_ch1",
                               ch1=True,
                               ch2=False))
            elif opts.channel == 2:
                run_commands(
                    seg_resamp(audiofile,
                               int(opts.sampleRate),
                               outfile=file_name + '_ch2_.wav',
                               directory=file_name + "_ch2",
                               ch1=False,
                               ch2=True))
    # Else if user chooses to classify the trial
    elif opts.classify:
        # TODO: Should not be able to classify if no data files in folder
        try:
            traindata = orange.ExampleTable(opts.trainData)
        except SystemError:
            print "Training data not found."
            sys.exit(1)
        # The logger
        if opts.log:
            logs = glob(os.path.join(os.getcwd(), "*.[lL][oO][gG]"))
            if len(logs) > 1:
                print "ERROR: Multiple log files."
                sys.exit(1)
            log = usv.avisoftlog.RecLog(open(logs[0], 'r'))

        # The classifier
        print "Constructing %s classifier \
(may take several minutes). . ." % (opts.learner)
        if opts.learner.lower() == "bayes":
            classifier = orange.BayesLearner(traindata)
            classifier.name = "naive_bayes"
        elif opts.learner.lower() == "knn":
            classifier = Orange.classification.knn.kNNLearner(traindata)
            classifier.name = "kNN"
        elif opts.learner.lower() == "svm":
            svm = SVMLearner(name="SVM",
                             kernel_type=kernels.RBF,
                             C=128,
                             gamma=2,
                             nu=0.1)
            classifier = svm(traindata)
            classifier.name = "SVM"
        elif opts.learner.lower() == "tree":
            classifier = orngTree.TreeLearner(traindata)
            classifier.name = "tree"
        elif opts.learner.lower() == "forest":
            classifier = Orange.ensemble.forest.RandomForestLearner(traindata)
            classifier.name = "random_forest"

        # Create data summary file
        if opts.channel == 0:
            datasummary_name = os.path.splitext(opts.data)[0] + "_ch1_2.tab"
        elif opts.channel == 1:
            datasummary_name = os.path.splitext(opts.data)[0] + "_ch1.tab"
        elif opts.channel == 2:
            datasummary_name = os.path.splitext(opts.data)[0] + "_ch2.tab"
        if os.path.exists(datasummary_name):
            print "Data file %r already exists." % (datasummary_name)
            print "Exiting . . ."
            sys.exit(1)
        else:
            summary = open(datasummary_name, "a+")
        # Write metadata
        summary.write("# data = %s\n" % (datasummary_name))
        summary.write("# channel = %d\n" % (opts.channel))
        summary.write("# sample_rate = %s\n" % (opts.sampleRate))
        summary.write("# classifier = %s\n" % (classifier.name))
        # Write header
        summary.write("FILE\t")
        for i in range(len(traindata.domain.classVar.values)):
            summary.write(traindata.domain.classVar.values[i].upper() + "\t")
        if opts.log:
            summary.write("start: " + str(log.start.time) + "\t")
            summary.write("Duration" + "\t")
        summary.write("\n")

        totals = [0] * len(traindata.domain.classVar.values)
        proportions = [0.0] * len(totals)
        for root, dirs, files in os.walk(os.getcwd()):
            # For each file's directory in this trial
            for dir in dirs:
                data = open(os.path.join(dir, dir + '.tab'), 'w+')
                if opts.channel == 0:
                    calls = glob(os.path.join(dir, "*ch1_2*.[wW][aA][vV]"))
                elif opts.channel == 1:
                    calls = glob(os.path.join(dir, "*ch1*.[wW][aA][vV]"))
                elif opts.channel == 2:
                    calls = glob(os.path.join(dir, "*ch2*.[wW][aA][vV]"))
                # For each call
                for c in calls:
                    # Skip small files
                    if os.path.getsize(c) < 100:
                        print "Skipping %s (not enough data)" % c
                        continue
                    # Write feature data
                    write_features(c, opts.sampleRate, data)
                    data.close()  # Ensures that data is saved
                    # Write filenames and classifications
                    data = open(os.path.join(dir, dir + '.tab'), 'a+')
                    datatable = orange.ExampleTable(
                        os.path.join(dir, dir + '.tab'))
                    classification = classifier(datatable[calls.index(c)])
                    data.write(str(os.path.basename(c)) + '\t')
                    data.write(str(classification))
                    data.write('\n')
            try:
                data.close()
            except UnboundLocalError:
                parser.error(
                    'No directories in this folder. Did you remember to segment the files?'
                )

            # Write class count data to summary table
            for dir in dirs:
                if opts.channel == 0:
                    data_files = glob(os.path.join(dir, "*ch1_2.tab"))
                elif opts.channel == 1:
                    data_files = glob(os.path.join(dir, "*ch1.tab"))
                elif opts.channel == 2:
                    data_files = glob(os.path.join(dir, "*ch2.tab"))
                for c in data_files:
                    if os.path.getsize(c) == 0:
                        continue
                    file_name, ext = os.path.splitext(os.path.basename(c))
                    summary.write(file_name + '\t')
                    callsdata = orange.ExampleTable(os.path.join("./", c))
                    # Vector of class counts
                    counts = [0] * len(callsdata.domain.classVar.values)
                    for e in callsdata:
                        counts[int(e.getclass())] += 1
                    # Write counts
                    for i in range(len(callsdata.domain.classVar.values)):
                        summary.write(str(counts[i]) + "\t")
                        totals[i] += counts[i]
                    # Write log data
                    if opts.log:
                        tmp = str(os.path.basename(dir)).lower()
                        entry = tmp[0:tmp.find("_")] + ".wav"
                        summary.write(str(log.getevent(entry).time) + "\t")
                        summary.write(log.getevent(entry).duration + "\t")
                        log.close()
                    summary.write('\n')
        # Write totals. Exclude BGNOISE.
        summary.write("TOTAL" + "\t\t")
        for i in range(1, len(totals)):
            summary.write(str(totals[i]) + "\t")
        if opts.log:
            summary.write("end: " + str(log.end.time) + "\t")
        summary.write("\n")
        # Write proportions. Exclude BGNOISE.
        summary.write("P" + "\t\t")
        for i in range(1, len(proportions)):
            try:
                proportions[i] = float(
                    totals[i]) / float(sum(totals) - totals[0])
            except ZeroDivisionError:
                proportions[i] = 0.0
            summary.write("%.4f\t" % (proportions[i]))
        summary.write("\n")
        summary.close()
        # Open data file when finished
        subprocess.call('open %s' % (datasummary_name), shell=True)

    else:
        data.write("\n")

    if not opts.segment:
        data.close()
    print "Success!"
Example 28
    def makeLearner(self):
        treeLearner = orngTree.TreeLearner(storeExamples=True)
        return treeLearner
Example 29
# Description: Demonstration of use of cross-validation as provided in orngEval module
# Category:    evaluation
# Uses:        voting.tab
# Classes:     orngTest.crossValidation
# Referenced:  c_performance.htm

import orange, orngTest, orngStat, orngTree

# set up the learners
bayes = orange.BayesLearner()
tree = orngTree.TreeLearner(mForPruning=2)
bayes.name = "bayes"
tree.name = "tree"
learners = [bayes, tree]

# compute accuracies on data
data = orange.ExampleTable("voting")
results = orngTest.crossValidation(learners, data, folds=10)

# output the results
print "Learner  CA     IS     Brier    AUC"
for i in range(len(learners)):
    print "%-8s %5.3f  %5.3f  %5.3f  %5.3f" % (learners[i].name, \
        orngStat.CA(results)[i], orngStat.IS(results)[i],
        orngStat.BrierScore(results)[i], orngStat.AUC(results)[i])
Example 30
if __name__ == "__main__":
    a = QApplication(sys.argv)
    ow = OWPredictions()
    ow.show()

    import orngTree

    dataset = orange.ExampleTable('../../doc/datasets/iris.tab')
    #    dataset = orange.ExampleTable('../../doc/datasets/auto-mpg.tab')
    ind = orange.MakeRandomIndices2(p0=0.5)(dataset)
    data = dataset.select(ind, 0)
    test = dataset.select(ind, 1)
    testnoclass = orange.ExampleTable(
        orange.Domain(test.domain.attributes, False), test)
    tree = orngTree.TreeLearner(data)
    tree.name = "tree"
    maj = orange.MajorityLearner(data)
    maj.name = "maj"
    knn = orange.kNNLearner(data, k=10)
    knn.name = "knn"

    if 0:  # data set only
        ow.setData(test)
    if 0:  # two predictors, test data with class
        ow.setPredictor(maj, 1)
        ow.setPredictor(tree, 2)
        ow.setData(test)
    if 0:  # two predictors, test data with no class
        ow.setPredictor(maj, 1)
        ow.setPredictor(tree, 2)