Example #1
import random
from math import ceil
from os.path import abspath

# parse_c45, ExampleSet, and DATA_PATH come from elsewhere in this project
def load_project_data(project_name, n_folds=5):
    """returns two sets of data from the specified project. The first one
    contains 4/5 of the data for training. The second contains the remaining 1/5 of 
    the data for testing."""
    root_dir = abspath(DATA_PATH)

    data = parse_c45(project_name, root_dir)

    pos_data = []
    neg_data = []
    for ex in data:
        if ex[-1]:
            pos_data.append(ex)
        else:
            neg_data.append(ex)

    n_pos = len(pos_data)
    n_neg = len(neg_data)

    random.shuffle(pos_data)
    random.shuffle(neg_data)

    # ceil so that every example lands in some fold (stratified n-fold CV)
    n_pos_fold = int(ceil(n_pos / float(n_folds)))
    n_neg_fold = int(ceil(n_neg / float(n_folds)))

    folds = []
    for i in range(n_folds):
        pos_fold = pos_data[n_pos_fold * i:n_pos_fold * (i + 1)]
        neg_fold = neg_data[n_neg_fold * i:n_neg_fold * (i + 1)]

        # Mix the two classes within each fold. It seems like a bad idea to
        # train on all positive and then all negative examples; shuffling
        # should not matter for deterministic backprop, but it will for
        # stochastic backprop, and it can't hurt.
        pos_fold.extend(neg_fold)
        random.shuffle(pos_fold)
        folds.append(pos_fold)

    # create the different training and test set pairs
    fold_sets = []
    for i in range(n_folds):
        test = folds.pop(i)  # hold out fold i
        train = []
        for fold in folds:
            train.extend(fold)
        fold_sets.append((ExampleSet(train), ExampleSet(test)))
        folds.insert(i, test)  # put the held-out fold back
    return fold_sets
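A minimal cross-validation driver for the loader above might look like the
sketch below. It assumes only what the other examples on this page already
show: Node from the tree examples, its train/predict methods, the discrete
attribute indices [1, 3] used in the tests, and ex[-1] as the class label.

accuracies = []
for train_set, test_set in load_project_data('example', n_folds=5):
    n = Node()
    n.train(train_set, [1, 3])  # discrete attribute indices, as in the tests
    correct = sum(1 for ex in test_set if n.predict(ex) == ex[-1])
    accuracies.append(correct / float(len(test_set)))

print(sum(accuracies) / len(accuracies))  # mean held-out accuracy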
Example #2
    def test_part_discrete_data(self):
        train_data, test_data = load_project_data('example')

        examples = list(train_data) + list(test_data)
        data = ExampleSet(examples)

        n = Node()

        H_x, H_y_x, part_data = n.partition_data(data, 1)

        self.assertAlmostEqual(0.61219, H_y_x, 3)
        self.assertAlmostEqual(1.5849, H_x, 3)

        H_x, H_y_x, part_data = n.partition_data(data, 3)
        self.assertAlmostEqual(0.61219, H_y_x, 3)
        self.assertAlmostEqual(1.5849, H_x, 3)

        attr_index, part_data = n.max_GR(examples, [1, 3])
        self.assertEqual(attr_index, 1)

        part_data_test = {}
        for ex in examples:
            part_data_test.setdefault(ex[1], []).append(ex)
        self.assertEqual(part_data_test, part_data)

        attr_index, part_data = n.max_GR(examples, [3, 1])
        self.assertEqual(attr_index, 3)

        part_data_test = {}
        for ex in examples:
            part_data_test.setdefault(ex[3], []).append(ex)
        self.assertEqual(part_data_test, part_data)
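The asserted constants are easy to sanity-check: 1.5849 is log2(3), which
matches an attribute taking three equally common values (compare the
'red'/'green'/'blue' values in Example #7), and the test expects the same
(H_x, H_y_x) pair from both discrete attributes.

from math import log
print(log(3, 2))  # 1.5849625..., the H_x asserted above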
Example #3
    def train(self, ex_set, attr_set, depth=0):
        """Trains a tree on the given data. depth is used to track tree depth
        so that stopping conditions can be enforced."""
        ex_set = ExampleSet(ex_set)

        mcc, partable = self.check_ex_set(ex_set, attr_set)
        attr, part_data = self.max_GR(ex_set, attr_set)

        if part_data and not (depth == MAX_DEPTH and MAX_DEPTH > 0):

            self.attr_index = attr

            if ex_set.schema[attr].type == "CONTINUOUS":
                new_attr_set = attr_set[:]
                new_attr_set.remove(attr)
            else:
                new_attr_set = attr_set

            # grow one child per attribute value / bin
            for feature, sub_data in part_data.iteritems():
                self.children[feature] = Node()
                self.children[feature].train(sub_data, new_attr_set, depth + 1)

        else:
            self.is_leaf = True
            self.classifier = mcc  # most common class label
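Example #5 below exercises a predict method that these snippets never show.
A minimal sketch, not the project's actual implementation, assuming only the
fields train sets here (is_leaf, classifier, attr_index, children) and a
default binner on Node, as the discrete branch of Example #6 implies:

    def predict(self, ex):
        """Sketch only: walk the tree, binning the example's value at each
        split, until a leaf's class label is reached."""
        if self.is_leaf:
            return self.classifier
        feature = self.binner(ex[self.attr_index])
        return self.children[feature].predict(ex)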
Example #4
import random
from math import floor
from os.path import abspath

def load_project_data(project_name):
    """returns two sets of data from the specified project. The first one
    contains 4/5 of the data for training. The second contains the remaining 1/5 of 
    the data for testing.""" 
    root_dir = abspath(DATA_PATH)
    
    data = parse_c45(project_name, root_dir)

    n_data = len(data)
    n_train = int(floor(4 / 5.0 * n_data))

    train_choices = set(random.sample(xrange(n_data), n_train))

    train_data, test_data = [], []
    for i, ex in enumerate(data):
        if i in train_choices:
            train_data.append(ex)
        else:
            test_data.append(ex)

    return ExampleSet(train_data), ExampleSet(test_data)
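As the tests on this page use it, a call like the following should produce
roughly a 4:1 split (exact sizes depend on the floor of 4/5 of the data):

train_data, test_data = load_project_data('example')
print("%d train / %d test" % (len(train_data), len(test_data)))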
Example #5
    def test_discrete_predict(self):
        train_data, test_data = load_project_data('example')
        examples = list(train_data) + list(test_data)
        data = ExampleSet(examples)

        n = Node()

        n.train(data, [1, 3])  # train only on the discrete attributes

        self.assertTrue(all(ex[-1] == n.predict(ex) for ex in data))

        n.train(data, [3, 1])  # train only on the discrete attributes

        self.assertTrue(all(ex[-1] == n.predict(ex) for ex in data))
Example #6
    def partition_data(self, ex_set, attr_index):
        """Returns a 3-tuple of (H_x, H_y_x, partitioned_data)."""
        part_data = {}
        ex_set = ExampleSet(ex_set)

        if ex_set.schema[attr_index].type == 'CONTINUOUS':
            self.binner = ContBinner()

            ex_set = sorted(ex_set, key=lambda x: x[attr_index])

            max_entropy_set = (None, None, None)
            # candidate thresholds sit between consecutive (sorted) examples
            # whose class labels differ
            for ex1, ex2 in zip(ex_set[:-1], ex_set[1:]):
                if ex1[-1] == ex2[-1]:  # not a threshold
                    continue

                part_data = {}
                self.binner.threshold = ex1[attr_index]
                for ex in ex_set:
                    bin = self.binner(ex[attr_index])
                    part_data.setdefault(bin, []).append(ex)

                H_x, H_y_x = self.calc_entropies(part_data)
                # in Python 2, any number compares greater than the initial None
                if H_y_x > max_entropy_set[1]:
                    max_entropy_set = H_x, H_y_x, part_data

            H_x, H_y_x, part_data = max_entropy_set

        else:
            # bin each value; the binner really only matters for continuous
            # attributes, where it maps values to either side of a threshold
            for ex in ex_set:
                bin = self.binner(ex[attr_index])
                part_data.setdefault(bin, []).append(ex)

            H_x, H_y_x = self.calc_entropies(part_data)
        return H_x, H_y_x, part_data
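calc_entropies itself is not among these examples. Given how its results are
used (H_x the entropy of the bin sizes, H_y_x the class entropy conditioned
on the bin, with ex[-1] a boolean label), a plausible sketch, written as a
free function for brevity rather than the project's actual method:

from math import log

def calc_entropies(part_data):
    """Sketch only: part_data maps each bin to a list of examples whose
    last element is a boolean class label."""
    n_total = float(sum(len(exs) for exs in part_data.values()))

    H_x = 0.0    # entropy of the bin (attribute value) distribution
    H_y_x = 0.0  # class entropy conditioned on the bin
    for exs in part_data.values():
        p_bin = len(exs) / n_total
        H_x -= p_bin * log(p_bin, 2)

        p_pos = sum(1 for ex in exs if ex[-1]) / float(len(exs))
        for p in (p_pos, 1.0 - p_pos):
            if p > 0.0:
                H_y_x -= p_bin * p * log(p, 2)
    return H_x, H_y_x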
Example #7
    def test_is_partable_data(self):
        train_data, test_data = load_project_data('example')

        examples = list(train_data) + list(test_data)
        data = ExampleSet(examples)

        n = Node()

        self.assertTrue(n.check_ex_set(examples, [1, 3]))

        index, part_data = n.max_GR(examples, [1, 3])
        self.assertEqual(index, 1)
        test_part_data = [ex for ex in examples if ex[1] == 'red']
        self.assertEqual(set(test_part_data), set(part_data['red']))

        self.assertTrue(n.check_ex_set(part_data['red'], [3])[1])
        index, sub_data = n.max_GR(part_data['red'], [3])
        self.assertEqual(index, 3)

        test_part_data = [ex for ex in examples if ex[1] == 'green']
        self.assertEqual(set(test_part_data), set(part_data['green']))
        self.assertFalse(n.check_ex_set(part_data['green'], [3])[1])

        test_part_data = [ex for ex in examples if ex[1] == 'blue']
        self.assertEqual(set(test_part_data), set(part_data['blue']))
        self.assertTrue(n.check_ex_set(part_data['blue'], [3])[1])
        index, sub_data = n.max_GR(part_data['blue'], [3])
        self.assertEqual(index, 3)

        index, sub_data = n.max_GR(examples, [3, 1])
        self.assertEqual(index, 3)
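max_GR is likewise only ever called, never shown. Its name and its
(attr_index, part_data) return value suggest a gain-ratio argmax,
GR = (H_y - H_y_x) / H_x, and the order-dependent expectations above
(attribute 1 wins from [1, 3], attribute 3 from [3, 1]) imply the two
attributes tie, with ties going to the first one listed, which a strict >
comparison preserves. A sketch under those assumptions, not the project's
actual implementation:

    def max_GR(self, ex_set, attr_set):
        """Sketch only: pick the attribute with the highest gain ratio."""
        from math import log

        n = float(len(ex_set))
        p_pos = sum(1 for ex in ex_set if ex[-1]) / n
        H_y = 0.0  # unconditioned class entropy
        for p in (p_pos, 1.0 - p_pos):
            if p > 0.0:
                H_y -= p * log(p, 2)

        best = (None, None, None)  # (gain ratio, attr index, partition)
        for attr in attr_set:
            H_x, H_y_x, part_data = self.partition_data(ex_set, attr)
            if not part_data or not H_x:
                continue  # attribute cannot split this set
            GR = (H_y - H_y_x) / H_x
            if best[0] is None or GR > best[0]:
                best = (GR, attr, part_data)
        return best[1], best[2]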