Example no. 1
def main():
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    fix = fixUnknown(trainset, labels)
    trainsetU = replaceUnknown(trainset, labels, fix)
    testsetU = replaceUnknown(testset, labels, fix)

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]
    #print(trainset[1])
    print(
        "Running decision tree algorithm on the bank dataset with unknown values"
    )
    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainset, labels, label_attr, labels[-1], i,
                              item, None)

            trainPred = [predict(currentTree, x, labels) for x in trainset]
            testPred = [predict(currentTree, x, labels) for x in testset]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)

    print(
        "Running decision tree algorithm on the bank dataset with unknown's replaced"
    )
    print("\n \n \n \n \n")

    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainsetU, labels, label_attr, labels[-1], i,
                              item, None)

            trainPred = [predict(currentTree, x, labels) for x in trainsetU]
            testPred = [predict(currentTree, x, labels) for x in testsetU]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)
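The accuracy helper used by this and the later snippets is not shown. A minimal sketch consistent with how it is called (two parallel lists of predicted and true labels) could be:

def accuracy(predictions, truth):
    # Fraction of positions where the predicted label matches the true label.
    correct = sum(1 for p, t in zip(predictions, truth) if p == t)
    return correct / len(truth)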
Example no. 2
def main():
    trainset, trainRaw = makeData(training, labels)
    testset, testRaw = makeData(testing, labels)
    trainLabels = [item[-1] for item in trainRaw]
    testLabels = [item[-1] for item in testRaw]

    #print(gf.gainE(trainset,labels[5], labels[-1]))

    #    mytree = id3(trainset, labels, label_attr, labels[-1], 6, "entropy", None)
    #    printTree(mytree)
    print("Running the decision tree algorithm on the 'Cars' dataset.")

    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 7):
            currentTree = id3.id3(trainset, labels, label_attr, labels[-1], i,
                                  item, None)

            trainPred = [id3.predict(currentTree, x, labels) for x in trainset]
            testPred = [id3.predict(currentTree, x, labels) for x in testset]

            trainAcc = id3.accuracy(trainPred, trainLabels)
            testAcc = id3.accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)
Example no. 3
def boost(data, labels, attr_list, target, iterations, answers):
    treelist = []
    for i in range(iterations):
        normalize = 0
        for item in data:
            normalize += item['weight']
        currentTree = id3(data, labels, attr_list, target, 2, 'entropy', None)
        #        predictor = []
        #        for item in data:
        #            predictor.append([predict(currentTree, item, labels), item['weight']])

        trainError = 0
        #        print('weight =' , data[0]['weight'])
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                trainError += item['weight'] / normalize
#        print(trainError)
#        print(normalize)
#        alpha = 1/2*m.log((1-trainError)/trainError)
        alpha = 1 / 4 * m.log((1 - trainError) / trainError)
        #        print(m.exp(alpha), m.exp(-alpha))
        #        print(alpha, " = alpha")
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                item['weight'] = item['weight'] * m.exp(alpha)
#                print(item['weight'])
            else:
                item['weight'] = item['weight'] * m.exp(-alpha)


#                print(item['weight'])
        treelist.append({'tree': currentTree, 'alpha': alpha})
        #print(treelist)
    return treelist
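The boost routine returns the weighted trees but does not show how they are combined at prediction time. A minimal weighted-vote sketch, assuming the same predict(tree, item, labels) used inside the loop and a hypothetical label_values list of the possible class labels, could be:

def boost_predict(treelist, item, labels, label_values):
    # Weighted vote: each tree adds its alpha to the score of the label it predicts.
    scores = {value: 0.0 for value in label_values}
    for entry in treelist:
        guess = predict(entry['tree'], item, labels)
        scores[guess] = scores.get(guess, 0.0) + entry['alpha']
    return max(scores, key=scores.get)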
Example no. 4
File: rf.py Project: palmagro/mrrf
 def __init__(self,gr,nodes,tipo,target,vtarget,narboles,nnodes,ntrels,maxdepth,exrel,umbral):
     self.graph_db = gr
     self.tipo = tipo
     self.target = target
     self.vtarget = vtarget
     self.nnodes = nnodes
     self.ntrels = ntrels
     self.maxdepth = maxdepth
     self.arboles = []  # assumed: not in the original snippet, but the loop below appends to it

     TC = neo4j.CypherQuery(self.graph_db, "MATCH (a)-[r]->(b) WHERE labels(a) <> [] AND labels(b) <> [] RETURN DISTINCT head(labels(a)) AS This, type(r) as To, head(labels(b)) AS That limit "+str(self.ntrels)).execute()
     print "Tipos de aristas cargadas: "+ str(len(TC.data)) + " elementos."
     while(len(self.arboles)<narboles):  
         tempn = nodes#random.sample(nodes , ( random.randint(1,len(nodes)/2))) + (random.sample(set(nodes[-len(nodes)/2:]), random.randint(1,len(nodes)/2)))
         tempr = random.sample(set(TC.data), random.randint(3,len(TC)))
         #tempr = TC.data
         arbol = id3(gr,target,vtarget,tempr)
         res = arbol.execute(tempn,"match (n:"+self.tipo+")",self.tipo,self.maxdepth,-999,999,exrel,umbral,target)
         tab = []
         for l in res.get_leaves():
             a,b,c = l.name.partition("*")
             tab.append(a) 
         entra = True
         for c in self.arboles:
             if entra and self.checkequals(c.arbol, res):
                 entra = False
         if len(res.get_edges()) > 2 and len(set(tab)) > 1 and entra:
             print "Arbol "+str(len(self.arboles)+1)+"("+str(len(tempn))+" nodos):"
             print res.get_ascii(show_internal=True)      
             self.arboles.append(arbol)
Example no. 5
def raise_forest(Xtrain, ytrain, n, train_size, att_size):
	print "Raising forest with " + str(n) + " trees"
	trees = []
	for i in xrange(n):
		sub_train_x, sub_train_ind = sample_with_rep(Xtrain, train_size)	
		sub_train_y = ytrain[sub_train_ind]
		examples = [(sub_train_x[i], sub_train_y[i]) for i in xrange(len(sub_train_ind))]
		sub_att, sub_att_ind = sample_with_rep(attributes,att_size)
		sub_att = set(sub_att)
		trees.append(id3.id3(examples, sub_att))	
	return trees
Example no. 6
def raise_forest(Xtrain, ytrain, n, train_size, att_size):
    print "Raising forest with " + str(n) + " trees"
    trees = []
    for i in xrange(n):
        sub_train_x, sub_train_ind = sample_with_rep(Xtrain, train_size)
        sub_train_y = ytrain[sub_train_ind]
        examples = [(sub_train_x[i], sub_train_y[i])
                    for i in xrange(len(sub_train_ind))]
        sub_att, sub_att_ind = sample_with_rep(attributes, att_size)
        sub_att = set(sub_att)
        trees.append(id3.id3(examples, sub_att))
    return trees
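raise_forest only returns the list of trees; turning them into a single prediction takes a majority vote. A minimal sketch, assuming a per-tree id3.classify(tree, x) helper like the one used in the cross-validation snippet below (Example no. 17), could be:

from collections import Counter

def forest_predict(trees, x):
    # Majority vote over the per-tree predictions; ties are broken arbitrarily.
    votes = Counter(id3.classify(tree, x) for tree in trees)
    return votes.most_common(1)[0][0]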
Example no. 7
def evaluate(U, k):
    D = get_decisions(U)
    Y = get_classes(U)

    shuffle(U)
    U = divide(U, k)

    evaluation = []
    for i in range(k):
        tree = id3(Y, D, substract(U, U[i]))
        evaluation.append(avg_loss(tree, U[i]))

    return sum(evaluation) / k
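The divide and substract helpers are not shown. Sketches consistent with how they are used above (split U into k folds, then rebuild a flat training set without the held-out fold) might be:

def divide(U, k):
    # Split U into k folds of roughly equal size; the last fold absorbs the remainder.
    size = len(U) // k
    folds = [U[i * size:(i + 1) * size] for i in range(k - 1)]
    folds.append(U[(k - 1) * size:])
    return folds

def substract(folds, test_fold):
    # Rebuild a flat training list from every fold except the held-out one.
    return [x for fold in folds if fold is not test_fold for x in fold]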
Example no. 8
def __k_fold(dataset, k):
    split_dataset = np.array_split(dataset, k)
    results = pd.DataFrame(columns=["TP", "TN", "FP", "FN"])
    for i in range(k):
        train_set = split_dataset.copy()
        test_set = split_dataset[i]
        del train_set[i]
        train = pd.concat(train_set, sort=False)

        attributes = train.keys().drop(config.file_label)
        tree = id3.id3(train, attributes)
        tmp_results = __test(tree, test_set)
        results = results.append(tmp_results, ignore_index=True)

    results = results.sum()
    return results
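The summed TP/TN/FP/FN counts returned above translate directly into the usual metrics. A small helper for that step, assuming a Series-like results object with those four keys, might be:

def summarize(results):
    # results holds the summed TP, TN, FP and FN counts.
    tp, tn, fp, fn = results["TP"], results["TN"], results["FP"], results["FN"]
    acc = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return acc, precision, recall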
Example no. 9
def validation_of_full_set_multirun_for_different_dataset_size(dataset, starting_set_part, min_set_part):
    full_results = pd.DataFrame(columns=COLUMNS)
    dataset_size = len(dataset)
    part = starting_set_part
    while part >= min_set_part:
        print("part: " + str(part))
        results = pd.DataFrame(columns=["TP", "TN", "FP", "FN"])
        training_size = round(part * dataset_size)
        for i in range(config.num_of_reruns):
            print("rerun: " + str(i))
            training = dataset.sample(frac=part, random_state=config.rng_seed + i)
            attributes = training.keys().drop(config.file_label)
            tree = id3.id3(training, attributes)
            results = results.append(__test(tree, dataset))

        dataframe = __build_final_dataframe_for_full_validation(results, training_size, dataset_size)
        full_results = full_results.append(dataframe, ignore_index=True)
        part /= 2

    return full_results
Example no. 10
    print_time = cmd_args.time
    print_tree = cmd_args.tree
    print_smt = cmd_args.smt
    print_model = cmd_args.model
    if cmd_args.verbose:
        print_tree = True
        print_smt = True
        print_model = True

    #print("# reading from stdin")
    header, samples = parse(sys.stdin)

    # print("# getting upper bound from ID3")
    if print_time: start = time.time()

    id3_sol = id3(samples)
    if print_time:
        end = time.time()
        id3_time = end - start

    if (id3_sol == -1):
        print(f"UNSAT")
        exit(0)

    results = {}
    upper_bound = max(3, id3_sol)  # because our solver won't work with N < 3
    for solver_i in solvers:
        for search in [
                searches.SAT_UNSAT(3, upper_bound),
                searches.UNSAT_SAT(3, upper_bound),
                searches.Binary(3, upper_bound)
Example no. 11
def readCommand( argv ):
  "Processes the command used to run from the command line."
  from optparse import OptionParser  
  parser = OptionParser(USAGE_STRING)
  
  parser.add_option('-c', '--classifier', help=default('The type of classifier'), choices=['id3','mostFrequent', 'nb', 'naiveBayes', 'perceptron', 'mira', 'minicontest'], default='mostFrequent')
  parser.add_option('-d', '--data', help=default('Dataset to use'), choices=['digits', 'faces'], default='digits')
  parser.add_option('-t', '--training', help=default('The size of the training set'), default=100, type="int")
  parser.add_option('-f', '--features', help=default('Whether to use enhanced features'), default=False, action="store_true")
  parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'), default=False, action="store_true")
  parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"), default=0, type="int")
  parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"), default=1, type="int")
  parser.add_option('-w', '--weights', help=default('Whether to print weights'), default=False, action="store_true")
  parser.add_option('-k', '--smoothing', help=default("Smoothing parameter (ignored when using --autotune)"), type="float", default=2.0)
  parser.add_option('-a', '--autotune', help=default("Whether to automatically tune hyperparameters"), default=False, action="store_true")
  parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")
  parser.add_option('-s', '--test', help=default("Amount of test data to use"), default=TEST_SET_SIZE, type="int")

  options, otherjunk = parser.parse_args(argv)
  if len(otherjunk) != 0: raise Exception('Command line input not understood: ' + str(otherjunk))
  args = {}
  
  # Set up variables according to the command line input.
  print "Doing classification"
  print "--------------------"
  print "data:\t\t" + options.data
  print "classifier:\t\t" + options.classifier
  if not options.classifier == 'minicontest':
    print "using enhanced features?:\t" + str(options.features)
  else:
    print "using minicontest feature extractor"
  print "training set size:\t" + str(options.training)
  if(options.data=="digits"):
    printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
    if (options.features):
      featureFunction = enhancedFeatureExtractorDigit
    else:
      featureFunction = basicFeatureExtractorDigit
    if (options.classifier == 'minicontest'):
      featureFunction = contestFeatureExtractorDigit
  elif(options.data=="faces"):
    printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
    if (options.features):
      featureFunction = enhancedFeatureExtractorFace
    else:
      featureFunction = basicFeatureExtractorFace      
  else:
    print "Unknown dataset", options.data
    print USAGE_STRING
    sys.exit(2)
    
  if(options.data=="digits"):
    legalLabels = range(10)
  else:
    legalLabels = range(2)
    
  if options.training <= 0:
    print "Training set size should be a positive integer (you provided: %d)" % options.training
    print USAGE_STRING
    sys.exit(2)
    
  if options.smoothing <= 0:
    print "Please provide a positive number for smoothing (you provided: %f)" % options.smoothing
    print USAGE_STRING
    sys.exit(2)
    
  if options.odds:
    if options.label1 not in legalLabels or options.label2 not in legalLabels:
      print "Didn't provide a legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2)
      print USAGE_STRING
      sys.exit(2)

  if(options.classifier == "id3"):
    classifier = id3.id3(legalLabels)
    if (options.autotune):
        print "using automatic tuning for id3"
        classifier.automaticTuning = True
  elif(options.classifier == "mostFrequent"):
    classifier = mostFrequent.MostFrequentClassifier(legalLabels)
  elif(options.classifier == "naiveBayes" or options.classifier == "nb"):
    classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    classifier.setSmoothing(options.smoothing)
    if (options.autotune):
        print "using automatic tuning for naivebayes"
        classifier.automaticTuning = True
    else:
        print "using smoothing parameter k=%f for naivebayes" %  options.smoothing
  elif(options.classifier == "perceptron"):
    classifier = perceptron.PerceptronClassifier(legalLabels,options.iterations)
  elif(options.classifier == "mira"):
    classifier = mira.MiraClassifier(legalLabels, options.iterations)
    if (options.autotune):
        print "using automatic tuning for MIRA"
        classifier.automaticTuning = True
    else:
        print "using default C=0.001 for MIRA"
  elif(options.classifier == 'minicontest'):
    import minicontest
    classifier = minicontest.contestClassifier(legalLabels)
  else:
    print "Unknown classifier:", options.classifier
    print USAGE_STRING
    
    sys.exit(2)

  args['classifier'] = classifier
  args['featureFunction'] = featureFunction
  args['printImage'] = printImage
  
  return args, options
Example no. 12
            sys.stderr.write('\nerr>> ' + '\nerr>> '.join(pe) + '\n')
    return None if p.returncode == 20 else read_model(po)


if __name__ == "__main__":
    header, samples = parse_samples(sys.stdin)
    # print ("# solver:", solver)
    print_time = True

    if print_time:
        solver = ['timeout', '300', '/usr/bin/time', '-f', '%e'] + solver
    solver_time = 0
    num_solver_calls = 0

    # print("# getting upper bound from ID3")
    id3_cost, id3_model = id3(samples)

    if id3_cost == -1:
        # print(f"UNSAT")
        exit(0)

    # print('# id3', id3_cost)

    times_dict = {}
    for search_class in searches:
        for encoding in encodings:
            solver_time = 0
            time_per_call = {}
            num_solver_calls = 0

            if id3_cost <= 3:
Example no. 13
# get training data
id3TrainingData = open('data/id3train.txt', 'r').readlines()
id3TrainingMatrix = []
for line in id3TrainingData:
    id3TrainingMatrix.append(np.fromstring(line, dtype=float, sep=' '))
id3TrainingMatrix = np.array(id3TrainingMatrix)

# get test data
id3TestData = open('data/id3test.txt', 'r').readlines()
id3TestMatrix = []
for line in id3TestData:
    id3TestMatrix.append(np.fromstring(line, dtype=float, sep=' '))
id3TestMatrix = np.array(id3TestMatrix)

# ID3 test error without pruning
rootNode = id3(id3TrainingMatrix)
numErrors = 0
for row in id3TestMatrix:
    currNode = rootNode
    while isinstance(currNode.data, list):
        featureIndex = currNode.data[1]
        featureVal = currNode.data[0]
        if row[featureIndex] <= featureVal:
            currNode = currNode.yesPtr
        else:
            currNode = currNode.noPtr
    if currNode.data != row[-1]:
        numErrors = numErrors + 1
print(float(numErrors) / len(id3TestMatrix))
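The traversal loop above is repeated again for the pruning experiment that follows; a minimal sketch pulling it into a helper (same node layout: data = [threshold, feature_index] on internal nodes, a class value on leaves) could be:

def classify_row(root, row):
    # Follow yesPtr when the feature value is <= the node threshold, noPtr otherwise.
    node = root
    while isinstance(node.data, list):
        threshold, feature_index = node.data[0], node.data[1]
        node = node.yesPtr if row[feature_index] <= threshold else node.noPtr
    return node.data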

# ID3 test error with pruning
Example no. 14
    feature_indices = range(n_features)
    data_size = int(data_size * train_data.shape[0])
    feature_size = int(feature_size * n_features)
    split_size = int(split_size * n_features)
    max_depth = n_features + 2  # No pruning

    trees = []
    for i in range(num_trees):
        ri = np.random.choice(data_indices, size=data_size, replace=True)
        rf = np.random.choice(feature_indices,
                              size=n_features - feature_size,
                              replace=False)

        tree, depth = id3(train_data[ri],
                          train_labels[ri],
                          used_features=rf,
                          max_depth=max_depth,
                          split_size=split_size)
        trees.append(tree)

        if (i + 1) % (num_trees / 10) == 0:
            print(str((i + 1) / num_trees * 100) + '%')

    np.save('../data/trees', trees)
    train_acc = evaluate_forest(train_data, train_labels, trees,
                                '../data/new_train')
    test_acc = evaluate_forest(test_data, test_labels, trees,
                               '../data/new_test')

    def predictor(row):
        return mode(list(map(lambda tree: classify(row, tree), trees)))[0][0]
Example no. 15
import id3 as id3
from data import Data

DATA_DIR = 'data/'


def get_data_obj(filename):
    data = np.loadtxt(DATA_DIR + filename + '.csv', delimiter=',', dtype=str)
    return Data(data=data)


if __name__ == '__main__':

    print("\nFull Decision Tree: ")
    data_obj = get_data_obj('train')
    id3_tree = id3.id3(data_obj, data_obj.attributes,
                       data_obj.get_column('label'))

    error, depth = id3.report_error(data_obj, id3_tree)
    print("Accuracy on training data: {}%; Depth: {}".format(
        100 - error, depth))

    data_obj_test = get_data_obj('test')

    error, depth = id3.report_error(data_obj_test, id3_tree)
    print("    Accuracy on test data: {}%; Depth: {}".format(
        100 - error, depth))

    print("\nTree with Max Depth 5")

    max_depth = 5
    pruned_tree = id3.pruning_tree(id3_tree, max_depth)
Example no. 16
	if instance[label_attribute] not in monks_labels: monks_labels.append(instance[label_attribute])

digits_attributes, digits_full_dataset = iotools.parse_file('../DecisionTrees/data/opticalDigit.csv')
digits_labels = []
for instance in digits_full_dataset:
	if instance[label_attribute] not in digits_labels: digits_labels.append(instance[label_attribute])

results = [[[],[],[]],[[],[],[]]]
for seed in range(100,130):
	partitions = iotools.split_dataset(monks_full_dataset, seed=seed, num_partitions=3)
	monks_training_set = []
	for p in partitions[:-1]:
		monks_training_set += p
	monks_test_set = partitions[-1]
	# run id3 on monks
	tree = id3.id3(copy.deepcopy(monks_attributes), monks_training_set)
	id3_labels, id3_matrix = test(monks_test_set, tree.classify, copy.deepcopy(monks_labels))
	print(iotools.print_confusion_matrix(id3_matrix, id3_labels))
	print(accuracy(id3_matrix))
	results[0][0].append(accuracy(id3_matrix))
	# run c4.5 on monks
	rules = c45.c45(copy.deepcopy(monks_attributes), monks_training_set)
	c45_labels, c45_matrix = test(monks_test_set, lambda inst: decision_rule.classify_on_rule_list(inst, rules), copy.deepcopy(monks_labels))
	print(iotools.print_confusion_matrix(c45_matrix, c45_labels))
	print(accuracy(c45_matrix))
	results[0][1].append(accuracy(c45_matrix))
	# run NB on monks
	nb_attributes = copy.deepcopy(monks_attributes)
	nb_attributes.remove(label_attribute)
	nb = naive_bayes.BayesianClassifier()
	nb.train(monks_training_set, nb_attributes)
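The accuracy(matrix) calls above are not defined in the snippet. Assuming the confusion matrix is a square nested list indexed by [true][predicted] (or its transpose, which gives the same accuracy), a sketch consistent with that usage is the diagonal divided by the total count:

def accuracy(matrix):
    # Correct classifications sit on the diagonal of a square confusion matrix.
    correct = sum(matrix[i][i] for i in range(len(matrix)))
    total = sum(sum(row) for row in matrix)
    return correct / total if total else 0.0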
Example no. 17
	test_block = partitions[j]
	test_label_block = label_partitions[j] 
	train_examples = zip(training_block, training_label_block.T.tolist()[0])
	test_examples = zip(test_block, test_label_block.T.tolist()[0])

	#cross validation for random forest
	print "Cross Validating Random Forest..."
	train_size = int(training_block.shape[0])
	att_size = int(len(attributes))
	forest_size = 100
	[ensemble_error, ensemble_pred] = rf.ensemble(test_block, test_label_block, rf.raise_forest(training_block,training_label_block, forest_size, train_size, att_size))
	error[0]+= (1.0/k) * ensemble_error

	#cross validation for decision tree
	print "Cross Validating Decision Tree..."
	dec_tree = id3.id3(train_examples, attributes)
	dec_tree_errors = 0
	for i in xrange(len(test_block)):
		if id3.classify(dec_tree, test_block[i]) != test_label_block[i]:
			dec_tree_errors += 1
	error[1] += (1.0/k) * (float(dec_tree_errors) / set_size)

	print "Cross Validating AdaBoost..."
	adaboost_classifier = adaboost.adaboost(train_examples, adaboost_rounds)
	adaboost_errors = 0
	for i in xrange(len(test_block)):
		if adaboost.classify(adaboost_classifier, test_block[i]) != test_label_block[i]:
			adaboost_errors += 1
	error[2] += (1.0/k) * (float(adaboost_errors) / set_size)

Example no. 18
 def setUp(self):
     data = np.loadtxt(DATA_DIR + 'tennis.csv', delimiter=' ', dtype=str)
     self.data_obj = Data(data=data)
     self.id3_tree = id3(self.data_obj, self.data_obj.attributes, self.data_obj.get_column('label'))
Example no. 19
def main_slow():
    treenums = [1, 100, 200, 300, 400, 500, 700, 900, 1000]
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]

    for element in trainset:
        element['weight'] = 1
    trainAcc = []
    testAcc = []

    for num in treenums:
        treelist = []
        for i in range(num):
            newTraining = rand.choices(trainset, k=len(trainset))
            newTree = id3(newTraining, labels, label_attr, labels[-1], 18,
                          'entropy', None)
            treelist.append(newTree)
        trainPred = []
        testPred = []
        for entry in trainset:
            thing = bag_guess(treelist, entry, labels, label_attr['outcome'])
            trainPred.append(thing)
        for entry in testset:
            thing = bag_guess(treelist, entry, labels, label_attr["outcome"])
            testPred.append(thing)
        trainAcc.append(accuracy(trainPred, trainLabels))
        testAcc.append(accuracy(testPred, testLabels))
        print(trainAcc, 'train accuracy')
        print(testAcc, 'test accuracy')

    tree_preds = []
    basics = []
    for i in range(50):
        train_i = rand.choices(trainset, k=1000)
        treelist_i = []
        for j in range(300):
            train_j = rand.choices(train_i, k=1000)
            newTree = id3(train_j, labels, label_attr, labels[-1], 18,
                          'entropy', None)
            treelist_i.append(newTree)
#            if j%100 == 0:
#                print("100 more trees from set", i, 'have been trained.  iteration = ',j)
        tree_preds.append(treelist_i)
        basics.append(treelist_i[0])
        print("Tree set", i, "has been trained")

    singleVar = []
    singleBias = []
    singleMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for tree in basics:
            guess = predict(tree, entry, labels)
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(basics)
        singleMean.append(ave)
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2
        singleBias.append(bias)
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        var = (1 / (len(basics) - 1)) * sum(subVar)
        singleVar.append(var)

    bagVar = []
    bagBias = []
    bagMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for trees in tree_preds:
            guess = bag_guess(trees, entry, labels, label_attr['outcome'])
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(basics)
        bagMean.append(ave)
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2
        bagBias.append(bias)
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        var = (1 / (len(basics) - 1)) * sum(subVar)
        bagVar.append(var)

    sVariance = mean(singleVar)
    sBias = mean(singleBias)
    sMSE = sBias + sVariance
    print("The bias and the variance of the single trees are: Variance:",
          sVariance, 'Bias:', sBias, "and the general squared error is:", sMSE)
    bVariance = mean(bagVar)
    bBias = mean(bagBias)
    bMSE = bBias + bVariance
    print("The bias and the variance of the bagged trees are: Variance:",
          bVariance, 'Bias:', bBias, "and the general squared error is:", bMSE)
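The single-tree and bagged loops above repeat the same per-example computation. A sketch factoring it out, where predictions is the list of 0/1 votes for one test example and value its 0/1 ground truth (at least two votes assumed), might be:

def bias_variance(predictions, value):
    # Sample mean of the votes, squared bias against the truth, unbiased sample variance.
    ave = sum(predictions) / len(predictions)
    bias = (value - ave) ** 2
    var = sum((h - ave) ** 2 for h in predictions) / (len(predictions) - 1)
    return ave, bias, var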
Example no. 20
                continue

            datas.append(np.loadtxt(DATA_DIR + filename + '.csv', delimiter=',', dtype=str))

        data = np.concatenate(datas)
        data_obj = Data(data=data)
        train_objs.append(data_obj)

    avg_accuracies = []
    for max_depth in depths:
        accuracies = []
        print('**********************************')
        print('****** Hyperparameter is {} ******'.format(max_depth))
        print('**********************************\n')

        for i in range(len(filenames)):
            id3_tree = id3.id3(train_objs[i], train_objs[i].attributes, train_objs[i].get_column('label'))
            pruned_tree = id3.pruning_tree(id3_tree, max_depth)

            error, depth = id3.report_error(test_objs[i], pruned_tree)
            accuracies.append(100.0-error)

            print('***** Testing on {} *****'.format(filenames[i]))

        avg_accuracy = st.mean(accuracies)
        avg_accuracies.append(avg_accuracy)
        print("Average accuracy: {}%; Standard Deviation: {}\n".format(avg_accuracy, np.std(accuracies)))

    # print(dict(zip(depths, avg_accuracies)))

Example no. 21
import functions as f
import id3


attributes, data, output = f.read_data("train.txt")


for i, j in zip(attributes, data):
    print(i, j, sep='\t')
print(attributes[-1], output, sep='\t')


decision_tree = id3.id3(attributes, data, output)
print('\n', decision_tree.to_string(1))


f.draw_dt(decision_tree)
Example no. 22
import pydot
import utils
import id3
import decisiontree

# Read weather dataset
print('Reading weather dataset...')
weatherAttributes, weatherDataSet = utils.readDataSet(
    './datasets/weatherDataSetTrain.csv')
weatherTargetAttribute = weatherAttributes[-1]
weatherAttributes.remove(weatherTargetAttribute)

# Train
print('Training weather dataset...')
weatherTree = id3.id3(weatherAttributes, weatherTargetAttribute,
                      weatherDataSet)
print(weatherTree)

print('Plotting weather decision tree...')
weatherGraph = pydot.Dot(graph_type='digraph')
decisiontree.drawTree(weatherGraph, weatherTree)
weatherGraph.write('./images/weather.png', prog=None, format='png')
print('Done.')

# Read car evaluation dataset
print('===========================================')
print('Reading car evaluation training dataset...')
carAttributes, carEvaluationTrainDataSet = utils.readDataSet(
    './datasets/car-evaluation-train.csv')
targetAttribute = carAttributes[-1]
carAttributes.remove(targetAttribute)
Example no. 23
	return examples

# given a list of test examples, the target attribute, and a decision tree,
# returns a count of correct and incorrect classifications
def test_tree(tree, examples, target):
	def classify(example, tree):		
		while isinstance(tree, dict):
			attr = list(tree.keys())[0]
			attr_val = example[attr]	
			if attr_val not in tree[attr]:
				c = Counter([e[attr] for e in examples])
				attr_val = c.most_common(1)[0][0]
			tree = tree[attr][attr_val]			
		return tree
	
	return Counter([classify(e, tree) == e[target] for e in examples])

train_file = 'vote_train.txt'
training = preprocess(train_file)

test_file = 'vote_test.txt'
testing = preprocess(test_file)

target = 'party'
attributes = [a for a in training[0].keys() if a != target]

tree = id3(training, attributes, target)

results = test_tree(tree, testing, target)
print('%f%% correct' % (results[True] * 100.0 / len(testing)))
Example no. 24
def main():
    if sys.argv[1] == '-h':
        print("TODO: use argparse and put pretty messages here")
        sys.exit(0)
    if (len(sys.argv) < 4):
        print(
            "There's a missing parameter. Remember to include path to data, alg and seed"
        )
        sys.exit(1)

    path = sys.argv[1]
    algorithm = sys.argv[2].lower()
    seed = sys.argv[3]
    if (len(sys.argv) == 5):
        output_dir = sys.argv[4]
    else:
        output_dir = './'
    attributes, full_dataset = iotools.parse_file(path)
    partitions = iotools.split_dataset(full_dataset,
                                       seed=seed,
                                       num_partitions=3)
    labels = []
    label_attribute = 'label'
    for instance in full_dataset:
        if instance[label_attribute] not in labels:
            labels.append(instance[label_attribute])

    training_set = []
    for p in partitions[:-1]:
        training_set += p
    if algorithm == 'id3':
        tree = id3.id3(attributes, training_set)
        #iotools.output_graph_image_source(tree, 'pretty_picture.gv')
        labels, matrix = test(partitions[-1], tree.classify, labels)
    elif algorithm == 'c4.5':
        rule_list = c45.c45(attributes, training_set)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        print(decision_rule.rule_list_to_tree(rule_list))
        labels, matrix = test(
            partitions[-1],
            lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
            labels)
    elif algorithm == 'c4.5np':
        rule_list = c45.c45(attributes, training_set, pruning=False)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        labels, matrix = test(
            partitions[-1],
            lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
            labels)
    elif algorithm == 'c4.5nsi':
        rule_list = c45.c45(attributes, training_set, split_info=False)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        labels, matrix = test(
            partitions[-1],
            lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
            labels)
    elif algorithm == 'naivebayes':
        attributes.remove(label_attribute)
        nb = naive_bayes.BayesianClassifier()
        nb.train(training_set, attributes)
        labels, matrix = test(partitions[-1],
                              lambda inst: nb.classify(inst, attributes),
                              labels)
    #elif algorithm == 'neuralnets':

    #        nn = ?
    #        nn.train(training_set, attributes)
    else:
        print("Sorry, that algorithm is not implemented yet")
        sys.exit(1)
    iotools.output_confusion_matrix(matrix, labels,
                                    re.sub(r'.*/([^/\.]*)\.csv', r'\1', path),
                                    algorithm, seed, output_dir)
Example no. 25
        # header = header[1:]
        print(header+"|"+nestedList)
    else:
        # print(nestedList[0])
        header += "  "
        for index in range(0,len(nestedList)):
            printList(nestedList[index],header)

# def printList(nestedList):
#     if len(nestedList) == 0:
#         return



relationName, attributeList, dataList = reader.readARFF("contact-lenses.arff")
tree = id3.id3(dataList,attributeList,dataList)
header = ""
print(relationName)
printList(tree,header)

relationName, attributeList, dataList = reader.readARFF("restaurants.arff")
tree = id3.id3(dataList,attributeList,dataList)
header = ""
print("\n\n"+relationName)
printList(tree,header)

relationName, attributeList, dataList = reader.readARFF("weather.nominal.arff")
tree = id3.id3(dataList,attributeList,dataList)
header = ""
print("\n\n"+relationName)
printList(tree,header)
Example no. 26
def main(args):
    dataset = stateoftheunions()

    for sample in dataset:
        preprocess(sample)

    common_words = AtLeastNDups((word for sample in dataset \
                                      for sent in sample['speech'] \
                                      for word in sent), 4)

    for sample in dataset:
        unkify(sample, common_words)

    computefeatures(dataset, common_words.union(set(unknown_token)))

    for sample in dataset:
        sample['classes'] = [0] if sample['party'] == "Republican" else [1]

    featureset = set()
    for sample in dataset:
        featureset = featureset.union(set(sample['features'].keys()))

    #globalfeatureset = set((feature for feature in featureset if all((feature in sample['features'] for sample in dataset))))

    commonfeatureset = set((feature for feature in featureset if sum((1 if feature in sample['features'] else 0 for sample in dataset)) > 5))

    print "featureset len", len(featureset)
    print "commonfeaturest len", len(commonfeatureset)

    print "binarizing"

    splits = {feature: decisionsplit(dataset, feature) for feature in commonfeatureset}

    bfeatureset = set()

    for sample in dataset:
        sample['bfeatures'] = Counter()
        for feature in [f for f in sample['features'] if f in commonfeatureset]:
            bfeature = str(feature) + ' > ' + str(splits[feature])
            if not bfeature in bfeatureset:
                bfeatureset = bfeatureset.union(set([bfeature]))
            if sample['features'][feature] <= splits[feature]:
                sample['bfeatures'][bfeature] = 0
            else:
                sample['bfeatures'][bfeature] = 1

    print "filtering"

    featureentropy = {}
    for feature in bfeatureset:
        ent0 = entropy(normalize({feature: sample['classes'][0] for sample in dataset if sample['bfeatures'][feature] == 0}))
        ent1 = entropy(normalize({feature: sample['classes'][0] for sample in dataset if sample['bfeatures'][feature] == 1}))
        featureentropy[feature] = (len([s for s in dataset if s['bfeatures'][feature] == 0]) / len(dataset)) * ent0 + (len([s for s in dataset if s['bfeatures'][feature] == 1]) / len(dataset)) * ent1

    bestfeatures = sorted(featureentropy, key=featureentropy.get)[:1000]

    training = [{'features': Counter({f: sample['bfeatures'][f] for f in sample['bfeatures'] if f in bestfeatures}), 'classes': sample['classes']} for sample in dataset]

    baselinetrainer = lambda t: baselineclassifier

    baselinecv = stratifiedcrossvalidate(baselinetrainer, training, 5)

    print "cross-validated accuracy with baseline:", str(baselinecv)
    print "average accuracy:", str(sum(baselinecv) / 5.0)


    nbtrainer = lambda t: lambda s: naivebayesclassify(t, bestfeatures, s)

    nbcv = stratifiedcrossvalidate(nbtrainer, training, 5)

    print "cross-validated accuracy with naive bayes:", str(nbcv)
    print "average accuracy:", str(sum(nbcv) / 5.0)


    for maxdepth in [5, 10, 20, 40, 80]:

        id3trainer = lambda t: id3(t, bestfeatures, maxdepth).classify

        id3cv = stratifiedcrossvalidate(id3trainer, training, 5)

        print "cross-validated accuracy with id3 and max depth of", str(maxdepth), ":", str(id3cv)
        print "average accuracy:", str(sum(id3cv) / 5.0)



    for maxdepth in [5, 10, 20, 40, 80]:
        for numtrees in [64, 80, 96, 112, 128]:
            rftrainer = lambda t: RandomForest(lambda o,f: id3(o, f, maxdepth).classprobabilities, t, bestfeatures, numtrees).classify

            rfcv = stratifiedcrossvalidate(rftrainer, training, 5)

            print "cross-validated accuracy with random forest with", str(numtrees), "trees and", str(maxdepth), "max depth:", str(rfcv)
            print "average accuracy:", str(sum(rfcv) / 5.0)

    """
    use random forest since it has by far the highest accuracy
    random forest democrat/republican semantic differential:
    rf.classprobabilities(speech)[(1,)]
    where 0 is republican, 1 is democrat
    this is just the probability of democrat
    so when it is low, we are closer to republican (0)
    and when it is high, we are closer to democrat (1)

    """

    """
Example no. 27
        '../data/data-splits/data.train', n_features=n_features, preprocessor=preprocessor)
    test_data, test_labels = load_data(
        '../data/data-splits/data.test', n_features=n_features, preprocessor=preprocessor)

    cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)

    max_acc = 0
    opt_depth = 0
    for i in range(2, n_features + 2):
        acc = []

        for j in range(len(cv_data)):
            cv_test = cv_data[j]
            cv_train = np.vstack(cv_data[:j] + cv_data[j + 1:])

            tree, depth = id3(cv_train[:, :-1], cv_train[:, -1], max_depth=i)

            cv_acc = evaluate_tree(cv_test[:, :-1], cv_test[:, -1], tree)
            acc.append(cv_acc)

        avg_acc = np.mean(acc)
        if avg_acc > max_acc:
            opt_depth = i
            max_acc = avg_acc

    tree, depth = id3(train_data, train_labels, max_depth=opt_depth)
    train_acc = evaluate_tree(train_data, train_labels, tree)
    test_acc = evaluate_tree(test_data, test_labels, tree)

    write_output('ID3', opt_depth, max_acc, train_acc, test_acc)
    write_predictions('id3', lambda row: classify(row, tree),
Example no. 28
__author__ = 'paul'
#no other partner for this homework

import nightoutparser
import id3
import nightoutdata


def constructQuery():
    return nightoutdata.Row('Large', 'Moderate', 'Cheap', 'Loud',
                            'City-Center', 'No', 'No', "")


OUTCOME = nightoutdata.ENJOY
rows = nightoutparser.parsefile("dt-data.txt")

node = id3.id3(rows, nightoutdata.allnontargetattributes, nightoutdata.ENJOY)

id3.visit([node])
id3.check(rows, node)

queryrow = constructQuery()
prediction = id3.getDecision(queryrow, node)
print('Prediction to enjoy was ' + prediction)
Example no. 29
import file_manager
import id3
import config

dataset = file_manager.read_dataset()

attributes = dataset.keys().drop(config.file_label)
tree = id3.id3(dataset, attributes)
id3.print_tree(tree)
print(tree)