Esempi in Python per id3, esempi in Python per id3.id3

Esempio n. 1

0

Mostra file

File: main_bank.py Progetto: TravDraper/CS-6350-Code

def main():
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    fix = fixUnknown(trainset, labels)
    trainsetU = replaceUnknown(trainset, labels, fix)
    testsetU = replaceUnknown(testset, labels, fix)

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]
    #print(trainset[1])
    print(
        "Running decision tree algorithm on the bank dataset with unknown values"
    )
    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainset, labels, label_attr, labels[-1], i,
                              item, None)

            trainPred = [predict(currentTree, x, labels) for x in trainset]
            testPred = [predict(currentTree, x, labels) for x in testset]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)

    print(
        "Running decision tree algorithm on the bank dataset with unknown's replaced"
    )
    print("\n \n \n \n \n")

    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainsetU, labels, label_attr, labels[-1], i,
                              item, None)

            trainPred = [predict(currentTree, x, labels) for x in trainsetU]
            testPred = [predict(currentTree, x, labels) for x in testsetU]

            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)

Esempio n. 2

0

Mostra file

File: main_car.py Progetto: TravDraper/CS-6350-Code

def main():
    trainset, trainRaw = makeData(training, labels)
    testset, testRaw = makeData(testing, labels)
    trainLabels = [item[-1] for item in trainRaw]
    testLabels = [item[-1] for item in testRaw]

    #print(gf.gainE(trainset,labels[5], labels[-1]))

    #    mytree = id3(trainset, labels, label_attr, labels[-1], 6, "entropy", None)
    #    printTree(mytree)
    print("Running the decision tree algorithm on the 'Cars' dataset.")

    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 7):
            currentTree = id3.id3(trainset, labels, label_attr, labels[-1], i,
                                  item, None)

            trainPred = [id3.predict(currentTree, x, labels) for x in trainset]
            testPred = [id3.predict(currentTree, x, labels) for x in testset]

            trainAcc = id3.accuracy(trainPred, trainLabels)
            testAcc = id3.accuracy(testPred, testLabels)

            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)

Esempio n. 3

0

Mostra file

File: bank_boost.py Progetto: TravDraper/CS-6350-Code

def boost(data, labels, attr_list, target, iterations, answers):
    treelist = []
    for i in range(iterations):
        normalize = 0
        for item in data:
            normalize += item['weight']
        currentTree = id3(data, labels, attr_list, target, 2, 'entropy', None)
        #        predictor = []
        #        for item in data:
        #            predictor.append([predict(currentTree, item, labels), item['weight']])

        trainError = 0
        #        print('weight =' , data[0]['weight'])
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                trainError += item['weight'] / normalize
#        print(trainError)
#        print(normalize)
#        alpha = 1/2*m.log((1-trainError)/trainError)
        alpha = 1 / 4 * m.log((1 - trainError) / trainError)
        #        print(m.exp(alpha), m.exp(-alpha))
        #        print(alpha, " = alpha")
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                item['weight'] = item['weight'] * m.exp(alpha)
#                print(item['weight'])
            else:
                item['weight'] = item['weight'] * m.exp(-alpha)


#                print(item['weight'])
        treelist.append({'tree': currentTree, 'alpha': alpha})
        #print(treelist)
    return treelist

Esempio n. 4

0

Mostra file

File: rf.py Progetto: palmagro/mrrf

 def __init__(self,gr,nodes,tipo,target,vtarget,narboles,nnodes,ntrels,maxdepth,exrel,umbral):
     self.graph_db = gr
     self.tipo = tipo
     self.target = target
     self.vtarget = vtarget
     self.nnodes = nnodes
     self.ntrels = ntrels
     self.maxdepth = maxdepth
     
     TC = neo4j.CypherQuery(self.graph_db, "MATCH (a)-[r]->(b) WHERE labels(a) <> [] AND labels(b) <> [] RETURN DISTINCT head(labels(a)) AS This, type(r) as To, head(labels(b)) AS That limit "+str(self.ntrels)).execute()
     print "Tipos de aristas cargadas: "+ str(len(TC.data)) + " elementos."
     while(len(self.arboles)<narboles):  
         tempn = nodes#random.sample(nodes , ( random.randint(1,len(nodes)/2))) + (random.sample(set(nodes[-len(nodes)/2:]), random.randint(1,len(nodes)/2)))
         tempr = random.sample(set(TC.data), random.randint(3,len(TC)))
         #tempr = TC.data
         arbol = id3(gr,target,vtarget,tempr)
         res = arbol.execute(tempn,"match (n:"+self.tipo+")",self.tipo,self.maxdepth,-999,999,exrel,umbral,target)
         tab = []
         for l in res.get_leaves():
             a,b,c = l.name.partition("*")
             tab.append(a) 
         entra = True
         for c in self.arboles:
            if entra == True:
                if self.checkequals(c.arbol,res):
                 entra = not self.checkequals(c.arbol,res)
         if len(res.get_edges()) > 2 and len(set(tab)) > 1 and entra:    
             print "Arbol "+str(len(self.arboles)+1)+"("+str(len(tempn))+" nodos):"
             print res.get_ascii(show_internal=True)      
             self.arboles.append(arbol)

Esempio n. 5

0

Mostra file

File: rf.py Progetto: varunkp/BerkeleyCS

def raise_forest(Xtrain, ytrain, n, train_size, att_size):
	print "Raising forest with " + str(n) + " trees"
	trees = []
	for i in xrange(n):
		sub_train_x, sub_train_ind = sample_with_rep(Xtrain, train_size)	
		sub_train_y = ytrain[sub_train_ind]
		examples = [(sub_train_x[i], sub_train_y[i]) for i in xrange(len(sub_train_ind))]
		sub_att, sub_att_ind = sample_with_rep(attributes,att_size)
		sub_att = set(sub_att)
		trees.append(id3.id3(examples, sub_att))	
	return trees

Esempio n. 6

0

Mostra file

def raise_forest(Xtrain, ytrain, n, train_size, att_size):
    print "Raising forest with " + str(n) + " trees"
    trees = []
    for i in xrange(n):
        sub_train_x, sub_train_ind = sample_with_rep(Xtrain, train_size)
        sub_train_y = ytrain[sub_train_ind]
        examples = [(sub_train_x[i], sub_train_y[i])
                    for i in xrange(len(sub_train_ind))]
        sub_att, sub_att_ind = sample_with_rep(attributes, att_size)
        sub_att = set(sub_att)
        trees.append(id3.id3(examples, sub_att))
    return trees

Esempio n. 7

0

Mostra file

def evaluate(U, k):
    D = get_decisions(U)
    Y = get_classes(U)

    shuffle(U)
    U = divide(U, k)

    evaluation = []
    for i in range(k):
        tree = id3(Y, D, substract(U, U[i]))
        evaluation.append(avg_loss(tree, U[i]))

    return sum(evaluation) / k

Esempio n. 8

0

Mostra file

File: validation.py Progetto: steciuk/PSZT-ID3-algortihm

def __k_fold(dataset, k):
    split_dataset = np.array_split(dataset, k)
    results = pd.DataFrame(columns=["TP", "TN", "FP", "FN"])
    for i in range(k):
        train_set = split_dataset.copy()
        test_set = split_dataset[i]
        del train_set[i]
        train = pd.concat(train_set, sort=False)

        attributes = train.keys().drop(config.file_label)
        tree = id3.id3(train, attributes)
        tmp_results = __test(tree, test_set)
        results = results.append(tmp_results, ignore_index=True)

    results = results.sum()
    return results

Esempio n. 9

0

Mostra file

File: validation.py Progetto: steciuk/PSZT-ID3-algortihm

def validation_of_full_set_multirun_for_different_dataset_size(dataset, starting_set_part, min_set_part):
    full_results = pd.DataFrame(columns=COLUMNS)
    dataset_size = len(dataset)
    part = starting_set_part
    while part >= min_set_part:
        print("part: " + str(part))
        results = pd.DataFrame(columns=["TP", "TN", "FP", "FN"])
        training_size = round(part * dataset_size)
        for i in range(config.num_of_reruns):
            print("rerun: " + str(i))
            training = dataset.sample(frac=part, random_state=config.rng_seed + i)
            attributes = training.keys().drop(config.file_label)
            tree = id3.id3(training, attributes)
            results = results.append(__test(tree, dataset))

        dataframe = __build_final_dataframe_for_full_validation(results, training_size, dataset_size)
        full_results = full_results.append(dataframe, ignore_index=True)
        part /= 2

    return full_results

Esempio n. 10

0

Mostra file

    print_time = cmd_args.time
    print_tree = cmd_args.tree
    print_smt = cmd_args.smt
    print_model = cmd_args.model
    if cmd_args.verbose:
        print_tree = True
        print_smt = True
        print_model = True

    #print("# reading from stdin")
    header, samples = parse(sys.stdin)

    # print("# getting upper bound from ID3")
    if print_time: start = time.time()

    id3_sol = id3(samples)
    if print_time:
        end = time.time()
        id3_time = end - start

    if (id3_sol == -1):
        print(f"UNSAT")
        exit(0)

    results = {}
    upper_bound = max(3, id3_sol)  # because our solver won't work with N < 3
    for solver_i in solvers:
        for search in [
                searches.SAT_UNSAT(3, upper_bound),
                searches.UNSAT_SAT(3, upper_bound),
                searches.Binary(3, upper_bound)

Esempio n. 11

0

Mostra file

File: dataClassifier.py Progetto: iChiragMandot/Artificial-Intelligence-Algorithms

def readCommand( argv ):
  "Processes the command used to run from the command line."
  from optparse import OptionParser  
  parser = OptionParser(USAGE_STRING)
  
  parser.add_option('-c', '--classifier', help=default('The type of classifier'), choices=['id3','mostFrequent', 'nb', 'naiveBayes', 'perceptron', 'mira', 'minicontest'], default='mostFrequent')
  parser.add_option('-d', '--data', help=default('Dataset to use'), choices=['digits', 'faces'], default='digits')
  parser.add_option('-t', '--training', help=default('The size of the training set'), default=100, type="int")
  parser.add_option('-f', '--features', help=default('Whether to use enhanced features'), default=False, action="store_true")
  parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'), default=False, action="store_true")
  parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"), default=0, type="int")
  parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"), default=1, type="int")
  parser.add_option('-w', '--weights', help=default('Whether to print weights'), default=False, action="store_true")
  parser.add_option('-k', '--smoothing', help=default("Smoothing parameter (ignored when using --autotune)"), type="float", default=2.0)
  parser.add_option('-a', '--autotune', help=default("Whether to automatically tune hyperparameters"), default=False, action="store_true")
  parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"), default=3, type="int")
  parser.add_option('-s', '--test', help=default("Amount of test data to use"), default=TEST_SET_SIZE, type="int")

  options, otherjunk = parser.parse_args(argv)
  if len(otherjunk) != 0: raise Exception('Command line input not understood: ' + str(otherjunk))
  args = {}
  
  # Set up variables according to the command line input.
  print "Doing classification"
  print "--------------------"
  print "data:\t\t" + options.data
  print "classifier:\t\t" + options.classifier
  if not options.classifier == 'minicontest':
    print "using enhanced features?:\t" + str(options.features)
  else:
    print "using minicontest feature extractor"
  print "training set size:\t" + str(options.training)
  if(options.data=="digits"):
    printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage
    if (options.features):
      featureFunction = enhancedFeatureExtractorDigit
    else:
      featureFunction = basicFeatureExtractorDigit
    if (options.classifier == 'minicontest'):
      featureFunction = contestFeatureExtractorDigit
  elif(options.data=="faces"):
    printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage
    if (options.features):
      featureFunction = enhancedFeatureExtractorFace
    else:
      featureFunction = basicFeatureExtractorFace      
  else:
    print "Unknown dataset", options.data
    print USAGE_STRING
    sys.exit(2)
    
  if(options.data=="digits"):
    legalLabels = range(10)
  else:
    legalLabels = range(2)
    
  if options.training <= 0:
    print "Training set size should be a positive integer (you provided: %d)" % options.training
    print USAGE_STRING
    sys.exit(2)
    
  if options.smoothing <= 0:
    print "Please provide a positive number for smoothing (you provided: %f)" % options.smoothing
    print USAGE_STRING
    sys.exit(2)
    
  if options.odds:
    if options.label1 not in legalLabels or options.label2 not in legalLabels:
      print "Didn't provide a legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2)
      print USAGE_STRING
      sys.exit(2)

  if(options.classifier == "id3"):
    classifier = id3.id3(legalLabels)
    if (options.autotune):
        print "using automatic tuning for id3"
        classifier.automaticTuning = True
  elif(options.classifier == "mostFrequent"):
    classifier = mostFrequent.MostFrequentClassifier(legalLabels)
  elif(options.classifier == "naiveBayes" or options.classifier == "nb"):
    classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    classifier.setSmoothing(options.smoothing)
    if (options.autotune):
        print "using automatic tuning for naivebayes"
        classifier.automaticTuning = True
    else:
        print "using smoothing parameter k=%f for naivebayes" %  options.smoothing
  elif(options.classifier == "perceptron"):
    classifier = perceptron.PerceptronClassifier(legalLabels,options.iterations)
  elif(options.classifier == "mira"):
    classifier = mira.MiraClassifier(legalLabels, options.iterations)
    if (options.autotune):
        print "using automatic tuning for MIRA"
        classifier.automaticTuning = True
    else:
        print "using default C=0.001 for MIRA"
  elif(options.classifier == 'minicontest'):
    import minicontest
    classifier = minicontest.contestClassifier(legalLabels)
  else:
    print "Unknown classifier:", options.classifier
    print USAGE_STRING
    
    sys.exit(2)

  args['classifier'] = classifier
  args['featureFunction'] = featureFunction
  args['printImage'] = printImage
  
  return args, options

Esempio n. 12

0

Mostra file

File: proj4_compare.py Progetto: Marghrid/Algorithms-for-Computational-Logic

            sys.stderr.write('\nerr>> ' + '\nerr>> '.join(pe) + '\n')
    return None if p.returncode == 20 else read_model(po)


if __name__ == "__main__":
    header, samples = parse_samples(sys.stdin)
    # print ("# solver:", solver)
    print_time = True

    if print_time:
        solver = ['timeout', '300', '/usr/bin/time', '-f', '%e'] + solver
    solver_time = 0
    num_solver_calls = 0

    # print("# getting upper bound from ID3")
    id3_cost, id3_model = id3(samples)

    if id3_cost == -1:
        # print(f"UNSAT")
        exit(0)

    # print('# id3', id3_cost)

    times_dict = {}
    for search_class in searches:
        for encoding in encodings:
            solver_time = 0
            time_per_call = {}
            num_solver_calls = 0

            if id3_cost <= 3:

Esempio n. 13

0

Mostra file

# get training data
id3TrainingData = open('data/id3train.txt', 'r').readlines()
id3TrainingMatrix = []
for line in id3TrainingData:
    id3TrainingMatrix.append(np.fromstring(line, dtype=float, sep=' '))
id3TrainingMatrix = np.array(id3TrainingMatrix)

# get test data
id3TestData = open('data/id3test.txt', 'r').readlines()
id3TestMatrix = []
for line in id3TestData:
    id3TestMatrix.append(np.fromstring(line, dtype=float, sep=' '))
id3TestMatrix = np.array(id3TestMatrix)

# ID3 test error without pruning
rootNode = id3(id3TrainingMatrix)
numErrors = 0
for row in id3TestMatrix:
    currNode = rootNode
    while isinstance(currNode.data, list):
        featureIndex = currNode.data[1]
        featureVal = currNode.data[0]
        if row[featureIndex] <= featureVal:
            currNode = currNode.yesPtr
        else:
            currNode = currNode.noPtr
    if currNode.data != row[-1]:
        numErrors = numErrors + 1
print(float(numErrors) / len(id3TestMatrix))

# ID3 test error with pruning

Esempio n. 14

0

Mostra file

File: run_random_forest.py Progetto: kyledpierson/machine-learning

    feature_indices = range(n_features)
    data_size = int(data_size * train_data.shape[0])
    feature_size = int(feature_size * n_features)
    split_size = int(split_size * n_features)
    max_depth = n_features + 2  # No pruning

    trees = []
    for i in range(num_trees):
        ri = np.random.choice(data_indices, size=data_size, replace=True)
        rf = np.random.choice(feature_indices,
                              size=n_features - feature_size,
                              replace=False)

        tree, depth = id3(train_data[ri],
                          train_labels[ri],
                          used_features=rf,
                          max_depth=max_depth,
                          split_size=split_size)
        trees.append(tree)

        if (i + 1) % (num_trees / 10) == 0:
            print(str((i + 1) / num_trees * 100) + '%')

    np.save('../data/trees', trees)
    train_acc = evaluate_forest(train_data, train_labels, trees,
                                '../data/new_train')
    test_acc = evaluate_forest(test_data, test_labels, trees,
                               '../data/new_test')

    def predictor(row):
        return mode(list(map(lambda tree: classify(row, tree), trees)))[0][0]

Esempio n. 15

0

Mostra file

import id3 as id3
from data import Data

DATA_DIR = 'data/'


def get_data_obj(filename):
    data = np.loadtxt(DATA_DIR + filename + '.csv', delimiter=',', dtype=str)
    return Data(data=data)


if __name__ == '__main__':

    print("\nFull Decision Tree: ")
    data_obj = get_data_obj('train')
    id3_tree = id3.id3(data_obj, data_obj.attributes,
                       data_obj.get_column('label'))

    error, depth = id3.report_error(data_obj, id3_tree)
    print("Accuracy on training data: {}%; Depth: {}".format(
        100 - error, depth))

    data_obj_test = get_data_obj('test')

    error, depth = id3.report_error(data_obj_test, id3_tree)
    print("    Accuracy on test data: {}%; Depth: {}".format(
        100 - error, depth))

    print("\nTree with Max Depth 5")

    max_depth = 5
    pruned_tree = id3.pruning_tree(id3_tree, max_depth)

Esempio n. 16

0

Mostra file

	if instance[label_attribute] not in monks_labels: monks_labels.append(instance[label_attribute])

digits_attributes, digits_full_dataset = iotools.parse_file('../DecisionTrees/data/opticalDigit.csv')
digits_labels = []
for instance in digits_full_dataset:
	if instance[label_attribute] not in digits_labels: digits_labels.append(instance[label_attribute])

results = [[[],[],[]],[[],[],[]]]
for seed in range(100,130):
	partitions = iotools.split_dataset(monks_full_dataset, seed=seed, num_partitions=3)
	monks_training_set = []
	for p in partitions[:-1]:
		monks_training_set += p
	monks_test_set = partitions[-1]
	# run id3 on monks
	tree = id3.id3(copy.deepcopy(monks_attributes), monks_training_set)
	id3_labels, id3_matrix = test(monks_test_set, tree.classify, copy.deepcopy(monks_labels))
	print(iotools.print_confusion_matrix(id3_matrix, id3_labels))
	print(accuracy(id3_matrix))
	results[0][0].append(accuracy(id3_matrix))
	# run c4.5 on monks
	rules = c45.c45(copy.deepcopy(monks_attributes), monks_training_set)
	c45_labels, c45_matrix = test(monks_test_set, lambda inst: decision_rule.classify_on_rule_list(inst, rules), copy.deepcopy(monks_labels))
	print(iotools.print_confusion_matrix(c45_matrix, c45_labels))
	print(accuracy(c45_matrix))
	results[0][1].append(accuracy(c45_matrix))
	# run NB on monks
	nb_attributes = copy.deepcopy(monks_attributes)
	nb_attributes.remove(label_attribute)
	nb = naive_bayes.BayesianClassifier()
	nb.train(monks_training_set, nb_attributes)

Esempio n. 17

0

Mostra file

	test_block = partitions[j]
	test_label_block = label_partitions[j] 
	train_examples = zip(training_block, training_label_block.T.tolist()[0])
	test_examples = zip(test_block, test_label_block.T.tolist()[0])

	#cross validation for random forest
	print "Cross Validating Random Forest..."
	train_size = int(training_block.shape[0])
	att_size = int(len(attributes))
	forest_size = 100
	[ensemble_error, ensemble_pred] = rf.ensemble(test_block, test_label_block, rf.raise_forest(training_block,training_label_block, forest_size, train_size, att_size))
	error[0]+= (1.0/k) * ensemble_error

	#cross validation for decision tree
	print "Cross Validating Decision Tree..."
	dec_tree = id3.id3(train_examples, attributes)
	dec_tree_errors = 0
	for i in xrange(len(test_block)):
		if id3.classify(dec_tree, test_block[i]) != test_label_block[i]:
			dec_tree_errors += 1
	error[1] += (1.0/k) * (float(dec_tree_errors) / set_size)

	print "Cross Validating AdaBoost..."
	adaboost_classifier = adaboost.adaboost(train_examples, adaboost_rounds)
	adaboost_errors = 0
	for i in xrange(len(test_block)):
		if adaboost.classify(adaboost_classifier, test_block[i]) != test_label_block[i]:
			adaboost_errors += 1
	error[2] += (1.0/k) * (float(adaboost_errors) / set_size)

Esempio n. 18

0

Mostra file

File: testing_ground.py Progetto: em23/machine_learning_cs5350

 def setUp(self):
     data = np.loadtxt(DATA_DIR + 'tennis.csv', delimiter=' ', dtype=str)
     self.data_obj = Data(data=data)
     self.id3_tree = id3(self.data_obj, self.data_obj.attributes, self.data_obj.get_column('label'))

Esempio n. 19

0

Mostra file

File: bagging.py Progetto: TravDraper/CS-6350-Code

def main_slow():
    treenums = [1, 100, 200, 300, 400, 500, 700, 900, 1000]
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]

    for element in trainset:
        element['weight'] = 1
    trainAcc = []
    testAcc = []

    for num in treenums:
        treelist = []
        for i in range(num):
            newTraining = rand.choices(trainset, k=len(trainset))
            newTree = id3(newTraining, labels, label_attr, labels[-1], 18,
                          'entropy', None)
            treelist.append(newTree)
        trainPred = []
        testPred = []
        for entry in trainset:
            thing = bag_guess(treelist, entry, labels, label_attr['outcome'])
            trainPred.append(thing)
        for entry in testset:
            thing = bag_guess(treelist, entry, labels, label_attr["outcome"])
            testPred.append(thing)
        trainAcc.append(accuracy(trainPred, trainLabels))
        testAcc.append(accuracy(testPred, testLabels))
        print(trainAcc, 'train accuracy')
        print(testAcc, 'test accuracy')

    tree_preds = []
    basics = []
    for i in range(50):
        train_i = rand.choices(trainset, k=1000)
        treelist_i = []
        for j in range(300):
            train_j = rand.choices(train_i, k=1000)
            newTree = id3(train_j, labels, label_attr, labels[-1], 18,
                          'entropy', None)
            treelist_i.append(newTree)
#            if j%100 == 0:
#                print("100 more trees from set", i, 'have been trained.  iteration = ',j)
        tree_preds.append(treelist_i)
        basics.append(treelist_i[0])
        print("Tree set", i, "has been trained")

    singleVar = []
    singleBias = []
    singleMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for tree in basics:
            guess = predict(tree, entry, labels)
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(basics)
        singleMean.append(ave)
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2
        singleBias.append(bias)
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        var = (1 / (len(basics) - 1)) * sum(subVar)
        singleVar.append(var)

    bagVar = []
    bagBias = []
    bagMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for trees in tree_preds:
            guess = bag_guess(trees, entry, labels, label_attr['outcome'])
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(basics)
        bagMean.append(ave)
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2
        bagBias.append(bias)
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        var = (1 / (len(basics) - 1)) * sum(subVar)
        bagVar.append(var)

    sVariance = mean(singleVar)
    sBias = mean(singleBias)
    sMSE = sBias + sVariance
    print("The bias and the variance of the single trees are: Variance:",
          sVariance, 'Bias:', sBias, "and the general squared error is:", sMSE)
    bVariance = mean(bagVar)
    bBias = mean(bagBias)
    bMSE = bBias + bVariance
    print("The bias and the variance of the bagged trees are: Variance:",
          bVariance, 'Bias:', bBias, "and the general squared error is:", bMSE)

Esempio n. 20

0

Mostra file

File: cross_validation.py Progetto: em23/machine_learning_cs5350

                continue

            datas.append(np.loadtxt(DATA_DIR + filename + '.csv', delimiter=',', dtype=str))

        data = np.concatenate(datas)
        data_obj = Data(data=data)
        train_objs.append(data_obj)

    avg_accuracies = []
    for max_depth in depths:
        accuracies = []
        print('**********************************')
        print('****** Hyperparameter is {} ******'.format(max_depth))
        print('**********************************\n')

        for i in range(len(filenames)):
            id3_tree = id3.id3(train_objs[i], train_objs[i].attributes, train_objs[i].get_column('label'))
            pruned_tree = id3.pruning_tree(id3_tree, max_depth)

            error, depth = id3.report_error(test_objs[i], pruned_tree)
            accuracies.append(100.0-error)

            print('***** Testing on {} *****'.format(filenames[i]))

        avg_accuracy = st.mean(accuracies)
        avg_accuracies.append(avg_accuracy)
        print("Average accuracy: {}%; Standard Deviation: {}\n".format(avg_accuracy, np.std(accuracies)))

    # print(dict(zip(depths, avg_accuracies)))

Esempio n. 21

0

Mostra file

import functions as f
import id3


attributes, data, output = f.read_data("train.txt")


for i, j in zip(attributes, data):
    print(i, j, sep='\t')
print(attributes[-1], output, sep='\t')


decision_tree = id3.id3(attributes, data, output)
print('\n', decision_tree.to_string(1))


f.draw_dt(decision_tree)

Esempio n. 22

0

Mostra file

File: main.py Progetto: divyapamujula/decisionTree

import pydot
import utils
import id3
import decisiontree

# Read weather dataset
print('Reading weather dataset...')
weatherAttributes, weatherDataSet = utils.readDataSet(
    './datasets/weatherDataSetTrain.csv')
weatherTargetAttribute = weatherAttributes[-1]
weatherAttributes.remove(weatherTargetAttribute)

# Train
print('Training weather dataset...')
weatherTree = id3.id3(weatherAttributes, weatherTargetAttribute,
                      weatherDataSet)
print(weatherTree)

print('Plotting weather decision tree...')
weatherGraph = pydot.Dot(graph_type='digraph')
decisiontree.drawTree(weatherGraph, weatherTree)
weatherGraph.write('./images/weather.png', prog=None, format='png')
print('Done.')

# Read car evaluation dataset
print('===========================================')
print('Reading car evaluation training dataset...')
carAttributes, carEvaluationTrainDataSet = utils.readDataSet(
    './datasets/car-evaluation-train.csv')
targetAttribute = carAttributes[-1]
carAttributes.remove(targetAttribute)

Esempio n. 23

0

Mostra file

File: voting.py Progetto: jdanray/blog-code

	return examples

# given a list of test examples, the target attribute, and a decision tree,
# returns a count of correct and incorrect classifications
def test_tree(tree, examples, target):
	def classify(example, tree):		
		while isinstance(tree, dict):
			attr = list(tree.keys())[0]
			attr_val = example[attr]	
			if attr_val not in tree[attr]:
				c = Counter([e[attr] for e in examples])
				attr_val = c.most_common(1)[0][0]
			tree = tree[attr][attr_val]			
		return tree
	
	return Counter([classify(e, tree) == e[target] for e in examples])

train_file = 'vote_train.txt'
training = preprocess(train_file)

test_file = 'vote_test.txt'
testing = preprocess(test_file)

target = 'party'
attributes = [a for a in training[0].keys() if a is not target]

tree = id3(training, attributes, target)

results = test_tree(tree, testing, target)
print('%f%% correct' % (results[True] * 100.0 / len(testing)))

Esempio n. 24

0

Mostra file

def main():
    if sys.argv[1] == '-h':
        print("TODO: use argparse and put pretty messages here")
        sys.exit(0)
    if (len(sys.argv) < 4):
        print(
            "There's a missing parameter. Remember to include path to data, alg and seed"
        )
        sys.exit(1)

    path = sys.argv[1]
    algorithm = sys.argv[2].lower()
    seed = sys.argv[3]
    if (len(sys.argv) == 5):
        output_dir = sys.argv[4]
    else:
        output_dir = './'
    attributes, full_dataset = iotools.parse_file(path)
    partitions = iotools.split_dataset(full_dataset,
                                       seed=seed,
                                       num_partitions=3)
    labels = []
    label_attribute = 'label'
    for instance in full_dataset:
        if instance[label_attribute] not in labels:
            labels.append(instance[label_attribute])

    training_set = []
    for p in partitions[:-1]:
        training_set += p
    if algorithm == 'id3':
        tree = id3.id3(attributes, training_set)
        #iotools.output_graph_image_source(tree, 'pretty_picture.gv')
        labels, matrix = test(partitions[-1], tree.classify, labels)
    elif algorithm == 'c4.5':
        rule_list = c45.c45(attributes, training_set)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        print(decision_rule.rule_list_to_tree(rule_list))
        labels, matrix = test(
            partitions[-1],
            lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
            labels)
    elif algorithm == 'c4.5np':
        rule_list = c45.c45(attributes, training_set, pruning=False)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        labels, matrix = test(
            partitions[-1],
            lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
            labels)
    elif algorithm == 'c4.5nsi':
        rule_list = c45.c45(attributes, training_set, split_info=False)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        labels, matrix = test(
            partitions[-1],
            lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
            labels)
    elif algorithm == 'naivebayes':
        attributes.remove(label_attribute)
        nb = naive_bayes.BayesianClassifier()
        nb.train(training_set, attributes)
        labels, matrix = test(partitions[-1],
                              lambda inst: nb.classify(inst, attributes),
                              labels)
    #elif algorithm == 'neuralnets':

    #        nn = ?
    #        nn.train(training_set, attributes)
    else:
        print("Sorry, that algorithm is not implemented yet")
        sys.exit(1)
    iotools.output_confusion_matrix(matrix, labels,
                                    re.sub(r'.*/([^/\.]*)\.csv', r'\1', path),
                                    algorithm, seed, output_dir)

Esempio n. 25

0

Mostra file

File: main.py Progetto: tchen95/eda132_machine_learning

        # header = header[1:]
        print(header+"|"+nestedList)
    else:
        # print(nestedList[0])
        header += "  "
        for index in range(0,len(nestedList)):
            printList(nestedList[index],header)

# def printList(nestedList):
#     if len(nestedList) == 0:
#         return



relationName, attributeList, dataList = reader.readARFF("contact-lenses.arff")
tree = id3.id3(dataList,attributeList,dataList)
header = ""
print(relationName)
printList(tree,header)

relationName, attributeList, dataList = reader.readARFF("restaurants.arff")
tree = id3.id3(dataList,attributeList,dataList)
header = ""
print("\n\n"+relationName)
printList(tree,header)

relationName, attributeList, dataList = reader.readARFF("weather.nominal.arff")
tree = id3.id3(dataList,attributeList,dataList)
header = ""
print("\n\n"+relationName)
printList(tree,header)

Esempio n. 26

0

Mostra file

File: test.py Progetto: jwde/MLFinalProject

def main(args):
    dataset = stateoftheunions()

    for sample in dataset:
        preprocess(sample)

    common_words = AtLeastNDups((word for sample in dataset \
                                      for sent in sample['speech'] \
                                      for word in sent), 4)

    for sample in dataset:
        unkify(sample, common_words)

    computefeatures(dataset, common_words.union(set(unknown_token)))

    for sample in dataset:
        sample['classes'] = [0] if sample['party'] == "Republican" else [1]

    featureset = set()
    for sample in dataset:
        featureset = featureset.union(set(sample['features'].keys()))

    #globalfeatureset = set((feature for feature in featureset if all((feature in sample['features'] for sample in dataset))))

    commonfeatureset = set((feature for feature in featureset if sum((1 if feature in sample['features'] else 0 for sample in dataset)) > 5))

    print "featureset len", len(featureset)
    print "commonfeaturest len", len(commonfeatureset)

    print "binarizing"

    splits = {feature: decisionsplit(dataset, feature) for feature in commonfeatureset}

    bfeatureset = set()

    for sample in dataset:
        sample['bfeatures'] = Counter()
        for feature in [f for f in sample['features'] if f in commonfeatureset]:
            bfeature = str(feature) + ' > ' + str(splits[feature])
            if not bfeature in bfeatureset:
                bfeatureset = bfeatureset.union(set([bfeature]))
            if sample['features'][feature] <= splits[feature]:
                sample['bfeatures'][bfeature] = 0
            else:
                sample['bfeatures'][bfeature] = 1

    print "filtering"

    featureentropy = {}
    for feature in bfeatureset:
        ent0 = entropy(normalize({feature: sample['classes'][0] for sample in dataset if sample['bfeatures'][feature] == 0}))
        ent1 = entropy(normalize({feature: sample['classes'][0] for sample in dataset if sample['bfeatures'][feature] == 1}))
        featureentropy[feature] = (len([s for s in dataset if s['bfeatures'][feature] == 0]) / len(dataset)) * ent0 + (len([s for s in dataset if s['bfeatures'][feature] == 1]) / len(dataset)) * ent1

    bestfeatures = sorted(featureentropy, key=featureentropy.get)[:1000]

    training = [{'features': Counter({f: sample['bfeatures'][f] for f in sample['bfeatures'] if f in bestfeatures}), 'classes': sample['classes']} for sample in dataset]

    baselinetrainer = lambda t: baselineclassifier

    baselinecv = stratifiedcrossvalidate(baselinetrainer, training, 5)

    print "cross-validated accuracy with baseline:", str(baselinecv)
    print "average accuracy:", str(sum(baselinecv) / 5.0)


    nbtrainer = lambda t: lambda s: naivebayesclassify(t, bestfeatures, s)

    nbcv = stratifiedcrossvalidate(nbtrainer, training, 5)

    print "cross-validated accuracy with naive bayes:", str(nbcv)
    print "average accuracy:", str(sum(nbcv) / 5.0)


    for maxdepth in [5, 10, 20, 40, 80]:

        id3trainer = lambda t: id3(t, bestfeatures, maxdepth).classify

        id3cv = stratifiedcrossvalidate(id3trainer, training, 5)

        print "cross-validated accuracy with id3 and max depth of", str(maxdepth), ":", str(id3cv)
        print "average accuracy:", str(sum(id3cv) / 5.0)



    for maxdepth in [5, 10, 20, 40, 80]:
        for numtrees in [64, 80, 96, 112, 128]:
            rftrainer = lambda t: RandomForest(lambda o,f: id3(o, f, maxdepth).classprobabilities, t, bestfeatures, numtrees).classify

            rfcv = stratifiedcrossvalidate(rftrainer, training, 5)

            print "cross-validated accuracy with random forest with", str(numtrees), "trees and", str(maxdepth), "max depth:", str(rfcv)
            print "average accuracy:", str(sum(rfcv) / 5.0)

    """
    use random forest since it has by far the highest accuracy
    random forest democrat/republican semantic differential:
    rf.classprobabilities(speech)[(1,)]
    where 0 is republican, 1 is democrat
    this is just the probability of democrat
    so when it is low, we are closer to republican (0)
    and when it is high, we are closer to democrat (1)

    """

    """

Esempio n. 27

0

Mostra file

File: run_id3.py Progetto: kyledpierson/machine-learning

        '../data/data-splits/data.train', n_features=n_features, preprocessor=preprocessor)
    test_data, test_labels = load_data(
        '../data/data-splits/data.test', n_features=n_features, preprocessor=preprocessor)

    cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)

    max_acc = 0
    opt_depth = 0
    for i in range(2, n_features + 2):
        acc = []

        for j in range(len(cv_data)):
            cv_test = cv_data[j]
            cv_train = np.vstack(cv_data[:j] + cv_data[j + 1:])

            tree, depth = id3(cv_train[:, :-1], cv_train[:, -1], max_depth=i)

            cv_acc = evaluate_tree(cv_test[:, :-1], cv_test[:, -1], tree)
            acc.append(cv_acc)

        avg_acc = np.mean(acc)
        if avg_acc > max_acc:
            opt_depth = i
            max_acc = avg_acc

    tree, depth = id3(train_data, train_labels, max_depth=opt_depth)
    train_acc = evaluate_tree(train_data, train_labels, tree)
    test_acc = evaluate_tree(test_data, test_labels, tree)

    write_output('ID3', opt_depth, max_acc, train_acc, test_acc)
    write_predictions('id3', lambda row: classify(row, tree),

Esempio n. 28

0

Mostra file

File: hw1.py Progetto: hakuliu/inf552

__author__ = 'paul'
#no other partner for this homework

import nightoutparser
import id3
import nightoutdata


def constructQuery():
    return nightoutdata.Row('Large', 'Moderate', 'Cheap', 'Loud',
                            'City-Center', 'No', 'No', "")


OUTCOME = nightoutdata.ENJOY
rows = nightoutparser.parsefile("dt-data.txt")

node = id3.id3(rows, nightoutdata.allnontargetattributes, nightoutdata.ENJOY)

id3.visit([node])
id3.check(rows, node)

queryrow = constructQuery()
prediction = id3.getDecision(queryrow, node)
print('Prediction to enjoy was ' + prediction)

Esempio n. 29

0

Mostra file

File: main.py Progetto: steciuk/PSZT-ID3-algortihm

import file_manager
import id3
import config

dataset = file_manager.read_dataset()

attributes = dataset.keys().drop(config.file_label)
tree = id3.id3(dataset, attributes)
id3.print_tree(tree)
print(tree)