def main():
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)

    # Convert numeric attributes into binary over/under-median features.
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)

    # Build a second copy of the data with 'unknown' values replaced by
    # each attribute's majority value.
    fix = fixUnknown(trainset, labels)
    trainsetU = replaceUnknown(trainset, labels, fix)
    testsetU = replaceUnknown(testset, labels, fix)

    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]

    print("Running decision tree algorithm on the bank dataset with unknown values")
    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainset, labels, label_attr, labels[-1], i, item, None)
            trainPred = [predict(currentTree, x, labels) for x in trainset]
            testPred = [predict(currentTree, x, labels) for x in testset]
            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)
            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  "and a training accuracy of", trainAcc)

    print("\n\n\n")
    print("Running decision tree algorithm on the bank dataset with unknowns replaced")
    for item in algotype:
        for i in range(1, 17):
            currentTree = id3(trainsetU, labels, label_attr, labels[-1], i, item, None)
            trainPred = [predict(currentTree, x, labels) for x in trainsetU]
            testPred = [predict(currentTree, x, labels) for x in testsetU]
            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)
            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  "and a training accuracy of", trainAcc)
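# The `accuracy` helper used above is assumed to be a simple agreement
# ratio between predictions and true labels; a minimal sketch under that
# assumption (the project's real helper may differ):
def accuracy(predictions, true_labels):
    correct = sum(1 for p, t in zip(predictions, true_labels) if p == t)
    return correct / len(true_labels)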
def main():
    trainset, trainRaw = makeData(training, labels)
    testset, testRaw = makeData(testing, labels)
    trainLabels = [item[-1] for item in trainRaw]
    testLabels = [item[-1] for item in testRaw]

    print("Running the decision tree algorithm on the 'Cars' dataset.")
    algotype = ['gini', 'entropy', 'ME']
    for item in algotype:
        for i in range(1, 7):
            currentTree = id3.id3(trainset, labels, label_attr, labels[-1], i, item, None)
            trainPred = [id3.predict(currentTree, x, labels) for x in trainset]
            testPred = [id3.predict(currentTree, x, labels) for x in testset]
            trainAcc = id3.accuracy(trainPred, trainLabels)
            testAcc = id3.accuracy(testPred, testLabels)
            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  "and a training accuracy of", trainAcc)
import math as m


def boost(data, labels, attr_list, target, iterations, answers):
    """AdaBoost with depth-2 ID3 trees as weak learners."""
    treelist = []
    for i in range(iterations):
        # Total weight, used to normalize the weighted training error.
        normalize = 0
        for item in data:
            normalize += item['weight']

        currentTree = id3(data, labels, attr_list, target, 2, 'entropy', None)

        # Weighted training error of the current weak learner.
        trainError = 0
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                trainError += item['weight'] / normalize

        # Standard AdaBoost vote weight: alpha = (1/2) * ln((1 - err) / err).
        alpha = 0.5 * m.log((1 - trainError) / trainError)

        # Up-weight misclassified examples, down-weight correct ones.
        for item in data:
            if predict(currentTree, item, labels) != item[labels[-1]]:
                item['weight'] = item['weight'] * m.exp(alpha)
            else:
                item['weight'] = item['weight'] * m.exp(-alpha)

        treelist.append({'tree': currentTree, 'alpha': alpha})
    return treelist
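# boost() returns (tree, alpha) pairs but no way to apply them. A minimal
# weighted-vote predictor might look like the sketch below; it assumes
# binary labels and the same `predict` helper used in boost(), and is
# illustrative rather than part of the original module.
def boost_predict(treelist, item, labels, pos_label, neg_label):
    # Sum alpha-weighted votes: +alpha for the positive label, -alpha otherwise.
    score = 0.0
    for entry in treelist:
        if predict(entry['tree'], item, labels) == pos_label:
            score += entry['alpha']
        else:
            score -= entry['alpha']
    return pos_label if score >= 0 else neg_label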
def __init__(self, gr, nodes, tipo, target, vtarget, narboles, nnodes,
             ntrels, maxdepth, exrel, umbral):
    self.graph_db = gr
    self.tipo = tipo
    self.target = target
    self.vtarget = vtarget
    self.nnodes = nnodes
    self.ntrels = ntrels
    self.maxdepth = maxdepth
    self.arboles = []

    # Load the distinct (label)-[type]->(label) edge triples from the graph.
    TC = neo4j.CypherQuery(
        self.graph_db,
        "MATCH (a)-[r]->(b) WHERE labels(a) <> [] AND labels(b) <> [] "
        "RETURN DISTINCT head(labels(a)) AS This, type(r) as To, "
        "head(labels(b)) AS That limit " + str(self.ntrels)).execute()
    print "Edge types loaded: " + str(len(TC.data)) + " elements."

    # Grow trees until the requested forest size is reached.
    while len(self.arboles) < narboles:
        tempn = nodes
        tempr = random.sample(set(TC.data), random.randint(3, len(TC.data)))
        arbol = id3(gr, target, vtarget, tempr)
        res = arbol.execute(tempn, "match (n:" + self.tipo + ")", self.tipo,
                            self.maxdepth, -999, 999, exrel, umbral, target)

        # Collect leaf names (the part before the '*' marker).
        tab = []
        for l in res.get_leaves():
            a, b, c = l.name.partition("*")
            tab.append(a)

        # Keep the tree only if it is not a duplicate of one already grown.
        entra = True
        for c in self.arboles:
            if entra and self.checkequals(c.arbol, res):
                entra = False
        if len(res.get_edges()) > 2 and len(set(tab)) > 1 and entra:
            print "Tree " + str(len(self.arboles) + 1) + " (" + str(len(tempn)) + " nodes):"
            print res.get_ascii(show_internal=True)
            self.arboles.append(arbol)
def raise_forest(Xtrain, ytrain, n, train_size, att_size):
    """Train n ID3 trees on bootstrap samples of the rows and attributes."""
    print "Raising forest with " + str(n) + " trees"
    trees = []
    for i in xrange(n):
        # Bootstrap sample of the training rows.
        sub_train_x, sub_train_ind = sample_with_rep(Xtrain, train_size)
        sub_train_y = ytrain[sub_train_ind]
        examples = [(sub_train_x[j], sub_train_y[j])
                    for j in xrange(len(sub_train_ind))]
        # Bootstrap sample of the attribute set.
        sub_att, sub_att_ind = sample_with_rep(attributes, att_size)
        sub_att = set(sub_att)
        trees.append(id3.id3(examples, sub_att))
    return trees
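# A forest needs an aggregation rule at prediction time. A minimal
# majority-vote classifier over the trees returned by raise_forest might
# look like this sketch; it assumes the id3.classify helper used elsewhere
# in this project and is illustrative, not the original ensemble code.
def forest_classify(trees, x):
    # Each tree casts one vote; return the most common predicted label.
    votes = [id3.classify(tree, x) for tree in trees]
    return max(set(votes), key=votes.count)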
def evaluate(U, k):
    """Estimate the average loss of id3 with k-fold cross-validation."""
    D = get_decisions(U)
    Y = get_classes(U)
    shuffle(U)
    U = divide(U, k)
    evaluation = []
    for i in range(k):
        # Train on all folds except fold i, then evaluate on fold i.
        tree = id3(Y, D, substract(U, U[i]))
        evaluation.append(avg_loss(tree, U[i]))
    return sum(evaluation) / k
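# `substract` above is assumed to return the concatenation of every fold
# except the held-out one; a minimal sketch under that assumption (the
# project's real helper may differ):
def substract(folds, held_out):
    training = []
    for fold in folds:
        if fold is not held_out:
            training.extend(fold)
    return training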
def __k_fold(dataset, k):
    split_dataset = np.array_split(dataset, k)
    results = pd.DataFrame(columns=["TP", "TN", "FP", "FN"])
    for i in range(k):
        # Fold i is the test set; the remaining folds form the training set.
        train_set = split_dataset.copy()
        test_set = split_dataset[i]
        del train_set[i]
        train = pd.concat(train_set, sort=False)
        attributes = train.keys().drop(config.file_label)
        tree = id3.id3(train, attributes)
        tmp_results = __test(tree, test_set)
        results = results.append(tmp_results, ignore_index=True)
    # Sum the confusion-matrix counts across folds.
    results = results.sum()
    return results
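# The summed TP/TN/FP/FN counts can be reduced to standard metrics. A
# hypothetical helper (not part of the original module) might compute
# accuracy like this:
def accuracy_from_counts(results):
    total = results["TP"] + results["TN"] + results["FP"] + results["FN"]
    return (results["TP"] + results["TN"]) / total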
def validation_of_full_set_multirun_for_different_dataset_size(dataset, starting_set_part, min_set_part):
    full_results = pd.DataFrame(columns=COLUMNS)
    dataset_size = len(dataset)
    part = starting_set_part
    # Halve the training fraction each round until the minimum is reached.
    while part >= min_set_part:
        print("part: " + str(part))
        results = pd.DataFrame(columns=["TP", "TN", "FP", "FN"])
        training_size = round(part * dataset_size)
        for i in range(config.num_of_reruns):
            print("rerun: " + str(i))
            training = dataset.sample(frac=part, random_state=config.rng_seed + i)
            attributes = training.keys().drop(config.file_label)
            tree = id3.id3(training, attributes)
            results = results.append(__test(tree, dataset))
        dataframe = __build_final_dataframe_for_full_validation(results, training_size, dataset_size)
        full_results = full_results.append(dataframe, ignore_index=True)
        part /= 2
    return full_results
print_time = cmd_args.time
print_tree = cmd_args.tree
print_smt = cmd_args.smt
print_model = cmd_args.model
if cmd_args.verbose:
    print_tree = True
    print_smt = True
    print_model = True

header, samples = parse(sys.stdin)

# Get an upper bound on the tree size from ID3.
if print_time:
    start = time.time()
id3_sol = id3(samples)
if print_time:
    end = time.time()
    id3_time = end - start
if id3_sol == -1:
    print("UNSAT")
    exit(0)

results = {}
upper_bound = max(3, id3_sol)  # because our solver won't work with N < 3
for solver_i in solvers:
    for search in [
            searches.SAT_UNSAT(3, upper_bound),
            searches.UNSAT_SAT(3, upper_bound),
            searches.Binary(3, upper_bound)
def readCommand(argv):
    "Processes the command used to run from the command line."
    from optparse import OptionParser
    parser = OptionParser(USAGE_STRING)

    parser.add_option('-c', '--classifier', help=default('The type of classifier'),
                      choices=['id3', 'mostFrequent', 'nb', 'naiveBayes', 'perceptron', 'mira', 'minicontest'],
                      default='mostFrequent')
    parser.add_option('-d', '--data', help=default('Dataset to use'),
                      choices=['digits', 'faces'], default='digits')
    parser.add_option('-t', '--training', help=default('The size of the training set'),
                      default=100, type="int")
    parser.add_option('-f', '--features', help=default('Whether to use enhanced features'),
                      default=False, action="store_true")
    parser.add_option('-o', '--odds', help=default('Whether to compute odds ratios'),
                      default=False, action="store_true")
    parser.add_option('-1', '--label1', help=default("First label in an odds ratio comparison"),
                      default=0, type="int")
    parser.add_option('-2', '--label2', help=default("Second label in an odds ratio comparison"),
                      default=1, type="int")
    parser.add_option('-w', '--weights', help=default('Whether to print weights'),
                      default=False, action="store_true")
    parser.add_option('-k', '--smoothing', help=default("Smoothing parameter (ignored when using --autotune)"),
                      type="float", default=2.0)
    parser.add_option('-a', '--autotune', help=default("Whether to automatically tune hyperparameters"),
                      default=False, action="store_true")
    parser.add_option('-i', '--iterations', help=default("Maximum iterations to run training"),
                      default=3, type="int")
    parser.add_option('-s', '--test', help=default("Amount of test data to use"),
                      default=TEST_SET_SIZE, type="int")

    options, otherjunk = parser.parse_args(argv)
    if len(otherjunk) != 0:
        raise Exception('Command line input not understood: ' + str(otherjunk))
    args = {}

    # Set up variables according to the command line input.
print "Doing classification" print "--------------------" print "data:\t\t" + options.data print "classifier:\t\t" + options.classifier if not options.classifier == 'minicontest': print "using enhanced features?:\t" + str(options.features) else: print "using minicontest feature extractor" print "training set size:\t" + str(options.training) if(options.data=="digits"): printImage = ImagePrinter(DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT).printImage if (options.features): featureFunction = enhancedFeatureExtractorDigit else: featureFunction = basicFeatureExtractorDigit if (options.classifier == 'minicontest'): featureFunction = contestFeatureExtractorDigit elif(options.data=="faces"): printImage = ImagePrinter(FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT).printImage if (options.features): featureFunction = enhancedFeatureExtractorFace else: featureFunction = basicFeatureExtractorFace else: print "Unknown dataset", options.data print USAGE_STRING sys.exit(2) if(options.data=="digits"): legalLabels = range(10) else: legalLabels = range(2) if options.training <= 0: print "Training set size should be a positive integer (you provided: %d)" % options.training print USAGE_STRING sys.exit(2) if options.smoothing <= 0: print "Please provide a positive number for smoothing (you provided: %f)" % options.smoothing print USAGE_STRING sys.exit(2) if options.odds: if options.label1 not in legalLabels or options.label2 not in legalLabels: print "Didn't provide a legal labels for the odds ratio: (%d,%d)" % (options.label1, options.label2) print USAGE_STRING sys.exit(2) if(options.classifier == "id3"): classifier = id3.id3(legalLabels) if (options.autotune): print "using automatic tuning for id3" classifier.automaticTuning = True elif(options.classifier == "mostFrequent"): classifier = mostFrequent.MostFrequentClassifier(legalLabels) elif(options.classifier == "naiveBayes" or options.classifier == "nb"): classifier = naiveBayes.NaiveBayesClassifier(legalLabels) classifier.setSmoothing(options.smoothing) if (options.autotune): print "using automatic tuning for naivebayes" classifier.automaticTuning = True else: print "using smoothing parameter k=%f for naivebayes" % options.smoothing elif(options.classifier == "perceptron"): classifier = perceptron.PerceptronClassifier(legalLabels,options.iterations) elif(options.classifier == "mira"): classifier = mira.MiraClassifier(legalLabels, options.iterations) if (options.autotune): print "using automatic tuning for MIRA" classifier.automaticTuning = True else: print "using default C=0.001 for MIRA" elif(options.classifier == 'minicontest'): import minicontest classifier = minicontest.contestClassifier(legalLabels) else: print "Unknown classifier:", options.classifier print USAGE_STRING sys.exit(2) args['classifier'] = classifier args['featureFunction'] = featureFunction args['printImage'] = printImage return args, options
    sys.stderr.write('\nerr>> ' + '\nerr>> '.join(pe) + '\n')
    return None if p.returncode == 20 else read_model(po)


if __name__ == "__main__":
    header, samples = parse_samples(sys.stdin)

    print_time = True
    if print_time:
        # Wrap the solver in a 300 s timeout and measure wall-clock time.
        solver = ['timeout', '300', '/usr/bin/time', '-f', '%e'] + solver
    solver_time = 0
    num_solver_calls = 0

    # Get an upper bound from ID3; -1 means no consistent tree (UNSAT).
    id3_cost, id3_model = id3(samples)
    if id3_cost == -1:
        exit(0)

    times_dict = {}
    for search_class in searches:
        for encoding in encodings:
            solver_time = 0
            time_per_call = {}
            num_solver_calls = 0
            if id3_cost <= 3:
# get training data
id3TrainingData = open('data/id3train.txt', 'r').readlines()
id3TrainingMatrix = []
for line in id3TrainingData:
    id3TrainingMatrix.append(np.fromstring(line, dtype=float, sep=' '))
id3TrainingMatrix = np.array(id3TrainingMatrix)

# get test data
id3TestData = open('data/id3test.txt', 'r').readlines()
id3TestMatrix = []
for line in id3TestData:
    id3TestMatrix.append(np.fromstring(line, dtype=float, sep=' '))
id3TestMatrix = np.array(id3TestMatrix)

# ID3 test error without pruning
rootNode = id3(id3TrainingMatrix)
numErrors = 0
for row in id3TestMatrix:
    # Walk the tree: internal nodes hold [threshold, feature index] in .data.
    currNode = rootNode
    while isinstance(currNode.data, list):
        featureIndex = currNode.data[1]
        featureVal = currNode.data[0]
        if row[featureIndex] <= featureVal:
            currNode = currNode.yesPtr
        else:
            currNode = currNode.noPtr
    if currNode.data != row[-1]:
        numErrors = numErrors + 1
print(float(numErrors) / len(id3TestMatrix))

# ID3 test error with pruning
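# The unpruned-error loop above can be factored into a reusable helper; a
# sketch under the same node layout (internal nodes hold
# [threshold, feature index] in .data), not part of the original script:
def classify_row(rootNode, row):
    currNode = rootNode
    while isinstance(currNode.data, list):
        featureVal, featureIndex = currNode.data[0], currNode.data[1]
        currNode = currNode.yesPtr if row[featureIndex] <= featureVal else currNode.noPtr
    return currNode.data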
feature_indices = range(n_features)
data_size = int(data_size * train_data.shape[0])
feature_size = int(feature_size * n_features)
split_size = int(split_size * n_features)
max_depth = n_features + 2  # No pruning

trees = []
for i in range(num_trees):
    # Bootstrap rows with replacement; exclude a random subset of features.
    ri = np.random.choice(data_indices, size=data_size, replace=True)
    rf = np.random.choice(feature_indices, size=n_features - feature_size, replace=False)
    tree, depth = id3(train_data[ri], train_labels[ri], used_features=rf,
                      max_depth=max_depth, split_size=split_size)
    trees.append(tree)
    # Progress report every 10% of the forest.
    if (i + 1) % (num_trees / 10) == 0:
        print(str((i + 1) / num_trees * 100) + '%')

np.save('../data/trees', trees)
train_acc = evaluate_forest(train_data, train_labels, trees, '../data/new_train')
test_acc = evaluate_forest(test_data, test_labels, trees, '../data/new_test')


def predictor(row):
    # Majority vote across the forest via scipy's mode.
    return mode(list(map(lambda tree: classify(row, tree), trees)))[0][0]
import numpy as np

import id3
from data import Data

DATA_DIR = 'data/'


def get_data_obj(filename):
    data = np.loadtxt(DATA_DIR + filename + '.csv', delimiter=',', dtype=str)
    return Data(data=data)


if __name__ == '__main__':
    print("\nFull Decision Tree: ")
    data_obj = get_data_obj('train')
    id3_tree = id3.id3(data_obj, data_obj.attributes, data_obj.get_column('label'))
    error, depth = id3.report_error(data_obj, id3_tree)
    print("Accuracy on training data: {}%; Depth: {}".format(100 - error, depth))

    data_obj_test = get_data_obj('test')
    error, depth = id3.report_error(data_obj_test, id3_tree)
    print("    Accuracy on test data: {}%; Depth: {}".format(100 - error, depth))

    print("\nTree with Max Depth 5")
    max_depth = 5
    pruned_tree = id3.pruning_tree(id3_tree, max_depth)
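    # The pruned tree is presumably scored the same way as the full tree; a
    # sketch mirroring the report_error calls above (not from the original,
    # which is truncated here):
    error, depth = id3.report_error(data_obj, pruned_tree)
    print("Accuracy on training data: {}%; Depth: {}".format(100 - error, depth))
    error, depth = id3.report_error(data_obj_test, pruned_tree)
    print("    Accuracy on test data: {}%; Depth: {}".format(100 - error, depth))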
    if instance[label_attribute] not in monks_labels:
        monks_labels.append(instance[label_attribute])

digits_attributes, digits_full_dataset = iotools.parse_file('../DecisionTrees/data/opticalDigit.csv')
digits_labels = []
for instance in digits_full_dataset:
    if instance[label_attribute] not in digits_labels:
        digits_labels.append(instance[label_attribute])

# results[dataset][algorithm] accumulates accuracies across seeds.
results = [[[], [], []], [[], [], []]]
for seed in range(100, 130):
    partitions = iotools.split_dataset(monks_full_dataset, seed=seed, num_partitions=3)
    monks_training_set = []
    for p in partitions[:-1]:
        monks_training_set += p
    monks_test_set = partitions[-1]

    # run id3 on monks
    tree = id3.id3(copy.deepcopy(monks_attributes), monks_training_set)
    id3_labels, id3_matrix = test(monks_test_set, tree.classify, copy.deepcopy(monks_labels))
    print(iotools.print_confusion_matrix(id3_matrix, id3_labels))
    print(accuracy(id3_matrix))
    results[0][0].append(accuracy(id3_matrix))

    # run c4.5 on monks
    rules = c45.c45(copy.deepcopy(monks_attributes), monks_training_set)
    c45_labels, c45_matrix = test(monks_test_set,
                                  lambda inst: decision_rule.classify_on_rule_list(inst, rules),
                                  copy.deepcopy(monks_labels))
    print(iotools.print_confusion_matrix(c45_matrix, c45_labels))
    print(accuracy(c45_matrix))
    results[0][1].append(accuracy(c45_matrix))

    # run NB on monks
    nb_attributes = copy.deepcopy(monks_attributes)
    nb_attributes.remove(label_attribute)
    nb = naive_bayes.BayesianClassifier()
    nb.train(monks_training_set, nb_attributes)
    test_block = partitions[j]
    test_label_block = label_partitions[j]
    train_examples = zip(training_block, training_label_block.T.tolist()[0])
    test_examples = zip(test_block, test_label_block.T.tolist()[0])

    # cross validation for random forest
    print "Cross Validating Random Forest..."
    train_size = int(training_block.shape[0])
    att_size = int(len(attributes))
    forest_size = 100
    [ensemble_error, ensemble_pred] = rf.ensemble(
        test_block, test_label_block,
        rf.raise_forest(training_block, training_label_block, forest_size, train_size, att_size))
    error[0] += (1.0 / k) * ensemble_error

    # cross validation for decision tree
    print "Cross Validating Decision Tree..."
    dec_tree = id3.id3(train_examples, attributes)
    dec_tree_errors = 0
    for i in xrange(len(test_block)):
        if id3.classify(dec_tree, test_block[i]) != test_label_block[i]:
            dec_tree_errors += 1
    error[1] += (1.0 / k) * (float(dec_tree_errors) / set_size)

    # cross validation for AdaBoost
    print "Cross Validating AdaBoost..."
    adaboost_classifier = adaboost.adaboost(train_examples, adaboost_rounds)
    adaboost_errors = 0
    for i in xrange(len(test_block)):
        if adaboost.classify(adaboost_classifier, test_block[i]) != test_label_block[i]:
            adaboost_errors += 1
    error[2] += (1.0 / k) * (float(adaboost_errors) / set_size)
def setUp(self):
    data = np.loadtxt(DATA_DIR + 'tennis.csv', delimiter=' ', dtype=str)
    self.data_obj = Data(data=data)
    self.id3_tree = id3(self.data_obj, self.data_obj.attributes,
                        self.data_obj.get_column('label'))
def main_slow():
    treenums = [1, 100, 200, 300, 400, 500, 700, 900, 1000]
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]
    for element in trainset:
        element['weight'] = 1

    # Bagging: train ensembles of increasing size and record accuracy.
    trainAcc = []
    testAcc = []
    for num in treenums:
        treelist = []
        for i in range(num):
            newTraining = rand.choices(trainset, k=len(trainset))
            newTree = id3(newTraining, labels, label_attr, labels[-1], 18, 'entropy', None)
            treelist.append(newTree)
        trainPred = [bag_guess(treelist, entry, labels, label_attr['outcome']) for entry in trainset]
        testPred = [bag_guess(treelist, entry, labels, label_attr['outcome']) for entry in testset]
        trainAcc.append(accuracy(trainPred, trainLabels))
        testAcc.append(accuracy(testPred, testLabels))
    print(trainAcc, 'train accuracy')
    print(testAcc, 'test accuracy')

    # Bias/variance decomposition: 50 bagged ensembles of 300 trees each,
    # trained on bootstrap samples of 1000 examples; the first tree of each
    # ensemble stands in for a single learner.
    tree_preds = []
    basics = []
    for i in range(50):
        train_i = rand.choices(trainset, k=1000)
        treelist_i = []
        for j in range(300):
            train_j = rand.choices(train_i, k=1000)
            newTree = id3(train_j, labels, label_attr, labels[-1], 18, 'entropy', None)
            treelist_i.append(newTree)
        tree_preds.append(treelist_i)
        basics.append(treelist_i[0])
        print("Tree set", i, "has been trained")

    # Single-tree bias and variance on the test set.
    singleVar = []
    singleBias = []
    singleMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for tree in basics:
            guess = predict(tree, entry, labels)
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(basics)
        singleMean.append(ave)
        value = 1 if entry['outcome'] == label_attr['outcome'][0] else 0
        singleBias.append((value - ave) ** 2)
        # Sample variance of the 0/1 predictions around their mean.
        var = (1 / (len(basics) - 1)) * sum((h - ave) ** 2 for h in predictions)
        singleVar.append(var)

    # Bagged-ensemble bias and variance on the test set.
    bagVar = []
    bagBias = []
    bagMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for trees in tree_preds:
            guess = bag_guess(trees, entry, labels, label_attr['outcome'])
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(tree_preds)
        bagMean.append(ave)
        value = 1 if entry['outcome'] == label_attr['outcome'][0] else 0
        bagBias.append((value - ave) ** 2)
        var = (1 / (len(tree_preds) - 1)) * sum((h - ave) ** 2 for h in predictions)
        bagVar.append(var)

    sVariance = mean(singleVar)
    sBias = mean(singleBias)
    sMSE = sBias + sVariance
    print("The bias and the variance of the single trees are: Variance:", sVariance,
          'Bias:', sBias, "and the general squared error is:", sMSE)
    bVariance = mean(bagVar)
    bBias = mean(bagBias)
    bMSE = bBias + bVariance
    print("The bias and the variance of the bagged trees are: Variance:", bVariance,
          'Bias:', bBias, "and the general squared error is:", bMSE)
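# For reference, the per-example decomposition computed above is the
# standard one for 0/1-encoded predictions with ensemble mean h_bar:
#   bias^2 = (y - h_bar)^2
#   var    = (1 / (n - 1)) * sum_i (h_i - h_bar)^2
# and the reported "general squared error" is the test-set average of
# bias^2 + var.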
            continue
        datas.append(np.loadtxt(DATA_DIR + filename + '.csv', delimiter=',', dtype=str))
    data = np.concatenate(datas)
    data_obj = Data(data=data)
    train_objs.append(data_obj)

avg_accuracies = []
for max_depth in depths:
    accuracies = []
    print('**********************************')
    print('****** Hyperparameter is {} ******'.format(max_depth))
    print('**********************************\n')
    for i in range(len(filenames)):
        # Grow a full tree, prune to max_depth, and test on the held-out fold.
        id3_tree = id3.id3(train_objs[i], train_objs[i].attributes,
                           train_objs[i].get_column('label'))
        pruned_tree = id3.pruning_tree(id3_tree, max_depth)
        error, depth = id3.report_error(test_objs[i], pruned_tree)
        accuracies.append(100.0 - error)
        print('***** Testing on {} *****'.format(filenames[i]))
    avg_accuracy = st.mean(accuracies)
    avg_accuracies.append(avg_accuracy)
    print("Average accuracy: {}%; Standard Deviation: {}\n".format(avg_accuracy, np.std(accuracies)))
import functions as f
import id3

attributes, data, output = f.read_data("train.txt")
for i, j in zip(attributes, data):
    print(i, j, sep='\t')
print(attributes[-1], output, sep='\t')

decision_tree = id3.id3(attributes, data, output)
print('\n', decision_tree.to_string(1))
f.draw_dt(decision_tree)
import pydot
import utils
import id3
import decisiontree

# Read weather dataset
print('Reading weather dataset...')
weatherAttributes, weatherDataSet = utils.readDataSet('./datasets/weatherDataSetTrain.csv')
weatherTargetAttribute = weatherAttributes[-1]
weatherAttributes.remove(weatherTargetAttribute)

# Train
print('Training weather dataset...')
weatherTree = id3.id3(weatherAttributes, weatherTargetAttribute, weatherDataSet)
print(weatherTree)
print('Plotting weather decision tree...')
weatherGraph = pydot.Dot(graph_type='digraph')
decisiontree.drawTree(weatherGraph, weatherTree)
weatherGraph.write('./images/weather.png', prog=None, format='png')
print('Done.')

# Read car evaluation dataset
print('===========================================')
print('Reading car evaluation training dataset...')
carAttributes, carEvaluationTrainDataSet = utils.readDataSet('./datasets/car-evaluation-train.csv')
targetAttribute = carAttributes[-1]
carAttributes.remove(targetAttribute)
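# The car-evaluation section presumably continues like the weather flow
# above; a sketch of the analogous training and plotting steps (the output
# file name is assumed, not confirmed by the original):
print('Training car evaluation dataset...')
carTree = id3.id3(carAttributes, targetAttribute, carEvaluationTrainDataSet)
print(carTree)
carGraph = pydot.Dot(graph_type='digraph')
decisiontree.drawTree(carGraph, carTree)
carGraph.write('./images/car-evaluation.png', prog=None, format='png')
print('Done.')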
    return examples


# given a list of test examples, the target attribute, and a decision tree,
# returns a count of correct and incorrect classifications
def test_tree(tree, examples, target):
    def classify(example, tree):
        while isinstance(tree, dict):
            attr = list(tree.keys())[0]
            attr_val = example[attr]
            # Unseen attribute value: fall back to the most common value.
            if attr_val not in tree[attr]:
                c = Counter([e[attr] for e in examples])
                attr_val = c.most_common(1)[0][0]
            tree = tree[attr][attr_val]
        return tree

    return Counter([classify(e, tree) == e[target] for e in examples])


train_file = 'vote_train.txt'
training = preprocess(train_file)
test_file = 'vote_test.txt'
testing = preprocess(test_file)

target = 'party'
attributes = [a for a in training[0].keys() if a != target]
tree = id3(training, attributes, target)
results = test_tree(tree, testing, target)
print('%f%% correct' % (results[True] * 100.0 / len(testing)))
def main():
    if sys.argv[1] == '-h':
        print("TODO: use argparse and put pretty messages here")
        sys.exit(0)
    if len(sys.argv) < 4:
        print("There's a missing parameter. Remember to include the path to the data, the algorithm, and the seed.")
        sys.exit(1)

    path = sys.argv[1]
    algorithm = sys.argv[2].lower()
    seed = sys.argv[3]
    if len(sys.argv) == 5:
        output_dir = sys.argv[4]
    else:
        output_dir = './'

    attributes, full_dataset = iotools.parse_file(path)
    partitions = iotools.split_dataset(full_dataset, seed=seed, num_partitions=3)

    labels = []
    label_attribute = 'label'
    for instance in full_dataset:
        if instance[label_attribute] not in labels:
            labels.append(instance[label_attribute])

    training_set = []
    for p in partitions[:-1]:
        training_set += p

    if algorithm == 'id3':
        tree = id3.id3(attributes, training_set)
        labels, matrix = test(partitions[-1], tree.classify, labels)
    elif algorithm == 'c4.5':
        rule_list = c45.c45(attributes, training_set)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        print(decision_rule.rule_list_to_tree(rule_list))
        labels, matrix = test(partitions[-1],
                              lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
                              labels)
    elif algorithm == 'c4.5np':
        # C4.5 without pruning
        rule_list = c45.c45(attributes, training_set, pruning=False)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        labels, matrix = test(partitions[-1],
                              lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
                              labels)
    elif algorithm == 'c4.5nsi':
        # C4.5 without the split-information correction
        rule_list = c45.c45(attributes, training_set, split_info=False)
        print("\nFinal Rules:\n")
        for rule in rule_list:
            print(rule)
        labels, matrix = test(partitions[-1],
                              lambda inst: decision_rule.classify_on_rule_list(inst, rule_list),
                              labels)
    elif algorithm == 'naivebayes':
        attributes.remove(label_attribute)
        nb = naive_bayes.BayesianClassifier()
        nb.train(training_set, attributes)
        labels, matrix = test(partitions[-1],
                              lambda inst: nb.classify(inst, attributes),
                              labels)
    else:
        print("Sorry, that algorithm is not implemented yet")
        sys.exit(1)

    iotools.output_confusion_matrix(matrix, labels,
                                    re.sub(r'.*/([^/\.]*)\.csv', r'\1', path),
                                    algorithm, seed, output_dir)
        # header = header[1:]
        print(header + "|" + nestedList)
    else:
        header += " "
        for index in range(0, len(nestedList)):
            printList(nestedList[index], header)


relationName, attributeList, dataList = reader.readARFF("contact-lenses.arff")
tree = id3.id3(dataList, attributeList, dataList)
header = ""
print(relationName)
printList(tree, header)

relationName, attributeList, dataList = reader.readARFF("restaurants.arff")
tree = id3.id3(dataList, attributeList, dataList)
header = ""
print("\n\n" + relationName)
printList(tree, header)

relationName, attributeList, dataList = reader.readARFF("weather.nominal.arff")
tree = id3.id3(dataList, attributeList, dataList)
header = ""
print("\n\n" + relationName)
printList(tree, header)
def main(args):
    dataset = stateoftheunions()
    for sample in dataset:
        preprocess(sample)

    # Keep only words that occur at least 4 times across all speeches.
    common_words = AtLeastNDups((word for sample in dataset
                                 for sent in sample['speech']
                                 for word in sent), 4)
    for sample in dataset:
        unkify(sample, common_words)
    computefeatures(dataset, common_words.union(set(unknown_token)))
    for sample in dataset:
        sample['classes'] = [0] if sample['party'] == "Republican" else [1]

    featureset = set()
    for sample in dataset:
        featureset = featureset.union(set(sample['features'].keys()))
    # Keep features that appear in more than 5 samples.
    commonfeatureset = set((feature for feature in featureset
                            if sum((1 if feature in sample['features'] else 0
                                    for sample in dataset)) > 5))
    print "featureset len", len(featureset)
    print "commonfeatureset len", len(commonfeatureset)

    print "binarizing"
    splits = {feature: decisionsplit(dataset, feature) for feature in commonfeatureset}
    bfeatureset = set()
    for sample in dataset:
        sample['bfeatures'] = Counter()
        for feature in [f for f in sample['features'] if f in commonfeatureset]:
            bfeature = str(feature) + ' > ' + str(splits[feature])
            if bfeature not in bfeatureset:
                bfeatureset = bfeatureset.union(set([bfeature]))
            if sample['features'][feature] <= splits[feature]:
                sample['bfeatures'][bfeature] = 0
            else:
                sample['bfeatures'][bfeature] = 1

    print "filtering"
    # Rank binary features by the weighted entropy of the class split they induce.
    featureentropy = {}
    for feature in bfeatureset:
        ent0 = entropy(normalize({feature: sample['classes'][0] for sample in dataset
                                  if sample['bfeatures'][feature] == 0}))
        ent1 = entropy(normalize({feature: sample['classes'][0] for sample in dataset
                                  if sample['bfeatures'][feature] == 1}))
        featureentropy[feature] = (
            (float(len([s for s in dataset if s['bfeatures'][feature] == 0])) / len(dataset)) * ent0
            + (float(len([s for s in dataset if s['bfeatures'][feature] == 1])) / len(dataset)) * ent1)
    bestfeatures = sorted(featureentropy, key=featureentropy.get)[:1000]

    training = [{'features': Counter({f: sample['bfeatures'][f]
                                      for f in sample['bfeatures'] if f in bestfeatures}),
                 'classes': sample['classes']} for sample in dataset]

    baselinetrainer = lambda t: baselineclassifier
    baselinecv = stratifiedcrossvalidate(baselinetrainer, training, 5)
    print "cross-validated accuracy with baseline:", str(baselinecv)
    print "average accuracy:", str(sum(baselinecv) / 5.0)

    nbtrainer = lambda t: lambda s: naivebayesclassify(t, bestfeatures, s)
    nbcv = stratifiedcrossvalidate(nbtrainer, training, 5)
    print "cross-validated accuracy with naive bayes:", str(nbcv)
    print "average accuracy:", str(sum(nbcv) / 5.0)

    for maxdepth in [5, 10, 20, 40, 80]:
        id3trainer = lambda t: id3(t, bestfeatures, maxdepth).classify
        id3cv = stratifiedcrossvalidate(id3trainer, training, 5)
        print "cross-validated accuracy with id3 and max depth of", str(maxdepth), ":", str(id3cv)
        print "average accuracy:", str(sum(id3cv) / 5.0)

    for maxdepth in [5, 10, 20, 40, 80]:
        for numtrees in [64, 80, 96, 112, 128]:
            rftrainer = lambda t: RandomForest(lambda o, f: id3(o, f, maxdepth).classprobabilities,
                                               t, bestfeatures, numtrees).classify
            rfcv = stratifiedcrossvalidate(rftrainer, training, 5)
            print "cross-validated accuracy with random forest with", str(numtrees), "trees and", str(maxdepth), "max depth:", str(rfcv)
            print "average accuracy:", str(sum(rfcv) / 5.0)

    # Use the random forest, since it has by far the highest accuracy.
    # Random forest democrat/republican semantic differential:
    # rf.classprobabilities(speech)[(1,)], where 0 is republican and 1 is
    # democrat. This is just the probability of democrat, so when it is low
    # we are closer to republican (0) and when it is high we are closer to
    # democrat (1).
    '../data/data-splits/data.train', n_features=n_features, preprocessor=preprocessor)
test_data, test_labels = load_data(
    '../data/data-splits/data.test', n_features=n_features, preprocessor=preprocessor)

# 5-fold cross-validation over the maximum tree depth.
cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)
max_acc = 0
opt_depth = 0
for i in range(2, n_features + 2):
    acc = []
    for j in range(len(cv_data)):
        cv_test = cv_data[j]
        cv_train = np.vstack(cv_data[:j] + cv_data[j + 1:])
        tree, depth = id3(cv_train[:, :-1], cv_train[:, -1], max_depth=i)
        cv_acc = evaluate_tree(cv_test[:, :-1], cv_test[:, -1], tree)
        acc.append(cv_acc)
    avg_acc = np.mean(acc)
    if avg_acc > max_acc:
        opt_depth = i
        max_acc = avg_acc

# Retrain at the best depth and report accuracies.
tree, depth = id3(train_data, train_labels, max_depth=opt_depth)
train_acc = evaluate_tree(train_data, train_labels, tree)
test_acc = evaluate_tree(test_data, test_labels, tree)
write_output('ID3', opt_depth, max_acc, train_acc, test_acc)
write_predictions('id3', lambda row: classify(row, tree),
__author__ = 'paul'
# no other partner for this homework

import nightoutparser
import id3
import nightoutdata


def constructQuery():
    return nightoutdata.Row('Large', 'Moderate', 'Cheap', 'Loud',
                            'City-Center', 'No', 'No', "")


OUTCOME = nightoutdata.ENJOY
rows = nightoutparser.parsefile("dt-data.txt")
node = id3.id3(rows, nightoutdata.allnontargetattributes, nightoutdata.ENJOY)
id3.visit([node])
id3.check(rows, node)

queryrow = constructQuery()
prediction = id3.getDecision(queryrow, node)
print('Prediction to enjoy was ' + prediction)
import file_manager
import id3
import config

dataset = file_manager.read_dataset()
attributes = dataset.keys().drop(config.file_label)
tree = id3.id3(dataset, attributes)
id3.print_tree(tree)
print(tree)