def __init__(self, data, attr, targetIndex, m):
    """Build the decision tree for this classifier.

    A negative targetIndex is treated as "no target": the root is left as
    None and no tree is built.  Otherwise the target column is removed from
    the attribute list and id3.ID3 grows the tree from the remaining ones.
    """
    if targetIndex < 0:
        # Nothing to learn against; leave the model empty.
        self.root = None
        return
    # All attributes except the target column itself.
    remaining_attrs = attr[:targetIndex] + attr[targetIndex + 1:]
    self.root = id3.ID3(data, attr, remaining_attrs, None, targetIndex, m)
def main(argv):
#-------------------------------------------------------------------------------
    """ usage: """
    # Parse command-line flags: -s/--sync copies songs into BU_PATH,
    # -d/--dup reports files that differ from an existing backup copy.
    args, opts = oss.gopt(argv[1:], [('s', 'sync'), ('d', 'dup')], [], main.__doc__ + __doc__)
    path = AMAZON_PATH
    # NOTE(review): `tbl` is never used below, and `string.maketrans` is the
    # Python 2 API (removed in Python 3, where print() below would run) —
    # looks like dead/legacy code; confirm before relying on it.
    tbl = string.maketrans('/\',', '- ')
    # Walk every mp3 under `path` and mirror it into BU_PATH/<artist>/<title>.mp3.
    for i in oss.find(path, '*.mp3'):
        ii = id3.ID3(i)
        try:
            # chkName presumably sanitizes tag text for use in file names — TODO confirm.
            title = chkName(ii['TITLE'])
            artist = chkName(ii['ARTIST'])
            print(artist, '---', title)
            if opts.sync:
                dir = BU_PATH + '/' + artist
                # Creating the artist directory can fail on bad artist strings;
                # both failure modes are converted to MyException so the outer
                # handler logs them and moves on to the next file.
                try:
                    if not oss.exists(dir):
                        oss.mkdir(dir)
                except IOError as ex:
                    print(i, "IOError: %s" % str(ex))
                    raise MyException('artist name error')
                except TypeError as ex:
                    print(i, "IOError: %s" % str(ex))
                    raise MyException('artist name error')
                fn = dir + '/' + title + '.mp3'
                if not oss.exists(fn):
                    # Not backed up yet: copy it over.
                    print('%s --> %s' % (i, fn))
                    cp(i, fn)
                elif opts.dup and not oss.cmp(i, fn):
                    # Same name but different contents — flag as a duplicate.
                    raise MyException('duplicate song')
        # Per-file errors are logged and skipped so one bad file
        # does not abort the whole scan.
        except KeyError as ex:
            print('%s -- KeyError: %s' % (i, str(ex)))
        except UnicodeDecodeError as ex:
            print('%s -- UnicodeDecodeError: %s' % (i, str(ex)))
        except IOError as ex:
            print('%s -- IOError: %s' % (i, str(ex)))
        except TypeError as ex:
            print('%s -- TypeError: %s' % (i, str(ex)))
        except MyException as ex:
            print('%s -- MyExceptionError: %s' % (i, str(ex)))
    oss.exit(0)
def ID3(k, df, pruning, is_result_continuous):
    """Run k-fold cross-validation of the ID3 learner over *df*.

    For each of the k folds, the fold's rows become the validation set and
    the remaining rows become the training set.  Each model and its accuracy
    are printed; the tree with the highest validation accuracy is returned.

    :param k: number of folds (k == 1 yields an empty training set).
    :param df: pandas DataFrame holding the full data set.
    :param pruning: pruning flag forwarded to id3.ID3.
    :param is_result_continuous: forwarded to id3.count_accuracy.
    :return: the tree with the best validation accuracy.
    """
    num_data_per_fold = round(len(df.index) / k)
    trees = []
    trees_accuracy = []
    # Creating models and counting accuracy
    for i in range(k):
        # first_idx_val / last_idx_val are the INCLUSIVE row bounds of fold i.
        first_idx_val = i * num_data_per_fold
        last_idx_val = (i + 1) * num_data_per_fold - 1
        if (first_idx_val != 0 and last_idx_val != (len(df.index) - 1)):
            # Middle fold: training data is everything above and below it.
            df_training_top = df.iloc[:first_idx_val, :].reset_index(drop = True)
            df_training_bottom = df.iloc[(last_idx_val + 1):, :].reset_index(drop = True)
            df_training = pandas.concat([df_training_top, df_training_bottom])
        elif (first_idx_val == 0 and last_idx_val != (len(df.index) - 1)):
            # First fold: train on everything after it.
            df_training = df.iloc[(last_idx_val + 1):, :].reset_index(drop = True)
        elif (first_idx_val != 0 and last_idx_val == (len(df.index) - 1)):
            # Last fold: train on everything before it.
            df_training = df.iloc[:first_idx_val, :].reset_index(drop = True)
        else:
            # k == 1: the single fold covers all rows, leaving nothing to train on.
            df_training = pandas.DataFrame()
        # BUG FIX: last_idx_val is inclusive (the training splits above use
        # last_idx_val + 1), but iloc slicing excludes its stop index, so the
        # original `first_idx_val:last_idx_val` silently dropped the last row
        # of every validation fold.  Slice to last_idx_val + 1 instead.
        df_validation = df.iloc[first_idx_val:(last_idx_val + 1), :].reset_index(drop = True)
        print("-------------------- MODEL", i+1, "--------------------")
        # We use the whole training data for pruning validation
        result_tree = id3.ID3(df, df_training, df_validation, pruning)
        trees.append(result_tree)
        result_accuracy = id3.count_accuracy(result_tree, [], df_validation, is_result_continuous)
        trees_accuracy.append(result_accuracy)
        # Printing each model and its accuracy
        id3.print_tree(trees[i], 0)
        print("ACCURACY:", trees_accuracy[i])
        print("")
    # Pick the first tree with the best accuracy (ties keep the earliest fold).
    best_tree_idx = 0
    best_accuracy = trees_accuracy[0]
    for i in range(k):
        if (trees_accuracy[i] > best_accuracy):
            best_tree_idx = i
            best_accuracy = trees_accuracy[i]
    return trees[best_tree_idx]
def ID3(df, pruning, is_result_continuous):
    """Train an ID3 tree on a 9:1 train/test split of *df*.

    Prints the learned tree, its accuracy on the held-out rows and the
    confusion matrix, then returns the trained tree.
    """
    # Split the rows 9:1 into training and testing partitions.
    boundary = round((9 / 10) * len(df.index))
    training_rows = df.iloc[:boundary, :].reset_index(drop=True)
    testing_rows = df.iloc[boundary:, :].reset_index(drop=True)

    # Learn the tree from the training partition (optionally pruned).
    learned_tree = id3.ID3(df, training_rows, testing_rows, pruning)

    # Evaluate performance on the held-out partition.
    accuracy = id3.count_accuracy(learned_tree, [], testing_rows, is_result_continuous)

    # Display the resulting tree and its metrics.
    id3.print_tree(learned_tree, 0)
    print('Akurasi :', accuracy)
    print('Confussion Matrix untuk Virginica, Versicolor, Setosa :')
    print(ConfusionMatrixID3(learned_tree, testing_rows))
    return learned_tree
def getSongs(path, verbose=None):
#-------------------------------------------------------------------------------
    """Scan *path* for mp3 files and collect their 'ARTIST/TITLE' tags.

    Returns a pair (set of tags, dict mapping each tag to its file path).
    Files whose tags are missing or undecodable are silently skipped.
    When *verbose* is truthy, a '*' is printed per file scanned.
    """
    tags = set()
    tag_to_file = {}
    for song_path in oss.find(path, '*.mp3'):
        if verbose:
            print('*', end='')
        meta = id3.ID3(song_path)
        try:
            tag = meta['ARTIST'] + '/' + meta['TITLE']
        except (KeyError, UnicodeDecodeError):
            # Missing or broken tag data — skip this file.
            continue
        tags.add(tag)
        tag_to_file[tag] = song_path
    if verbose:
        print('\n')
    return tags, tag_to_file
import csv
import id3
import numpy
import pandas
import sklearn

if __name__ == '__main__':
    # Load train/test data as lists of dicts keyed by the CSV header row.
    with open('../input/train.csv') as f:
        reader = csv.DictReader(f)
        train = list(reader)
    with open('../input/test.csv') as f:
        reader = csv.DictReader(f)
        test = list(reader)
    # Columns that carry no predictive signal for the tree.
    attrs_to_ignore = {'PassengerId', 'Name', 'Ticket', 'Cabin'}
    class_attr = 'Survived'
    attrs = set(train[0].keys())
    significant_attrs = attrs - {class_attr} - attrs_to_ignore
    # Build the decision tree and predict Survived for every test row.
    tree = id3.ID3(train, significant_attrs, class_attr)
    csvData = [['PassengerId', 'Survived']]
    for row in test:
        csvData.append([row['PassengerId'], tree.query(row)])
    # newline='' is required by the csv module when writing, otherwise
    # blank rows appear on Windows.  The `with` block closes the file, so
    # the original explicit close() inside it was redundant and is dropped.
    with open('person.csv', 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerows(csvData)
def main(argv):
#-------------------------------------------------------------------------------
    # Back up local mp3 files into BU_PATH/<artist>/<title>.mp3.
    # Flags: -d/--dup reports copies that differ from the backup,
    # -s/--show prints what would be copied without copying.
    args, opts = oss.gopt(argv[1:], [('d', 'dup'), ('s', 'show')], [], __doc__)
    # Artists in this set are skipped entirely.
    ign = getIgnoreSet()
    print("Cur Dir:", oss.pwd())
    for i in oss.find('.', '*.mp3'):
        ii = id3.ID3(i)
        # `err` stays True unless the whole per-file body completes cleanly.
        err = True
        try:
            # translate presumably sanitizes tag text for file names — TODO confirm.
            artist = translate(ii['ARTIST'])
            if artist in ['*', '<<unknown artist>>']:
                print(i, "artist name error: *")
                raise MyException('artist name error')
            if artist in ign:
                continue
            title = translate(ii['TITLE'])
            dir = BU_PATH + '/' + artist
            # Directory creation failures are converted to MyException so the
            # outer handler logs them and the scan continues.
            try:
                if not oss.exists(dir):
                    oss.mkdir(dir)
            except IOError as ex:
                print(i, "IOError: %s" % str(ex))
                raise MyException('artist name error')
            except TypeError as ex:
                print(i, "IOError: %s" % str(ex))
                raise MyException('artist name error')
            fn = dir + '/' + title + '.mp3'
            if not oss.exists(fn):
                # Not backed up yet: copy unless --show (dry run).
                print('%s --> %s' % (i, fn))
                if not opts.show:
                    cp(i, fn)
            elif opts.dup and not oss.cmp(i, fn):
                # Same name but different contents — flag as a duplicate.
                raise MyException('duplicate song')
            err = False
        # Per-file errors are logged and skipped so one bad file
        # does not abort the whole scan.
        except KeyError as ex:
            print('%s -- KeyError: %s' % (i, str(ex)))
        except UnicodeDecodeError as ex:
            print('%s -- UnicodeDecodeError: %s' % (i, str(ex)))
        except IOError as ex:
            print('%s -- IOError: %s' % (i, str(ex)))
        except TypeError as ex:
            print('%s -- TypeError: %s' % (i, str(ex)))
        except MyException as ex:
            print('%s -- MyExceptionError: %s' % (i, str(ex)))
        # Deliberately disabled (`if 0`): would quarantine files whose
        # processing failed into BU_PATH/id3_errors for later inspection.
        if 0 and err:
            dir = BU_PATH + '/id3_errors'
            f = dir + '/' + oss.basename(i)
            if not oss.exists(f):
                print('error:', i)
                cp(i, f)
    oss.exit(0)
import sys
import id3
import numpy as np

if __name__ == '__main__':
    # Parse an ARFF-format data set from stdin into an attribute list
    # and a NumPy array of rows.
    # NOTE(review): `result` is not used in the visible code — presumably
    # consumed further down the file; confirm before removing.
    result = id3.ID3()
    # Becomes True once the @data marker is seen; before that we are in
    # the attribute-declaration section of the file.
    attr_end = False
    attrs = []
    data = []
    # Read each input line
    for line in sys.stdin:
        line = line.strip("\n")
        # ignore the comments in the file
        if line.startswith("%"):
            continue
        if not attr_end:
            # While no "@data" is encountered, it will keep parsing attributes
            if line.startswith("@attribute") or line.startswith("@ATTRIBUTE"):
                # reads the line, removes "@attribute", appends the attr name to the array
                attrs.append(line.split()[1])
            # end attribute cycle if data is encountered
            elif line.startswith("@data") or line.startswith("@DATA"):
                attr_end = True
        else:
            # parse data into array, .arff data is separated by commas
            data.append(line.split(','))
    # print data
    # format the array with NumPy
    data = np.array(data)
# Learning-curve experiment: measure pruned vs unpruned tree accuracy as the
# number of training instances grows.  Relies on names defined earlier in the
# file: trainset/testset (per-experiment data), upper_limit, outputfile_tr,
# default, validation_set, and the id3 module.
print(len(trainset[0]))
# Step through training-set sizes in tenths of the first experiment's data.
step_size = len(trainset[0]) // 10
for length in range(10, upper_limit, step_size):
    print('Number of Training Instances:', length)
    outputfile_tr.write('Number of Training Instances: ' + str(length) + '\n')
    pruned_accuracies = []
    unpruned_accuracies = []
    # Average over 5 independent train/test splits at this training size.
    for experiment in range(5):
        train = trainset[experiment][:length]
        test = testset[experiment]
        # Pruned variant: grow, prune against the validation set, then score.
        tree = id3.ID3(train, default)
        id3.prune(tree, validation_set)
        acc = id3.accuracy(tree, test)
        pruned_accuracies.append(acc)
        # Unpruned variant: grow a fresh tree and score it directly.
        tree = id3.ID3(train, default)
        acc = id3.accuracy(tree, test)
        unpruned_accuracies.append(acc)
    avg_pruned_accuracies = sum(pruned_accuracies) / len(pruned_accuracies)
    avg_unpruned_accuracies = sum(unpruned_accuracies) / len(
        unpruned_accuracies)
    print('    Accuracy for Pruned tree: ' + str(avg_pruned_accuracies))
    print('Accuracy for Unpruned tree: ' + str(avg_unpruned_accuracies))
# Timed experiment driver: builds `iterations` modified-ID3 trees on the
# mushroom data set, accumulating build/test times, accuracies, depth and
# attribute usage.  Relies on `args`, `time` and `id3` defined earlier in
# the file; the trailing `list2` suggests a second variant follows below.
if len(args) != 4:
    print("Not enough arguments!")
    exit(1)
# args: [script, iterations, percentage_of_data, print-tree flag (1 = yes)]
iterations = int(args[1])
percentage_of_data = int(args[2])
if_print_tree = False
if (int(args[3]) == 1):
    if_print_tree = True
list1 = []           # per-iteration test accuracy percentages
times1 = [0, 0]      # accumulated [build time, test time] in seconds
tree1depth = 0
tree1_attr_used = 0
for i in range(0, iterations):
    # Time the data preparation + tree construction.
    timer1 = time.time()
    dtree = id3.ID3(percentage_of_data)
    dtree.prepare_data("mushroom.txt")
    dtree.root = dtree.modified_ID3(dtree.learningDataSet, dtree.attributes)
    timer2 = time.time()
    times1[0] += timer2 - timer1
    if if_print_tree is True:
        dtree.print_tree(dtree.root)
        print("\n\n")
    # Time the evaluation pass.
    timer1 = time.time()
    percent = dtree.test_tree()
    timer2 = time.time()
    times1[1] += timer2 - timer1
    list1.append(percent)
    # NOTE(review): /2 halving of max_tree_depth and the constant 22
    # (presumably the mushroom data's attribute count) are unexplained —
    # confirm their intent against the id3 module.
    tree1depth += dtree.max_tree_depth/2
    tree1_attr_used += (22 - len(dtree.attributes))
list2 = []