Ejemplo n.º 1
0
    def _fit(x, labels, y, default_val=False, prune=False):
        """Recursively build a decision (sub)tree, splitting on the best gain ratio.

        Args:
            x: 2-D NumPy array of examples, one row per example and one
                column per attribute.
            labels: list of attribute names aligned with the columns of ``x``.
            y: NumPy array of target values aligned with the rows of ``x``
                (assumed 1-D -- TODO confirm against callers).
            default_val: leaf value used when no attribute is left to split
                on. The sentinel ``False`` means "use ``mode(y)``".
            prune: when true, the finished node is converted with
                ``node.to_rule_list()`` and that result is returned instead.

        Returns:
            A ``Node`` (leaf or internal), or the value of
            ``node.to_rule_list()`` when ``prune`` is true.
        """
        x = C45.normalize_missing_attribute(x, y)
        # Return value is ignored; presumably x is processed in place --
        # TODO confirm against process_numeric.
        process_numeric(x, y)

        # Identity check against the sentinel: ``== False`` would also match a
        # legitimate explicit default of 0 and silently replace it.
        if default_val is False:
            default_val = mode(y)

        # All targets share one value: this branch is a pure leaf.
        if np.all(y == y[0]):
            return Node(str(y[0]), [], True)

        # No attribute left to split on: fall back to the default value.
        if x.shape[1] == 0:
            return Node(str(default_val), [], True)

        # Gain ratio of every attribute column.
        # NOTE(review): C45.splitinfo may be 0 for a single-valued column,
        # which would make this division fail or produce inf/nan -- confirm.
        entropy = ID3.count_entropy(y)
        gain = [ID3.gain(entropy, attr, y) / C45.splitinfo(attr) for attr in x.T]

        # Create the node from the best attribute.
        idx_max = np.argmax(gain)
        best_column = x.T[idx_max]
        attr_values = np.unique(best_column)

        node = Node(labels[idx_max], attr_values, False)

        # The chosen attribute is no longer available to the children.
        next_labels = labels.copy()
        next_labels.pop(idx_max)

        # Partition the rows by the value of the best attribute. A boolean
        # mask selects the rows in one step instead of growing the arrays
        # row by row with vstack/append.
        data_per_values = dict()
        for value in attr_values:
            mask = best_column == value
            value_x = np.delete(x[mask], idx_max, axis=1)
            data_per_values[value] = (value_x, y[mask])

        # Recursively build one child per attribute value.
        # NOTE(review): the recursion goes through ID3._fit and does not
        # forward default_val or prune -- confirm this is intended.
        for value, data in data_per_values.items():
            node.set_child(value, ID3._fit(data[0], next_labels, data[1]))

        if prune:
            # NOTE(review): the split below is computed but never used in the
            # visible code; the call is kept in case it has side effects.
            x_test, y_test, x_train, y_train = C45.train_test_split(x, y)
            ruleset = node.to_rule_list()
            node = ruleset

        return node
Ejemplo n.º 2
0
    def performTest(dataService, k):
        """Compare ID3 and C4.5 error rates over ``k`` train/test splits.

        For each split index an ID3 tree is generated and evaluated, then a
        C45 object adjusts that same tree and is evaluated. Per-split errors,
        the mean errors (in percent, rounded to 2 decimals) and finally the
        training-set size with its share of ``v.data`` are written through
        ``Test.save_to_file``; the two mean errors are also printed.

        Args:
            dataService: data provider exposing ``fetchData``, ``attrValues``
                and ``classes``.
            k: number of splits; also the divisor of the mean error.
        """
        dataService.fetchData()
        validator = Validation(dataService, k)
        id3_errors, c45_errors = [], []
        # One timestamp shared by every save_to_file call of this run.
        started_at = datetime.datetime.now().time()

        for fold in range(k):
            print(f'Iteration: {fold}, error rate:')
            train, test = validator.split_to_train_test(fold)

            id3_model = ID3(train, dataService.attrValues, dataService.classes)
            tree = id3_model.generateTree()
            id3_errors.append(id3_model.evaluate(test))

            # The C45 object works on the tree produced by ID3 above.
            c45_model = C45(train, dataService.attrValues, dataService.classes)
            c45_model.adjustWithC45(tree)
            c45_errors.append(c45_model.evaluateC45Tree(test))

            Test.save_to_file(k, started_at, id3_errors[-1], c45_errors[-1])

        mean_id3 = round(100 * sum(id3_errors) / k, 2)
        mean_c45 = round(100 * sum(c45_errors) / k, 2)
        Test.save_to_file(k, started_at, mean_id3, mean_c45)

        # `train` is the training set of the last split.
        train_size = len(train)
        train_share = (train_size / len(validator.data)) * 100
        Test.save_to_file(k, started_at, train_size, train_share)

        print(f'ID3 mean error: {mean_id3}%')
        print(f'C45 mean error: {mean_c45}%')
Ejemplo n.º 3
0
 def __init__(self, filename=None):
     """Wrap a fresh ``ID3`` object and optionally load a file into it.

     The wrapped object's ``load``, ``save`` and ``delete`` methods are
     exposed directly as attributes of this instance.

     Args:
         filename: when not ``None``, passed to ``load`` right away.
     """
     backend = ID3()
     self.__id3 = backend
     self.load = backend.load
     self.save = backend.save
     self.delete = backend.delete
     if filename is None:
         return
     self.load(filename)
Ejemplo n.º 4
0
    def task5(self,printTree = True, printPrecision = True):
        """Perform task 5: build the tree from continuous data and report accuracy.

        The tree is built from 'train_continuous.csv' and stored in
        ``self.arbre_advance``; accuracy is then measured on
        'test_public_continuous.csv' through ``self.precision``.

        Args:
            printTree: when true, print the decision tree.
            printPrecision: when true, print the measured accuracy.
        """
        # A former experiment, kept here as dead code in a string literal,
        # swept the third argument of construit_arbre over
        # np.linspace(0.4, 4, 60) and plotted the accuracy against it (x-axis
        # labelled 'accuracy_factor'). Only the single fixed value is kept.
        print('Building the tree (Task 5)...')
        donnees = self.importData('train_continuous.csv')
        id3 = ID3()

        # construit_arbre returns an indexable result; element 0 is the tree.
        self.arbre_advance = id3.construit_arbre(donnees,True,0.7)[0]

        if printTree:
            print('Decision tree :')
            print(self.arbre_advance.__repr__(notEg = True))

        print()

        precision = self.precision(self.importData("test_public_continuous.csv"),True)
        if printPrecision:
            print('Testing the tree...')
            print(f'Accuracy = {precision:5.2f}%')

        print()
def crossValidation(data, rules):
    """Evaluate ID3 by holding out one case at a time.

    Each pass removes the first case from ``data``, trains a new ``ID3`` on
    the remaining cases with a fresh copy of ``rules``, classifies the
    held-out case and appends it back, so ``data`` holds the same cases in
    the same order once the loop has finished.

    A hit counter and a nested ``{actual: {predicted: count}}`` table keyed
    by the case's 'Type' are accumulated.

    NOTE(review): the visible code does not return either of them -- the
    function currently returns ``None``.

    Args:
        data: list of cases (mappings with a 'Type' key); mutated during
            the run.
        rules: mapping passed (as a shallow copy) to ``ID3.train``.
    """
    hits = 0
    confusion = {}
    for _ in range(len(data)):
        # Give every model its own copy so training cannot alter `rules`.
        rules_copy = {name: rules[name] for name in rules.keys()}
        held_out = data.pop(0)

        model = ID3()
        model.train(data, rules_copy)
        predicted = model.classify(held_out)

        actual = held_out['Type']
        if predicted == actual:
            hits += 1

        per_actual = confusion.setdefault(actual, {})
        per_actual[predicted] = per_actual.get(predicted, 0) + 1

        data.append(held_out)
    def __init__(self):
        """Run tasks 1, 3 and 5 while constructing the object.

        Builds a tree from the discrete training data and prints its
        precision, generates rules from that tree and calls
        ``tk3.explain_and_cure``, then builds a second tree from the
        continuous training data and prints its precision.
        """
        # Task 1: tree on the discrete data.
        self.train_discrete = csv_to_array('train_bin.csv')
        discrete_test = csv_to_array('test_public_bin.csv')
        self.arbre = ID3().construit_arbre(self.train_discrete)
        self.print_precision(self.arbre, discrete_test)

        # Task 3: rules derived from the tree, seeded with one empty rule.
        self.faits_initiaux = discrete_test
        seed_rules = [reglesansvariables.RegleSansVariables("", set())]
        self.regles = rules_generator(self.arbre, seed_rules)
        tk3.explain_and_cure(self.faits_initiaux, self.arbre,
                             self.healthy_rules())

        # Task 5: tree on the continuous data.
        continuous_train = csv_to_array('train_continuous.csv')
        continuous_test = csv_to_array('test_public_continuous.csv')
        self.arbre_advance = ID3_cont().construit_arbre(continuous_train)
        self.print_precision(self.arbre_advance, continuous_test)
Ejemplo n.º 7
0
def test_id3():
    """Train ID3 on a small weather table and check one prediction.

    The tree is trained with ``shanon_gain`` on 13 samples and must predict
    ``0`` for a strong-wind, rainy, normal-temperature, high-humidity case.
    """
    goal_attr = 'play'

    # Attribute name -> list of admissible values.
    universes = {
        'wind': ['strong', 'weak'],
        'wheather': ['sunny', 'cloudy', 'rainny'],
        'temperature': ['cold', 'norm', 'hot'],
        'humidity': ['norm', 'high'],
    }

    samples = pd.DataFrame({
        goal_attr: [0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1],
        'wind': ['weak', 'strong', 'weak', 'weak', 'weak', 'strong',
                 'strong', 'weak', 'weak', 'weak', 'strong', 'strong',
                 'weak'],
        'wheather': ['sunny', 'sunny', 'cloudy', 'rainny', 'rainny',
                     'rainny', 'cloudy', 'sunny', 'sunny', 'rainny',
                     'sunny', 'cloudy', 'cloudy'],
        'temperature': ['hot', 'hot', 'hot', 'norm', 'cold', 'cold', 'cold',
                        'norm', 'cold', 'norm', 'norm', 'norm', 'hot'],
        'humidity': ['high', 'high', 'high', 'high', 'norm', 'norm', 'norm',
                     'high', 'norm', 'norm', 'norm', 'high', 'norm'],
    })

    tree = ID3(shanon_gain).train(samples, {goal_attr: [0, 1]}, universes)

    query = {
        'wind': 'strong',
        'wheather': 'rainny',
        'temperature': 'norm',
        'humidity': 'high',
    }

    assert tree.predict(query) == 0
Ejemplo n.º 8
0
    def task1(self,printTree = True):
        """Perform task 1: build the tree from 'train_bin.csv' and print depth data.

        The result of ``construit_arbre`` is indexed: element 0 is stored in
        ``self.arbre`` and element 1 in ``self.attributs``.

        Args:
            printTree: when true, print the decision tree.
        """
        print('Building the tree (Task 1)...')
        training_rows = self.importData('train_bin.csv')

        built = ID3().construit_arbre(training_rows)
        self.attributs = built[1]
        self.arbre = built[0]

        if printTree:
            print('Decision tree :')
            print(self.arbre)

        # getDepth() is read positionally: average depth, maximum depth,
        # maximum number of children.
        depth_stats = self.arbre.getDepth()
        captions = (
            'Average Depth : ',
            'Maximum Depth : ',
            'Maximum Number of Children : ',
        )
        for position, caption in enumerate(captions):
            print(f'{caption}{depth_stats[position]:5.2f}')

        print()
Ejemplo n.º 9
0
    def train(self, n_tree, n_data, n_attr, dataset, goal_attr, attrs):
        """Train the random forest by building ``n_tree`` independent ID3 trees.

        Each tree is trained on its own batch from ``self._train_split`` and
        on a random subset of ``n_attr`` attributes. The trees are stored in
        ``self.forest``, replacing any previous forest.

        Args:
            n_tree: number of trees to build.
            n_data: percentage of the data fed to each tree.
            n_attr: number of attributes considered by each tree, in [1, n].
            dataset: dataframe holding all the data.
            goal_attr: dict mapping the output name to its universe.
            attrs: dict mapping each expected attribute name to its universe.
        """
        self.forest = []

        for _ in range(n_tree):
            # Draw this tree's training batch.
            batch = self._train_split(dataset, n_data)

            learner = ID3(self.gain)
            # Random attribute subset, turned back into a name -> universe dict.
            chosen_attrs = dict(sample(list(attrs.items()), k=n_attr))

            tree = learner.train(batch, goal_attr, chosen_attrs)
            self.forest.append(tree)
Ejemplo n.º 10
0
# -*- coding: utf-8 -*-
import pandas as pd
from id3 import ID3
import numpy as np

# Fixed seed so the train/test split is reproducible.
np.random.seed(1993)

# Load the full data set.
all_data = pd.read_csv('./nursery_data/all.csv')

# Use a random permutation to pick 1000 rows as the test set; every
# remaining row becomes training data.
permutation = np.random.permutation(len(all_data))[:1000]
held_out = all_data.iloc[permutation]
result = held_out['classes'].values
test_data = held_out.drop('classes', axis=1)
train_data = all_data.drop(permutation)

# Train the tree and render it to disk.
id3_solver = ID3(train_data, target='classes')
id3_solver.run()
id3_solver.render_decision_tree('./nursery_data/dtree')

# Score the predictions against the held-out labels.
predict = id3_solver.predict(test_data, force=True)
accuracy = id3_solver.score(predict, result)
print(f'The accuracy of the prediction of test data is {accuracy}')
Ejemplo n.º 11
0
from c45_numeric_handler import process_numeric
from Rule import Rule

if __name__ == "__main__":
    data = read_csv('Bagian B/datasets/iris.csv')

    # Row 0 supplies the attribute names; the last column is the target.
    header, records = data[0], data[1:]
    label = header[:-1].tolist()
    x = records[:, :-1]
    target = records[:, -1:].flatten()

    # ID3
    print("=====ID 3=====")
    id3 = ID3()
    id3.label = label
    id3.fit(x, target)

    # C45
    print("=====C45=====")
    c45 = C45()
    c45.label = label
    c45.fit(x, target)

    # Predict the first example only, kept 2-D.
    print(c45.predict(x[:1, :]))
Ejemplo n.º 12
0
from id3 import ID3

# Four data-file paths handed positionally to the ID3 constructor.
DATA_FILES = (
    "../data/car.data",
    "../data/car.names",
    "../data/test.data",
    "../data/test2.data",
)

c1 = ID3(*DATA_FILES)

# Load the data, build the tree, then print it.
c1.fetchData()
c1.generateTree()
c1.printTree()
Ejemplo n.º 13
0
from data_load import clear_data, load, binarize_data, train_test_split
from id3 import ID3
from stat import tree_prune_stat

m_data = clear_data(load("./mushroom.txt"))

# Swap the first and last field of every record.
for record in m_data:
    record[0], record[-1] = record[-1], record[0]

m_binary = binarize_data(m_data)
m_train, m_test = train_test_split(m_binary, 0.8)
m_tree = ID3(m_train)

tree_prune_stat(m_tree, m_train, m_test)
Ejemplo n.º 14
0
import sys
from id3 import ID3
from data import Dataframe
import copy

def _read_frame(path):
    """Create an empty ``Dataframe`` and populate it from ``path``."""
    frame = Dataframe("")
    frame.read_data(frame, path)
    return frame


model = ID3()

# argv[1] is the training file; it is read twice into separate objects.
datafile = sys.argv[1]
dataset = _read_frame(datafile)
dataset_copy = _read_frame(datafile)

# Exactly two arguments: plain fit. Otherwise argv[3] is forwarded to fit2.
if len(sys.argv) == 3:
    root = model.fit(dataset, dataset_copy, dataset.attributes,
                     dataset.target_attribute)
else:
    root = model.fit2(dataset, dataset_copy, dataset.attributes,
                      dataset.target_attribute, sys.argv[3], 0)
print("[BRANCHES]:")

model.printAllRootToLeafPaths(root)

# argv[2] is the test file.
datafile_test = sys.argv[2]
dataset_test = _read_frame(datafile_test)

predictions = []
for row in dataset_test.rows:
Ejemplo n.º 15
0
    def __init__(self):
        """Run tasks 1 to 5 in order while constructing the object.

        Builds a tree from the binary training data, prints its precision on
        the binary test data, generates and prints rules, justifies examples,
        runs the diagnostic on every test patient, then repeats the tree
        construction and precision measurement on the continuous data.

        Attributes set: ``faits_initiaux``, ``arbre``, ``regles`` and
        ``arbre_advance``.
        """
        id3 = ID3()
        # Import data
        donnee_train = traitement_donnees.import_donnee(self,"../Data/train_bin.csv")
        donnee_test = traitement_donnees.import_donnee_test(self,"../Data/test_public_bin.csv")
        self.faits_initiaux = donnee_train

        # Task 1 : Build tree
        self.arbre = id3.construit_arbre(donnee_train)
        print(self.arbre)

        # Task 2 : Precision of the tree
        # n counts the test examples, p those whose last classification
        # element equals the example's 'target'.
        # NOTE(review): p/n raises ZeroDivisionError on an empty test set.
        n = 0
        p = 0
        for donnee in donnee_test :
            # NOTE(review): classification goes through self.classifie here but
            # through self.arbre_advance.classifie in task 5 -- confirm both exist.
            model_result = self.classifie(donnee, self.arbre)
            if model_result[-1] == donnee['target']:
                p = p+1
            n = n+1
        print("Precision : " + str (p/n))

        # Task 3 : generate rules
        self.regles = self.generation_regle(self.arbre)
            # Print rules
        r = 0
        for regle in self.regles:
            r += 1
            print(str(r) + ') ' + self.ecrit_regle(regle))
            # Justification of an example using the rules
        # `conflict` is shared by the call below and by the loop that follows;
        # presumably the justification calls append to it -- TODO confirm.
        conflict = []
        print(self.justifie_exemple(donnee_test[1], self.regles, self.arbre, conflict)) #any patient can be used as an example, we just chose to only print one
            # Rules precision (should be the same as the precision of the tree they come from)
        n_ex = 0
        for ex in donnee_test:
            # NOTE(review): called on self.arbre here but on self above, with a
            # different argument list -- confirm both methods exist.
            justification  = self.arbre.justifie_exemple(ex, self.regles, conflict) #justification can be printed in case someone wants to see the justification for each patient of the test data
            n_ex += 1
        # Success rate = 1 - (number of recorded conflicts / number of examples).
        print('Taux de succes des justifications : ' + str(1 - len(conflict)/n_ex))

        #Task 4 : try to help the patients classified as sick by the tree
        # `d` is handed to every diagnostic call; its final length is reported
        # as the number of patients helped.
        d=[]
        for patient in donnee_test:
            self.arbre.diagnostic(self.regles,patient, d)
        print ('On a pu aider ' + str(len(d)) + ' patients en changeant 2 parametres au maximum.')

        # Task 5
        id3_pt5= ID3_PT5()
            #Import continuous data
        donnee_train_continue = traitement_donnees.import_donnee(self,"../Data/train_continuous.csv")
        donnee_test_continue = traitement_donnees.import_donnee_test(self,"../Data/test_public_continuous.csv")
            #Build continuous tree
        self.arbre_advance = id3_pt5.construit_arbre(donnee_train_continue)
        print(self.arbre_advance)
            #Accuracy of the continuous tree
        # Same counting scheme as task 2, on the continuous test data.
        n = 0
        p = 0
        for donnee in donnee_test_continue :
            model_result = self.arbre_advance.classifie(donnee)
            if model_result[-1] == donnee['target']:
                p = p+1
            n = n+1
        print("Precision : " + str(p/n))
Ejemplo n.º 16
0
from data_load import load
from id3 import ID3
from stat import tree_prune_stat

# Load the training rows, then the test rows, both cast to int.
dane, test = (
    load(path, cast_to_int=True) for path in ("./data1.txt", "./test1.txt")
)
tree = ID3(dane)

tree_prune_stat(tree, dane, test)
Ejemplo n.º 17
0
Archivo: dna.py Proyecto: Yao1993/ID3
import numpy as np

# Load the data, drop the 'name' column and trim whitespace around each
# DNA string.
all_data = pd.read_csv('./dna_data/all.csv').drop('name', axis=1)
all_data['dna'] = all_data['dna'].apply(lambda seq: seq.strip())
all_data['dna_len'] = all_data['dna'].apply(len)

# One column for the class plus one per character position (d0 .. d59).
columns = ['system'] + ['d{}'.format(pos) for pos in range(60)]
modified_data = pd.DataFrame(columns=columns, index=all_data.index)

# Spread every DNA string over the per-position columns.
for index, row in all_data.iterrows():
    modified_data.iloc[index] = [row['system'], *row['dna']]

# Hold out 100 random rows for testing; the rest is training data.
permutation = np.random.permutation(len(modified_data))[:100]
held_out = modified_data.iloc[permutation]
result = held_out['system'].values
test_data = held_out.drop('system', axis=1)
train_data = modified_data.drop(permutation)

id3_solver = ID3(train_data, target='system')
id3_solver.run()
id3_solver.render_decision_tree('./dna_data/dtree')

predict = id3_solver.predict(test_data, force=True)
accuracy = id3_solver.score(predict, result)
print(f'The accuracy of the prediction of test data is {accuracy}')
Ejemplo n.º 18
0
from id3 import ID3
from anytree import RenderTree

# Column names shared by every sample; the last one is the class.
FIELDS = ("Outlook", "Temperature", "Humidity", "Wind", "Sport")

ROWS = (
    ("Sunny", "Hot", "High", "Weak", "No"),
    ("Sunny", "Hot", "High", "Strong", "No"),
    ("Overcast", "Hot", "High", "Weak", "Yes"),
    ("Rain", "Mild", "High", "Weak", "Yes"),
    ("Rain", "Cool", "Normal", "Weak", "Yes"),
    ("Rain", "Cool", "Normal", "Strong", "No"),
    ("Overcast", "Cool", "Normal", "Strong", "Yes"),
    ("Sunny", "Mild", "High", "Weak", "No"),
    ("Sunny", "Cool", "Normal", "Weak", "Yes"),
    ("Rain", "Mild", "Normal", "Weak", "Yes"),
    ("Sunny", "Mild", "Normal", "Strong", "Yes"),
    ("Overcast", "Mild", "High", "Strong", "Yes"),
    ("Overcast", "Hot", "Normal", "Weak", "Yes"),
    ("Rain", "Mild", "High", "Strong", "No"),
)

# Samples as dicts keyed by field name, and the list of input attributes.
S = [dict(zip(FIELDS, row)) for row in ROWS]
A = list(FIELDS[:-1])

decision_tree = ID3(S, A)

# Show the tree produced by ID3
for prefix, _filling, node in RenderTree(decision_tree.T):
    print(f"{prefix}{node.name}")
Ejemplo n.º 19
0
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from id3 import ID3

COLUMNS = ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis']

OBSERVATIONS = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "Normal", "Weak", "Yes"],
    ["Sunny", "Mild", "Normal", "Strong", "Yes"],
    ["Overcast", "Mild", "High", "Strong", "Yes"],
    ["Overcast", "Hot", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Strong", "No"],
]

# NOTE(review): `np` is not imported in the visible import lines -- confirm
# that `import numpy as np` exists above.
data = pd.DataFrame(np.array(OBSERVATIONS), columns=COLUMNS)

# Only these three attributes are offered to the learner.
attributes = ['Humidity', 'Wind', 'Outlook']
target_attribute = "PlayTennis"

id3_instance = ID3()
id3_instance.fit(data, target_attribute, attributes)

print("xxxxxxxxxxxxxxxxxxxxxxxxx")
id3_instance.traverse("")
Ejemplo n.º 20
0
    # general settings
    attrs = {'Age': age_labels, 'Pclass': [1, 2, 3], 'Sex': ['male', 'female']}

    goal_attr = {'Survived': [0, 1]}

    train_data, test_data = train_test_split(data, args['test'])

    # now we build each method
    if args['predictor'] == 'random_forest':
        predictior = RandomForest(shanon_gain)
        predictior.train(args['num_tree'], args['n_data'], args['n_attr'],
                         train_data, goal_attr, attrs)
    else:
        gain_func = shanon_gain if args['gain'] == 'shanon' else gini_gain
        id3 = ID3(gain_func)
        predictior = id3.train(train_data, goal_attr, attrs)
    # now we test
    predictions = []
    # make predictions
    for index, case in test_data.iterrows():
        case = case.to_dict()
        predictions.append(predictior.predict(case))

    # build confusion matrix
    labels = [1, 0]
    conf_matrix = confusion_matrix(test_data.Survived.to_list(),
                                   predictions,
                                   labels=labels)
    df_cm = pd.DataFrame(conf_matrix,
                         index=['survived', 'not survived'],