def test_fit():
    """Fitting on the breast-cancer data must always split on feature 22
    at the root, whichever pruning hyper-parameters are used."""
    bunch = load_breast_cancer()

    # Default params, depth-limited, and min-samples-limited trees should
    # all agree on the root split.
    for params in ({}, {"max_depth": 2}, {"min_samples_split": 20}):
        estimator = Id3Estimator(**params)
        estimator.fit(bunch.data, bunch.target)
        assert_equal(estimator.tree_.root.value, 22)
 def draw_graph(self, x, y):
     """Fit two decision trees on (x, y) and export them as PDF graphs.

     An Id3Estimator is rendered to SVC_Tree.pdf, and a sklearn
     DecisionTreeClassifier to SVC_Tree_v1.pdf / SVC_Tree_v2.pdf
     (plain and styled renderings).  Always returns True; all output
     goes to .dot/.pdf files in the working directory.
     """
     # Decision Tree Graph: fit the ID3 estimator and print a text dump.
     clf = Id3Estimator()
     clf.fit(x, y, check_input=True)
     #clf.predict_proba(x)
     print(export_text(clf.tree_, self.feature_names))

     # export tree.dot as pdf file to write Decision Tree as a graph
     dot_data = StringIO()  # NOTE(review): unused by the ID3 export below
     #tree.export_graphviz(clf, out_file = dot_data)
     export_graphviz(clf.tree_, 'SVC_Tree.dot', self.feature_names)
     graph = pydot.graph_from_dot_file('SVC_Tree.dot')
     graph[0].write_pdf("SVC_Tree.pdf")

     # Fit the sklearn CART tree on the same data for comparison.
     clf = DecisionTreeClassifier()
     clf = clf.fit(x,y)
     clf.predict(x, check_input=True)
     clf.predict_proba(x)

     # version v1 pdf output (default export, no styling)
     dot_data = tree.export_graphviz(clf, out_file='SVC_Tree_v1.dot')
     graph = pydot.graph_from_dot_file('SVC_Tree_v1.dot')
     graph[0].write_pdf("SVC_Tree_v1.pdf")

     # version v2 pdf output (styled: filled, rounded, with class/feature names)
     dot_data = tree.export_graphviz(clf, out_file="SVC_Tree_v2", feature_names=self.feature_names, class_names=self.target, filled=True, rounded=True, special_characters=True)
     #dot_data = tree.export_graphviz(clf, out_file="Decision-Tree-Regression-v2", feature_names=feature_names, class_names=target.name, filled=True, rounded=True, special_characters=True)
     graph = graphviz.Source(dot_data)
     # NOTE(review): graphviz.Source only renders dot text when
     # out_file=None is passed to export_graphviz; with out_file set,
     # dot_data holds a file path instead — confirm intended behaviour.
     graph

     # save graph version 2 as pdf data file
     graph = pydot.graph_from_dot_file('SVC_Tree_v2')
     graph[0].write_pdf("SVC_Tree_v2.pdf")
     return True
Exemple #3
0
def Tree():
    """Build and return an ID3 tree from przypadki.txt / wyniki.txt.

    przypadki.txt holds space-separated samples (7 columns); the first
    line of wyniki.txt holds one digit label per sample.  The fitted tree
    is exported to out.dot and printed as text.
    """
    names = ["tarcza", "czy lata", "wiek", "zbroja", "hp", "level", "potwor"]

    # Fix: 'rU' mode was removed in Python 3.11, and the handle used only
    # for counting lines was never closed — use a context manager.
    with open('przypadki.txt', 'r') as f:
        count = len(f.readlines())
    x = []

    # NOTE(review): range(1, count) reads lines 1..count-1, skipping the
    # file's last line — confirm this is intentional.
    for i in range(1, count):
        line = linecache.getline('przypadki.txt', i).split(" ")
        line[6] = str(line[6][0])  # keep only the first char of column 6
        x.append(line)
    X = np.asarray(x)
    print(X)

    # One digit per label on the first line, minus the trailing 2 chars.
    y = np.array([int(i) for i in linecache.getline('wyniki.txt', 1)[:-2]])
    yd = [int(i) for i in linecache.getline('wyniki.txt', 1)[:-2]]

    # Human-readable table: header row plus each sample with its label.
    d = []
    d.append(names)
    # NOTE(review): this also mutates `names` (d[0] is the same list), so
    # the exports below receive names including "wynik".
    d[0].append("wynik")
    for i in range(0, len(yd)):
        d.append(x[i] + [yd[i]])
    print(d)

    clf = Id3Estimator()
    clf.fit(X, y, check_input=True)

    export_graphviz(clf.tree_, "out.dot", names)
    print(export_text(clf.tree_, names))
    return clf
def test_nominal():
    """With purely nominal features the root must split on feature 3 and
    the held-out samples must be labelled exactly as expected."""
    estimator = Id3Estimator()
    estimator.fit(X, y_nom)
    assert_equal(estimator.tree_.root.value, 3)

    predicted = estimator.predict(X_nom_test)
    assert_equal(predicted, y_nom_test)
Exemple #5
0
def Task2():
    """Compare ID3 against a random forest via 10-fold stratified CV.

    Parses diabetes.arff, materializes the ten train/test fold files with
    stratifiedCrossValidation2, then lets CV_learn report mean/std
    accuracy for each classifier.
    """
    dia, dia_meta = parser("diabetes.arff")

    # Writes train<i>.arff / test<i>.arff for i in 0..9.
    stratifiedCrossValidation2(dia, dia_meta, 10)

    # Fix: removed five accumulator lists (means, stds, accs, means2,
    # stds2) that were never read; fold paths built by comprehension.
    test_paths = ["test" + str(i) + ".arff" for i in range(10)]
    train_paths = ["train" + str(i) + ".arff" for i in range(10)]

    id3 = Id3Estimator()

    print("ID3: ")
    mean_1, std_1 = CV_learn(id3, train_paths, test_paths, 10, dia_meta)

    print("\nRFC: ")
    rfc = RandomForestClassifier(max_depth=50, random_state=0)
    mean_2, std_2 = CV_learn(rfc, train_paths, test_paths, 10, dia_meta)
Exemple #6
0
def CreaMatrice():
    """Build an ID3 tree from a user-chosen whitespace-separated file and
    write its text rendering to matrice.txt.

    The first line of the chosen file holds the attribute names; every
    following line is one sample whose last column is the class label.
    """
    filename = askopenfilename(title="Ouvrir votre document",
                               filetypes=[('txt files', '.txt'),
                                          ('all files', '.*')])
    # Fix: the original opened the file twice and never closed either
    # handle; the first open read the whole content for nothing.  Also
    # dropped the unused nbAttributs/compte locals.
    with open(filename, "r") as fichier:
        feature_names = fichier.readline().split()
        fichierX = [ligne.split() for ligne in fichier]

    # The last column of every row is the target; strip it in place.
    attributCible = [ligne.pop() for ligne in fichierX]

    X = np.array(fichierX)
    y = np.array(attributCible)

    clf = Id3Estimator()
    clf.fit(X, y, check_input=False)

    # Render the tree once, print it and persist it.
    texte = export_text(clf.tree_, feature_names)
    print(texte)
    with open("matrice.txt", "w") as save:
        save.write(texte)
 def generate_tree(self, max_depth):
     """Fit an ID3 tree on self.dataframe and export it as .dot and .png.

     Column 0 of the dataframe is the target (survived); the remaining
     columns are the features.  The tree is written to
     ./output/<run_id>/tree.dot and rendered with the Graphviz CLI.
     """
     self.__print(
         "\n-------------------------------------- Modelling ------------------------------------------\n"
     )
     self.__create_output_dir()
     self.__print("\n-----DECISION TREE GENERATION-----\n")

     # Build the artifact paths once instead of re-concatenating them.
     dot_path = "./output/" + self.run_id + "/tree.dot"
     png_path = "./output/" + self.run_id + "/tree.png"
     self.__print("Output file names: " + dot_path + " " + png_path)

     # Target column vs. feature columns (and their names).
     target = self.dataframe.iloc[:, 0]
     features = self.dataframe.iloc[:, 1:]
     feature_names = list(features.columns.values)

     # Fit the estimator and export the resulting tree as .dot.
     self.estimator = Id3Estimator(max_depth)
     self.estimator = self.estimator.fit(features, target)
     export_graphviz(self.estimator.tree_, dot_path, feature_names)

     # Render the .dot to PNG via Graphviz (-Tpng; other -T formats work too).
     os.system("dot -Tpng " + dot_path + " -o " + png_path)
Exemple #8
0
 def cpuUsageDecisionTree(self):
     """Train an ID3 tree on the CSV usage data and export it to out.dot."""
     features, labels = self.get_data_from_csv()

     # Column labels for the exported graph, matching the CSV layout.
     columns = [
         "vm_id_map", "timestamp_new", "cpu_usage_percent",
         "admin_historic_decision_cpu"
     ]

     estimator = Id3Estimator()
     estimator.fit(features, labels, check_input=True)
     export_graphviz(estimator.tree_, "out.dot", columns)
Exemple #9
0
def id3():
  """Train an ID3 tree on columns 3-5 of Task4_Data.csv and export it.

  Column 6 is the target.  Fix: the CSV is now read once instead of
  twice (the original read it a first time just for the header row).
  """
  df = pd.read_csv('Task4_Data.csv')
  headers = df.columns.values[3:6]

  values = df.values
  y = values[:, 6]
  X = values[:, 3:6]

  clf = Id3Estimator()
  clf.fit(X, y, check_input=True)
  export_graphviz(clf.tree_, 'tree.dot', headers)
Exemple #10
0
def main():
    """Train an ID3 tree on Notre Dame football games and classify a slate.

    Each historical game carries (Opponent, Home/Away, AP Top 25 status,
    broadcaster) plus a Win/Lose outcome.
    """
    feature_names = ["Opponent", "Home/Away", "AP Top 25", "Media"]

    # (Opponent, Home/Away, AP Top 25, Media, Result) per past game.
    games = [
        ('Texas', 'Home', 'Out', '1-NBC', 'Win'),
        ('Virginia', 'Away', 'Out', '4-ABC', 'Win'),
        ('GeorgiaTech', 'Home', 'In', '1-NBC', 'Win'),
        ('UMass', 'Home', 'Out', '1-NBC', 'Win'),
        ('Clemson', 'Away', 'In', '4-ABC', 'Lose'),
        ('Navy', 'Home', 'Out', '1-NBC', 'Win'),
        ('USC', 'Home', 'In', '1-NBC', 'Win'),
        ('Temple', 'Away', 'Out', '4-ABC', 'Win'),
        ('PITT', 'Away', 'Out', '4-ABC', 'Win'),
        ('WakeForest', 'Home', 'Out', '1-NBC', 'Win'),
        ('BostonCollege', 'Away', 'Out', '1-NBC', 'Win'),
        ('Stanford', 'Away', 'In', '3-FOX', 'Lose'),
        ('Texas', 'Away', 'Out', '4-ABC', 'Lose'),
        ('Nevada', 'Home', 'Out', '1-NBC', 'Win'),
        ('MichiganState', 'Home', 'Out', '1-NBC', 'Lose'),
        ('Duke', 'Home', 'Out', '1-NBC', 'Lose'),
        ('Syracuse', 'Home', 'Out', '2-ESPN', 'Win'),
        ('NorthCarolinaState', 'Away', 'Out', '4-ABC', 'Lose'),
        ('Stanford', 'Home', 'In', '1-NBC', 'Lose'),
        ('MiamiFlorida', 'Home', 'Out', '1-NBC', 'Win'),
        ('Navy', 'Home', 'Out', '5-CBS', 'Lose'),
        ('Army', 'Home', 'Out', '1-NBC', 'Win'),
        ('VirginiaTech', 'Home', 'In', '1-NBC', 'Lose'),
        ('USC', 'Away', 'In', '4-ABC', 'Lose'),
    ]
    X = np.array([game[:4] for game in games])
    y = np.array([game[4] for game in games])

    clf = Id3Estimator()
    clf.fit(X, y, check_input=True)
    print("Training:")
    print(export_text(clf.tree_, feature_names))

    # Upcoming games to classify with the fitted tree.
    testing = [
        ["Temple", "Home", "Out", "1-NBC"],
        ["BostonCollege", "Away", "Out", "2-ESPN"],
        ["MichiganState", "Away", "Out", "3-FOX"],
        ["USC", "Home", "In", "1-NBC"],
        ["NorthCarolinaState", "Home", "Out", "1-NBC"],
        ["WakeForest", "Home", "Out", "1-NBC"],
        ["MiamiFlorida", "Away", "In", "4-ABC"],
        ["Navy", "Home", "Out", "1-NBC"],
        ["Stanford", "Away", "In", "4-ABC"],
    ]
    print("\n\nTesting:")
    print(clf.predict(testing))
def test_predict():
    """A tree fit on the full breast-cancer set must reproduce its own
    training labels and assign class 0 to a known reference sample."""
    model = Id3Estimator()
    cancer = load_breast_cancer()
    model.fit(cancer.data, cancer.target)

    # One full 30-feature vector expected to be classified as 0.
    sample_row = np.array([
        20.57, 17.77, 132.9, 1326, 0.08474, 0.07864, 0.0869, 0.07017, 0.1812,
        0.05667, 0.5435, 0.7339, 3.398, 74.08, 0.005225, 0.01308, 0.0186,
        0.0134, 0.01389, 0.003532, 24.99, 23.41, 158.8, 1956, 0.1238, 0.1866,
        0.2416, 0.186, 0.275, 0.08902
    ]).reshape(1, -1)

    assert_almost_equal(model.predict(cancer.data), cancer.target)
    assert_almost_equal(model.predict(sample_row), 0)
Exemple #12
0
def test_predict_proba():
    """predict_proba must return one row per sample, one column per class,
    and rows that sum to 1."""
    estimator = Id3Estimator()
    bunch = load_breast_cancer()
    estimator.fit(bunch.data, bunch.target)

    # Shape check: (n_samples, n_classes) on the breast-cancer data.
    probs = estimator.predict_proba(bunch.data)
    assert_equal(probs.shape[0], bunch.data.shape[0])
    assert_equal(probs.shape[1], estimator.tree_.y_encoder.classes_.shape[0])

    # Probability values on the shared sample (as per test_predict):
    # class 0 should dominate.
    probs = estimator.predict_proba(bc_sample)
    assert probs[0, 0] >= 0.5
    assert probs[0, 1] < 0.5
    # Fix: the row sum is a float accumulation — exact equality with 1.0
    # via assert_equal is brittle; compare approximately instead.
    assert_almost_equal(np.sum(probs[0]), 1.0)
def test_numerical_split():
    """The splitter's chosen numeric pivot must partition the samples into
    the same below/above counts that the returned split bags contain."""
    bunch = load_breast_cancer()

    estimator = Id3Estimator()
    estimator.fit(bunch.data, bunch.target)
    splitter = estimator.builder_.splitter

    # Ask the splitter for the best split over all samples and features.
    record = splitter.calc(np.array(list(range(bunch.target.shape[0]))),
                           np.array(list(range(bunch.data.shape[1]))))

    # Expected partition sizes from the pivot on the chosen feature.
    n_below = np.sum(bunch.data[:, record.feature_idx] <= record.pivot)
    n_above = bunch.data[:, record.feature_idx].shape[0] - n_below

    split = splitter.split(np.array(list(range(bunch.target.shape[0]))),
                           record)
    assert_almost_equal(len(split[0].bag), n_below)
    assert_almost_equal(len(split[1].bag), n_above)
Exemple #14
0
def BuildTree():
    """Fit an ID3 food recommender on recommend.csv and return it.

    The 'wybor' column is the label; the tree is exported to lol.dot.
    """
    feature_names = ["danie", "na ciepło", "z mięsem", "na słodko", "kwaśne", "alkoholowe", "czekoladowe", "wybor"]

    dataset = ps.read_csv("recommend.csv", header=None, names=feature_names, sep=";")

    # Features vs. the label column.
    X = dataset.drop('wybor', axis=1)
    Y = dataset['wybor']

    model = Id3Estimator()

    # Hold out 20% of the rows (the held-out part is unused here).
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)
    model.fit(X_train, Y_train)

    export_graphviz(model.tree_, "lol.dot", feature_names)
    return model
Exemple #15
0
def BuildTree():
    """Train an ID3 'bill' classifier, then run a CNN over random images.

    Fits an Id3Estimator on bill.csv (last column 'bill' is the target)
    and exports it to test.dot; then samples random images from the test
    directory until the Keras model third_try.h5 classifies one as
    positive.  Returns [prediction, clf].
    """
    # Column names for the CSV (Polish domain labels, kept verbatim).
    # Fix: removed the unused Yfeature_names list the original defined.
    feature_names = [
        "pair", "empty_plate", "talking", "mood", "asked", "hurry", "bill"
    ]

    dataset = ps.read_csv("bill.csv",
                          header=None,
                          names=feature_names,
                          sep=";")

    X = dataset.drop('bill', axis=1)
    Y = dataset['bill']

    clf = Id3Estimator()

    # Train/test split; the held-out part is currently unused.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)
    clf.fit(X_train, Y_train)
    export_graphviz(clf.tree_, "test.dot", feature_names)
    model = load_model('third_try.h5')

    # Sample random test images until the CNN reports class 1.
    # NOTE(review): directory listing uses a hard-coded absolute Windows
    # path while loading uses the relative "test/" directory — confirm
    # both point at the same folder, or the loop may raise/never end.
    while True:
        path = random.choice(
            os.listdir("C://Users/Kinia/Desktop/sztuczna2/SI-master/test"))
        print(path)

        img_pred = image.load_img("test/" + path, target_size=(100, 100))
        img_pred = image.img_to_array(img_pred)
        img_pred = np.expand_dims(img_pred, axis=0)

        rslt = model.predict(img_pred)
        print(rslt)
        if rslt[0][0] == 1:
            prediction = 1
            break
        else:
            prediction = 0

        print(prediction)
    return [prediction, clf]
Exemple #16
0
def measures_of_id3(subsets):
    """Fit an ID3 tree and measure wall time plus test accuracy.

    `subsets` is indexed via SplitPartNames to obtain the train and test
    partitions.  Returns (learning_time, prediction_time, accuracy),
    rounded to 4, 4 and 2 decimals respectively.
    """
    estimator = Id3Estimator()

    # Time the fit on the training partition.
    t0 = perf_counter()
    estimator.fit(subsets[SplitPartNames['X_train']],
                  subsets[SplitPartNames['y_train']])
    fit_seconds = perf_counter() - t0

    # Time the prediction on the test partition.
    t0 = perf_counter()
    predicted = estimator.predict(subsets[SplitPartNames['X_test']])
    predict_seconds = perf_counter() - t0

    accuracy = metrics.accuracy_score(subsets[SplitPartNames['y_test']],
                                      predicted)
    return round(fit_seconds, 4), round(predict_seconds, 4), round(accuracy, 2)
Exemple #17
0
def main():
    """Fit an ID3 tree on a tiny game-outcome toy set and classify a slate."""
    feature_names = ["home/away", "top25", "media"]

    # (home/away, top25, media, result) per past game.
    games = [
        ('home', 'out', '1-nbc', 'win'),
        ('home', 'in', '1-nbc', 'lose'),
        ('away', 'out', '2-espn', 'win'),
        ('away', 'out', '3-fox', 'win'),
        ('home', 'out', '1-nbc', 'win'),
        ('away', 'out', '4-abc', 'win'),
    ]
    X = np.array([game[:3] for game in games])
    y = np.array([game[3] for game in games])

    clf = Id3Estimator()
    clf.fit(X, y, check_input=True)

    print(export_text(clf.tree_, feature_names))

    # Unseen combinations to classify with the fitted tree.
    testing = [["home", "in", "1-nbc"], ["home", "out", "1-nbc"],
               ["home", "out", "1-nbc"], ["home", "in", "4-abc"],
               ["home", "out", "1-nbc"], ["home", "in", "4-abc"]]
    print("\n\nTesting:")
    print(clf.predict(testing))
Exemple #18
0
def main():
    """Fit an ID3 tree on the weather toy dataset and query one case."""
    feature_names = ["outlook", "temperature", "humidity", "windy"]

    # ((outlook, temperature, humidity, windy), play?) per observation.
    samples = [
        (['sunny', 'hot', 'high', 'false'], "No"),
        (['sunny', 'hot', 'high', 'true'], "No"),
        (['overcast', 'hot', 'high', 'false'], "Yes"),
        (['rainy', 'mild', 'high', 'false'], "Yes"),
        (['rainy', 'cool', 'normal', 'false'], "Yes"),
        (['rainy', 'cool', 'normal', 'true'], "No"),
        (['overcast', 'cool', 'normal', 'true'], "Yes"),
        (['sunny', 'mild', 'high', 'false'], "No"),
        (['sunny', 'cool', 'normal', 'false'], "Yes"),
        (['rainy', 'mild', 'normal', 'false'], "Yes"),
        (['sunny', 'mild', 'normal', 'true'], "Yes"),
        (['overcast', 'mild', 'high', 'true'], "Yes"),
        (['overcast', 'hot', 'normal', 'false'], "Yes"),
        (['rainy', 'mild', 'high', 'true'], "No"),
    ]
    X = np.array([row for row, _ in samples])
    y = np.array([label for _, label in samples])

    clf = Id3Estimator()
    clf.fit(X, y, check_input=True)
    print("Training:")
    print(export_text(clf.tree_, feature_names))
    print("Testing: rainy, hot, high, false")
    # A plain list input triggers a DeprecationWarning upstream; ignore it.
    print(clf.predict([["rainy", "hot", "high", "false"]]))
Exemple #19
0
def id3(size):
    """Train an ID3 tree on the Pokemon data and report error percentages.

    `size` is forwarded to setData as the split parameter.  Returns
    (test_error_pct, train_error_pct).
    """
    x_train, x_test, y_train, y_test = setData("Pokemon.csv", 718, size)

    est = Id3Estimator(gain_ratio=True)
    est.fit(x_train, y_train)
    y_test = y_test.to_numpy()
    y_train = y_train.to_numpy()

    # Fix: vectorized mismatch counts replace the two hand-rolled
    # index loops with manual error counters.
    test_errors = int(np.count_nonzero(est.predict(x_test) != y_test))
    train_errors = int(np.count_nonzero(est.predict(x_train) != y_train))

    return test_errors / len(y_test) * 100, train_errors / len(y_train) * 100
# Feature columns, declared once and reused both for training and for the
# exported tree's labels (the original repeated this 10-item list verbatim).
COLUNAS_ATRIBUTOS = [
    'pontuacao_final', 'sobrevivencia', 'bonus_ultima_sobrevivencia',
    'dano_disparo', 'bonus_disparo_morte', 'colisao_dano',
    'bonus_colisao_morte', '1lugar', '2lugar', '3lugar'
]

dados = pd.read_csv('amostras.csv', sep=',', encoding='utf8')

dadosX = dados[COLUNAS_ATRIBUTOS].values
dadosY = dados['classificacao']

# 70/30 split without shuffling (keeps the file's original row order).
treinoX, testeX, treinoY, testeY = train_test_split(dadosX,
                                                    dadosY,
                                                    test_size=0.3,
                                                    shuffle=False)

# Shallow tree (max depth 3) to keep the exported graph readable.
modeloArvodeID3 = Id3Estimator(max_depth=3)

modeloArvodeID3.fit(treinoX, treinoY)

export_graphviz(modeloArvodeID3.tree_, 'arvoreExecutada.dot',
                COLUNAS_ATRIBUTOS)

classificacoes = modeloArvodeID3.predict(testeX)

print('Resultados Árvore de Decisão ID3 (Iterative Dichotomiser 3):')
print('Acurácia: %.4f' % accuracy_score(classificacoes, testeY))
print('Precisão: %.4f' %
      precision_score(classificacoes, testeY, average='macro'))
Exemple #21
0
# Derived artifact paths for this graph (graph_dir is defined above).
datatype_file = graph_dir + "-datatype"
x_file = graph_dir + "-x.out"
y_file = graph_dir + "-y.out"
dot_file = graph_dir + ".dot"

# One feature name per line in the feature file.
with open(feature_file) as f:
    feature_names = f.readlines()
feature_names = [x.strip() for x in feature_names]

# '~'-delimited feature matrix and int32 label vector.
X = np.array(genfromtxt(x_file, dtype=None, delimiter="~").tolist())
y = genfromtxt(y_file, dtype='i4')

# A single feature comes back 1-D; reshape into one column for fit().
if len(feature_names) == 1:
    X = X.reshape(-1, 1)

clf = Id3Estimator()
clf.fit(X, y, check_input=True)
end = datetime.now()
delta = end - start

try:
    export_graphviz(clf.tree_, dot_file, feature_names)
# Fix: a bare `except:` also swallows KeyboardInterrupt and SystemExit;
# narrow it to Exception while keeping the best-effort behaviour.
except Exception:
    print("Unexpected error:", sys.exc_info()[0])

result = convert_dot_to_predicate(dot_file, graph_dir)

# Prepend the graph's base name and append the elapsed seconds.
path, filename = os.path.split(graph_dir)
result.insert(0, filename)
result.append(delta.seconds)
pprint(result)
https://pypi.python.org/pypi/decision-tree-id3/0.1.2
"""
#from sklearn import tree
from id3 import Id3Estimator
from id3 import export_graphviz
import numpy as np
import graphviz

#           | 0     | 1         | 2
#Outlook    | Sunny | Overcast  | Rain
#Temperature| Hot   | Mild      | Cool
#Humidity   | High  | Normal    | -
#Wind       | Weak  | Strong    | -

# Feature names matching the integer-encoded columns of X below.
x_labels = ["Outlook", "Temperature", "Humidity", "Wind"]

# 14 observations, integer-encoded per the table above.
X = np.array([[0, 0, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0], [2, 1, 0, 0],
              [2, 2, 1, 0], [2, 2, 1, 1], [1, 2, 1, 1], [0, 1, 0, 0],
              [0, 2, 1, 0], [2, 1, 1, 0], [0, 1, 1, 1], [1, 1, 0, 1],
              [1, 0, 1, 0], [2, 1, 0, 1]])

# Binary labels — presumably 1 = play, 0 = don't play; confirm upstream.
Y = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0])

#clf = tree.DecisionTreeClassifier()
# Require at least 3 samples before a node may be split further.
clf = Id3Estimator(min_samples_split=3)
clf.fit(X, Y)
dot_data = export_graphviz(clf.tree_, "decisiontree.dot", x_labels)
#predictions = clf.predict(X)
#for i in range(len(X)):
#    print X[i],Y[i],"->",predictions[i]
Exemple #23
0
        'research background', 'final year projct type', 'enthusiasm',
        'teamwork ability', 'communication & network skill', 'cgpa'
    ]

    # Select the training matrix and the target column.
    x = data[fcols]
    y = data['current job field']

    # Initialize the candidate classifiers to compare.
    lr = LogisticRegression()
    knn = KNeighborsClassifier(n_neighbors=5)
    gnb = GaussianNB()
    mn = MultinomialNB()
    ber = BernoulliNB()
    tree = DecisionTreeClassifier()  # NOTE(review): shadows any `tree` module imported above
    id3 = Id3Estimator()
    rnd = RandomForestClassifier(n_estimators=300)
    svc = SVC()
    mlp = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 75),
                        random_state=1)
    """a = tree.fit(x,y)
    print(a.feature_importances_)"""
    # Cross-validate each model via k_fold and collect the scores.
    val = list()
    val.append(k_fold(lr, x, y))
    val.append(k_fold(knn, x, y))
    val.append(k_fold(gnb, x, y))
    val.append(k_fold(mn, x, y))
    val.append(k_fold(ber, x, y))
    val.append(k_fold(tree, x, y))
from scalingdata import *
import time

# Split data: first 60 rows train, next 20 test; column 5 holds the label.
A = np.genfromtxt('data.txt', delimiter=',')
# split data
Xtrain = A[:60, 0:5]
Ytrain = A[:60, 5]

Xtest = A[60:80, 0:5]
Ytest = A[60:80, 5]

print("---------result id3 build decision tree non scaling data--------")

t = time.time()

clf = Id3Estimator()

clf.fit(Xtrain, Ytrain)

h = clf.predict(Xtest)
# NOTE(review): `ouput` comes from scalingdata's star import — looks like
# a typo for `output`; confirm against that module.
ouput(h, Ytest)
ed = time.time()
print("Time excution non scaling data", ed - t)

print("---------result scaling data with standardization------------")
t = time.time()

# Standardize using the training set's statistics (no leakage from test).
Xmean = np.mean(Xtrain, axis=0)
st = np.std(Xtrain, axis=0)
X_test_stadar = standardization(Xtest, Xmean, st)
X_train_stadar = standardization(Xtrain, Xmean, st)
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
from id3 import export_graphviz
import numpy as np

# Fetch the KDD'99 "SA" anomaly-detection subset (downloads on first use).
bunch = fetch_kddcup99(subset="SA")

data = bunch.data
# Drop columns 1-3 — presumably the symbolic/non-numeric features; confirm
# against the KDD'99 schema.
data = np.delete(data, np.s_[1:4], axis=1)
target = bunch.target
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    target,
                                                    test_size=.2,
                                                    random_state=17)

estimator = Id3Estimator()
print("->Fitting ID3 classifier")
estimator.fit(X_train, y_train)

print("->Writing dot file")
export_graphviz(estimator.tree_, 'tree.dot')

print("->Calculating predictions")
pred = estimator.predict(X_test)

# Fix: a generator sum over zip replaces the hand-rolled
# enumerate-and-increment counter loop.
well_detected = sum(1 for guess, truth in zip(pred, y_test)
                    if guess == truth)

percentage = well_detected / len(pred) * 100
Exemple #26
0
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from id3 import Id3Estimator
from extractData import X, y

# Fixed random_state so the train/test partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=6)

# Without pruning: 4-fold CV accuracy of a default ID3 tree on the train set.
no_prone_accuracy = cross_val_score(Id3Estimator(), X_train, y_train, cv=4, scoring='accuracy').mean()
print("Accuracy without pruning on cross validation is:", no_prone_accuracy)

# With pruning: early stopping via min_samples_split=20.
estimator = Id3Estimator(min_samples_split=20)
prone_accuracy = cross_val_score(estimator, X_train, y_train, cv=4, scoring='accuracy').mean()
print("Accuracy with pruning on cross validation is:", prone_accuracy)

# estimator.fit(X_train, y_train)
# print("Accuracy with pruning on test is:", accuracy_score(y_test, estimator.predict(X_test)))
Exemple #27
0
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
import numpy as np
from sklearn import svm
from sklearn.datasets import load_svmlight_file as load_svm
from sklearn.model_selection import KFold
from drop_highlycorelated import clf, xtrain, ytrain, xtest, ytest, X_important_train, X_important_test
from sklearn.metrics import accuracy_score
from id3 import Id3Estimator

# Fit ID3 on the importance-filtered feature subset.
clf_imp = Id3Estimator()
clf_imp.fit(X_important_train, ytrain)

# k-fold cross-validation over several fold counts.
from sklearn.model_selection import cross_val_score, KFold
n_folds = []
n_folds.append(('K2', 2))
n_folds.append(('K4', 4))
n_folds.append(('K5', 5))
n_folds.append(('K10', 10))

seed = 7

for name, n_split in n_folds:
    results = []
    names = []
    print(name)
    # NOTE(review): recent sklearn raises ValueError when random_state is
    # set while shuffle stays False — confirm the targeted sklearn version.
    kfold = KFold(n_splits=n_split, random_state=seed)
    cv_results = cross_val_score(clf_imp,
Exemple #28
0
    X, Y, test_size=test_proportion)
# NOTE(review): every split below draws an independent random partition
# and overwrites X_test/Y_test, so only the last test set survives.
X_trainf5, X_test, Y_trainf5, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
#
X_trainf6, X_test, Y_trainf6, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
X_trainf7, X_test, Y_trainf7, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
X_trainf8, X_test, Y_trainf8, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
X_trainf9, X_test, Y_trainf9, Y_test = train_test_split(
    X, Y, test_size=test_proportion)
X_trainf10, X_test, Y_trainf10, Y_test = train_test_split(
    X, Y, test_size=test_proportion)

# One fresh ID3 estimator per training subset.
estimator1 = Id3Estimator()
estimator2 = Id3Estimator()
estimator3 = Id3Estimator()
estimator4 = Id3Estimator()
estimator5 = Id3Estimator()
#
estimator6 = Id3Estimator()
estimator7 = Id3Estimator()
estimator8 = Id3Estimator()
estimator9 = Id3Estimator()
estimator10 = Id3Estimator()

# Fit each estimator on its own random training subset.
estimator1.fit(X_trainf1, Y_trainf1)
estimator2.fit(X_trainf2, Y_trainf2)
estimator3.fit(X_trainf3, Y_trainf3)
estimator4.fit(X_trainf4, Y_trainf4)
Exemple #29
0
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from id3 import Id3Estimator

# Pruning threshold for the "with cut" estimator below.
min_samples = 20

# Load the named CSV; dtype=None infers per-column types.
# NOTE(review): without encoding=, string columns come back as bytes on
# Python 3 — confirm the label column is handled as intended.
data = np.genfromtxt('flare.csv', delimiter=",", dtype=None, names=True)
data_classification = data['classification']
# Every column except the trailing classification label is a feature.
# Fix: a list comprehension replaces the manual append loop.
data_features = [list(row)[:-1] for row in data]

# split data to 75% train and 25% test
features_train, features_test, classification_train, classification_test = \
    train_test_split(data_features, data_classification, test_size=0.25, random_state=4)

# train without cut
estimator = Id3Estimator()
estimator.fit(features_train, classification_train)
classification_predict = estimator.predict(features_test)
print(accuracy_score(classification_test, classification_predict))

# train with cut (early stopping via min_samples_split)
estimator_cut = Id3Estimator(min_samples_split=min_samples)
estimator_cut.fit(features_train, classification_train)
classification_predict = estimator_cut.predict(features_test)
print(accuracy_score(classification_test, classification_predict))

Exemple #30
0
           "Tree_with": [], "Tree_without": []}
# 100 Monte-Carlo repetitions, each with a fresh random 75/25 split.
for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # Question 7
    # kNN without feature selection.
    es = KNeighborsClassifier()
    es.fit(X_train, y_train)
    results["KNN_without"].append(accuracy_score(y_test, es.predict(X_test)))

    # kNN after sequential forward selection of 8 feature indexes.
    number_of_indexes = sfs.sfs(X_train, y_train, 8, KNeighborsClassifier(), scoreSFS)
    es.fit(sfs.subset_of_x(X_train, number_of_indexes), y_train)
    results["KNN_with"].append(accuracy_score(y_test, es.predict(sfs.subset_of_x(X_test, number_of_indexes))))

    # Question 8
    # ID3 without pruning.
    es = Id3Estimator()
    es.fit(X_train, y_train)
    results["Tree_without"].append(accuracy_score(y_test, es.predict(X_test)))

    # ID3 with early-stopping pruning (min_samples_split=20).
    es = Id3Estimator(min_samples_split=20)
    es.fit(X_train, y_train)
    results["Tree_with"].append(accuracy_score(y_test, es.predict(X_test)))

# Average accuracy of each configuration over the 100 repetitions.
print(mean(results["KNN_without"]))
print(mean(results["KNN_with"]))
print(mean(results["Tree_without"]))
print(mean(results["Tree_with"]))