Example #1
def drawDecisionTree(data, name):
    data['change next day class'] = data['change next day'].apply(classify)
    X_train, X_test, y_train, y_test = train_test_split(
        data[[
            'rate of increase', 'increase length', 'rate of decrease',
            'decrease length'
        ]],
        data['change next day class'],
        test_size=0.2,
        random_state=42)
    tree = DecisionTreeClassifier(max_depth=6, random_state=0)
    tree.fit(X_train, y_train)
    print('Train score:{:.3f}'.format(tree.score(X_train, y_train)))
    print('Test score:{:.3f}'.format(tree.score(X_test, y_test)))
    # generate the visualization graph
    export_graphviz(tree,
                    out_file="tree.dot",
                    feature_names=[
                        'rate of increase', 'increase length',
                        'rate of decrease', 'decrease length'
                    ],
                    impurity=False,
                    filled=True)
    # display the visualization graph
    graph = pydotplus.graph_from_dot_file('tree.dot')
    graph.write_pdf(name + '.pdf')
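The classify helper applied to 'change next day' is not defined in this example; a minimal sketch, assuming it simply buckets the next-day change by sign (the labels are hypothetical):

def classify(change):
    # hypothetical helper: label an up move 1, a down move -1, flat 0
    if change > 0:
        return 1
    if change < 0:
        return -1
    return 0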
Example #2
def get_best_tree(model, X, keep_scores=False):
    """
    Given a model of ensembled trees with an `estimators_` attribute,
    finds the tree that most closely resembles

    Parameters
    ----------
    model
    X

    Returns
    -------

    """
    overall_prediction = model.predict(X)

    predictions = dict()
    scores = dict()

    best_score, best_tree_number = float('-inf'), None

    for tree_num, tree in enumerate(model.estimators_):
        predictions[tree_num] = tree.predict(X)
        new_score = tree.score(X, overall_prediction)
        scores[tree_num] = new_score

        if new_score > best_score:
            best_score = new_score
            best_tree_number = tree_num

    nearest_tree = model.estimators_[best_tree_number]

    if keep_scores:
        return best_tree_number, nearest_tree, scores
    return best_tree_number, nearest_tree
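A minimal usage sketch for get_best_tree, assuming a fitted RandomForestRegressor (the synthetic data is illustrative only):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)
model = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)
best_num, best_tree = get_best_tree(model, X)
# R^2 of the chosen tree against the ensemble's own predictions
print(best_num, best_tree.score(X, model.predict(X)))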
Example #3
def evaluate_regressor(tree, X, Y):
    """
    Evaluates a tree with the data values passed, returning the R2 and MSE
    """
    r2 = tree.score(X, Y)
    e = tree.predict(X)
    mse = np.average(np.power((e - Y.values), 2))
    return r2, mse
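A short usage sketch for evaluate_regressor; it assumes numpy is imported as np (used inside the function) and that Y is a pandas object, since the function reads Y.values:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

X = pd.DataFrame({'x1': [1, 2, 3, 4], 'x2': [0, 1, 0, 1]})
Y = pd.Series([1.0, 2.0, 2.5, 4.0])
reg = DecisionTreeRegressor(max_depth=2, random_state=0).fit(X, Y)
r2, mse = evaluate_regressor(reg, X, Y)
print(r2, mse)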
Example #4
    def process_chunk(self, chunk):
        def update_min_max_count_dict(key, dict1, dict2):
            # merge the running min/max/count for one feature across chunks
            return {
                'max': max(dict1[key]['max'], dict2[key]['max']),
                'min': min(dict1[key]['min'], dict2[key]['min']),
                'count': dict1[key]['count'] + dict2[key]['count'],
            }

        new_min_max_count = get_feature_min_max_count(chunk,
                                                      self.feature_names)
        if self.feature_min_max_count is None:
            self.feature_min_max_count = new_min_max_count
        else:
            self.feature_min_max_count = {
                key: update_min_max_count_dict(key, self.feature_min_max_count,
                                               new_min_max_count)
                for key in self.feature_names
            }
        chunk = self.__preprocess_data(chunk, self.feature_names)
        tree = sklearn.tree.DecisionTreeRegressor(criterion='mse',  # renamed to 'squared_error' in scikit-learn >= 1.0
                                                  random_state=42,
                                                  **self.model_params)
        train, test = sklearn.model_selection.train_test_split(chunk,
                                                               random_state=42)
        plot, _ = sklearn.model_selection.train_test_split(chunk,
                                                           random_state=42,
                                                           train_size=min(
                                                               100,
                                                               len(chunk) - 1),
                                                           test_size=0)
        self.plot_data_frames.append(plot)
        tree.fit(train[self.model_feature_names], train[self.output_name])
        self.test_scores.append(
            tree.score(test[self.model_feature_names], test[self.output_name]))
        self.train_scores.append(
            tree.score(train[self.model_feature_names],
                       train[self.output_name]))
        self.trees.append(tree)
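The get_feature_min_max_count helper that process_chunk relies on is not shown; a hypothetical reconstruction, inferred from how its result is merged above:

def get_feature_min_max_count(chunk, feature_names):
    # hypothetical helper: per-feature min/max/count over one DataFrame chunk
    return {
        name: {
            'min': chunk[name].min(),
            'max': chunk[name].max(),
            'count': int(chunk[name].count()),
        }
        for name in feature_names
    }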
Example #5
def cluster_then_forest(xs, ys, in_sample_size):
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)
    clf = cluster.KMeans(n_clusters=4)
    clf.fit(in_sample)
    oos_clusterid = clf.predict(out_sample)
    ins_clusterid = clf.predict(in_sample)

    for cluster_id in numpy.unique(oos_clusterid):
        print("Now working on Cluster " + str(cluster_id))
        oos_ind = oos_clusterid == cluster_id
        ins_ind = ins_clusterid == cluster_id

        tree = ensemble.RandomForestRegressor(50)

        tree.fit(in_sample[ins_ind], ys[isi][ins_ind])
        print("Score for in-sample")
        print(tree.score(in_sample[ins_ind], ys[isi][ins_ind]))

        print("Score for out-of-sample")
        print(tree.score(out_sample[oos_ind], ys[osi][oos_ind]))

    return None
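The create_in_out_samples helper is not defined in this example; a hypothetical sketch, assuming it draws a random in-sample of the requested size and returns the indices alongside the rows:

import numpy

def create_in_out_samples(xs, in_sample_size):
    # hypothetical helper: random in-sample / out-of-sample split by row index
    indices = numpy.random.permutation(len(xs))
    isi, osi = indices[:in_sample_size], indices[in_sample_size:]
    return isi, xs[isi], osi, xs[osi]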
Example #7
def cross_val(data, classifiers):
    averages = []
    for i in range(5):
        bayes_estimates = []
        tree_estimates = []
        data = np.array(data)
        classifiers = np.array(classifiers)
        # sklearn.cross_validation was removed; KFold now lives in sklearn.model_selection
        folds = model_selection.KFold(n_splits=10, shuffle=True).split(classifiers)
        for train, test in folds:
            bayes = naive_bayes(data[train], classifiers[train])
            bayes_estimates.append(bayes.score(data[test], classifiers[test]))
            tree = decision_tree(data[train], classifiers[train])
            tree_estimates.append(tree.score(data[test], classifiers[test]))

        averages.append([np.mean(bayes_estimates), np.mean(tree_estimates)])

    visualize(averages)
Example #8
def classify(keyword2int):
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.misc
    from PIL import Image

    import pprint
    pp = pprint.PrettyPrinter(indent=4)

    # In[18]:

    import sklearn
    from sklearn import datasets
    import skimage.io

    # ## Datasets preparation

    # In[19]:

    def png2vec(filename):
        img = Image.open(filename).convert('L')
        arr = np.array(img)
        return arr

    # In[20]:

    filesetNames = ["%01d" % x for x in range(0, 2)]

    # In[21]:

    import os
    images = []
    tgt = []
    count = 0
    img_test = Image.open("./img/0/1.jpg")

    for curFileset in filesetNames:
        curPath = "./img/" + curFileset + "/"
        for file in os.listdir(curPath):
            curImageVector = png2vec(curPath + file)
            images.append(curImageVector)
            tgt.append(curFileset)
            count += 1
    #end
    print(len(images))

    # In[22]:

    # In[24]:

    from sklearn.model_selection import train_test_split
    from sklearn import model_selection, metrics
    images_np = np.array(images)
    img = images_np.reshape(images_np.shape[0], -1)

    xM, xT, yM, yT = train_test_split(img, tgt, test_size=0.01)
    print(yT)
    xT = list(xT)

    #map(list,xT)
    xT.extend(xM[0:20])
    yT.extend(yM[0:20])
    print(yT)
    xT = np.array(xT)

    print(yT)

    # # Naive Bayes

    # In[25]:
    # parameter passed in from the search step: keyword2int
    #keyword2int = {'dog':0,'cat':1}

    from sklearn import metrics
    from sklearn import naive_bayes
    cls = naive_bayes.GaussianNB()
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(type(xT[0:1]))
    print(len(xT), len(xT[0]), 'result', type(xT))
    print(res)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "GaussianNB")
    #plt.title('the image is ' + str(list(keyword2int.keys())[0]) + ' prediction is ' + str(list(keyword2int.keys())[int(res_test[0])]))
    #plt.imshow(img_test)
    #plt.show()
    # In[26]:

    cls = naive_bayes.BernoulliNB(binarize=0.9)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "BernoulliNB")

    # In[27]:

    cls = naive_bayes.MultinomialNB(alpha=0.1,
                                    fit_prior=True,
                                    class_prior=None)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "naive_bayes.MultinomialNB")

    # # KNN

    # In[28]:

    from sklearn.neighbors import KNeighborsClassifier
    cls = KNeighborsClassifier(n_neighbors=1)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          ' knn')

    # # Decision Tree

    # In[33]:

    from sklearn import tree
    cls = tree.DecisionTreeClassifier()
    cls = cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          "Decision Tree")
    import random
    index = random.randint(0, 10)
    x_test = xT[index]
    y_tag = yT[index]
    DT_res = cls.predict([x_test])

    x_test_image_list = list(x_test)
    x_temp = []
    x_image = []
    for j in range(100):
        for i in range(100):
            x_temp.append(x_test[i + j * 100])

        x_image.append(x_temp)
        x_temp = []

    np.array(x_image)
    plt.title('the image is ' + str(list(keyword2int.keys())[int(y_tag)]) +
              ' result is ' + str(list(keyword2int.keys())[int(DT_res[0])]))
    print(DT_res)
    print(
        list(keyword2int.keys())[int(y_tag)],
        list(keyword2int.keys())[int(DT_res[0])])
    plt.imshow(x_image)
    plt.show()

    import sys
    sys.exit()

    # In[40]:

    from sklearn.tree import DecisionTreeClassifier

    tree = DecisionTreeClassifier(random_state=0, min_samples_split=2)
    tree.fit(xT, yT)

    print(tree.score(xT, yT), tree.score(xM, yM), "DecisionTreeClassifier")

    # In[37]:

    DecisionTreeClassifier()

    # # random forest

    # In[56]:

    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(n_estimators=150, random_state=0)
    forest.fit(xT, yT)

    forest.score(xT, yT), forest.score(xM, yM)

    # In[44]:

    forest

    # In[74]:

    scoreListUniform = []
    nListUniform = []
    stepCount = 10
    stepDist = 200
    start = 200
    end = 2001
    for curCount in range(start, end, stepDist):
        forest = RandomForestClassifier(n_estimators=curCount, random_state=0)
        forest.fit(xT, yT)
        nListUniform.append(curCount)
        scoreListUniform.append(forest.score(xM, yM))
    scoreListUniform

    # ## SVM

    # In[76]:

    from sklearn import svm
    cls = svm.SVC()
    cls = cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res), metrics.accuracy_score(yT, res),
          " svm")

    from sklearn import svm
    import matplotlib.pyplot as plt
    import numpy
    n_trials = 3
    train_percentages = range(5, 95, 5)
    test_accuracies = numpy.zeros(len(train_percentages))

    for (i, tp) in enumerate(train_percentages):
        test_accuracy = numpy.zeros(n_trials)
        for n in range(n_trials):
            # assumes sklearn's digits dataset, e.g. digits = datasets.load_digits()
            xM, xT, yM, yT = train_test_split(digits.data,
                                              digits.target,
                                              train_size=tp / 100.0)
            cls = svm.LinearSVC().fit(xM, yM)
            res = cls.predict(xT)
            test_accuracy[n] = metrics.accuracy_score(yT, res)
        test_accuracies[i] = test_accuracy.mean()
        print(i, tp, test_accuracies[i])
    print(train_percentages)
    fig = plt.figure()
    plt.plot(train_percentages, test_accuracies)
    plt.xlabel('Percentage of Data Used for Training')
    plt.ylabel('Accuracy on Test Set')
    plt.show()
Example #9
# fill the null values found with the column mean
new_data_train['Age'].fillna(new_data_train['Age'].mean(), inplace=True)
new_data_test['Age'].fillna(new_data_test['Age'].mean(), inplace=True)

# check for null values, in descending order, showing the top 10
new_data_test.isnull().sum().sort_values(ascending=False).head(10)

# fill the null value in the Fare column with the mean
new_data_test['Fare'].fillna(new_data_test['Fare'].mean(), inplace=True)

# separate the features to build the model
X = new_data_train.drop("Survived", axis=1)  # everything except the target column
y = new_data_train["Survived"]  # only the target column

from sklearn.tree import DecisionTreeClassifier
# limit the depth of the decision tree
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X, y)

# evaluate the model
tree.score(X, y)

# build the prediction file for Kaggle
previsao = pd.DataFrame()
previsao["PassengerId"] = new_data_test["PassengerId"]
previsao["Survived"] = tree.predict(new_data_test)

# export to CSV
previsao.to_csv('previsao.csv', index=False)
Example #10
    regiao = 'Grande Florianópolis'
    localizacao = 'URBANA'
    serie = '8º ano'

    regiao_enc = encoders['regiao'].transform([regiao])[0]
    localizacao_enc = encoders['localizacao'].transform([localizacao])[0]
    serie_enc = encoders['serie'].transform([serie])[0]

    prediction_enc = tree.predict([[regiao_enc, localizacao_enc, serie_enc]])
    prediction = encoders['status'].inverse_transform([int(prediction_enc[0])])[0]

    proba = tree.predict_proba([[regiao_enc, localizacao_enc, serie_enc]])[0]

    # Output
    print('\nAcurácia do classificador: ' + str(tree.score(data, target)))

    print('\nA predição retornou: ' + prediction)
    if prediction == 'Suficiente':
        print('O aluno atende aos pré requisitos do Bolsa Família.')
    else:
        print(
            'O aluno não atende aos pré requisitos do Bolsa Família e deve ser desligado do programa.'
        )

    print('\nPeso de cada variável:')
    for i in range(len(default_csv[0]) - 1):
        print('\t' + default_csv[0][i] + ': ' +
              str(tree.feature_importances_[i]))

    print('\nGrau de certeza de cada possível resposta:')
Example #11
X = data.values[:, :-1]
y = data.values[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

svm = svm.SVC(probability=True)
logreg = LogisticRegression()
tree = tree.DecisionTreeClassifier()

svm.fit(X_train, y_train)
logreg.fit(X_train, y_train)
tree.fit(X_train, y_train)

print("SVM Accuracy: %.2f%s" %
      (svm.score(X_test, y_test) * 100, '%'))
print("LogReg Accuracy: %.2f%s" %
      (logreg.score(X_test, y_test) * 100, '%'))
print("Tree Accuracy: %.2f%s" %
      (tree.score(X_test, y_test) * 100, '%'))

w = [1, 1, 1]
ensemble = VotingClassifier(
    estimators=[('svm', svm),
                ('logreg', logreg),
                ('tree', tree)],
    voting='hard', weights=w)
ensemble.fit(X_train, y_train)

print("Ensemble Accuracy: %.2f%s" %
      (ensemble.score(X_test, y_test) * 100, '%'))
Example #12
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()

# accuracy
print(kNN.score(x_test, y_test))
'''Build the decision tree model'''
tree = tree.DecisionTreeClassifier()
tree.fit(x_train, y_train)

# plot
draw(tree)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()

# accuracy
print(tree.score(x_test, y_test))
'''Next, use bagging ensemble learning with kNN'''
# draw 100 resamples, i.e. train 100 kNN classifiers
bagging_kNN = BaggingClassifier(kNN, n_estimators=100)
bagging_kNN.fit(x_train, y_train)
# plot
draw(bagging_kNN)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()

# accuracy
print(bagging_kNN.score(x_test, y_test))
'''Bagging ensemble with the decision tree added'''
bagging_tree = BaggingClassifier(tree, n_estimators=100)
bagging_tree.fit(x_train, y_train)
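Example #13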
targets_test = test.loc[:, 'over_50k']

#the decision tree tends to overfit the training data at the expense of lower accuracy in testing
#this can be mitigated by specifying a maximum tree depth

#using the validation data to pick a tree depth
validation_accs = []  #list of all validation accuracies
for i in range(1, 24):
    #decision tree model
    tree = DecisionTreeClassifier(max_depth=i,
                                  random_state=0)  #specify model type
    tree.fit(data_train, targets_train)  #fit the model

    #add accuracy for this test to testing_accs
    validation_accs.append(100 *
                           tree.score(data_validation, targets_validation))

depth = validation_accs.index(
    max(validation_accs)
) + 1  #the tree depth that produced the greatest test accuracy - in general this tends to be 7 or 8

print('\nSelecting a maximum tree depth of ' + str(depth))

#decision tree model
model = DecisionTreeClassifier(max_depth=depth,
                               random_state=0)  #specify model type
model.fit(data_train, targets_train)  #fit the model

print('Testing accuracy: ' + str('%.2f' %
                                 (100 * model.score(data_test, targets_test))) +
      '%')  #tends to be roughly 82%
Example #14
print(All_X.shape, train_X.shape, val_X.shape, train_y.shape, val_y.shape,
      test_X.shape)

# In[ ]:

#Feature importance

tree = DecisionTreeClassifier(random_state=99)
tree.fit(train_X, train_y)
imp = pd.DataFrame(tree.feature_importances_,
                   columns=['Importance'],
                   index=train_X.columns)
imp = imp.sort_values(['Importance'], ascending=True)
imp[:10].plot(kind='barh')
print(tree.score(train_X, train_y))

# In[ ]:

# Modeling
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
Example #15
info = [0.66, 0.8, 0.9, 0.95]
depths = len(info) * [None]
nleaves = len(info) * [None]
accuracies_train = len(info) * [None]
accuracies_test = len(info) * [None]

for index in range(0, len(info)):
    elem = info[index]
    pca = PCA(elem)
    reduced_d2_X_train = pca.fit_transform(d2_X_train)
    reduced_d2_X_test = pca.transform(d2_X_test)
    tree = DecisionTreeClassifier()
    model = tree.fit(reduced_d2_X_train, y_train)
    depths[index] = tree.get_depth()
    nleaves[index] = tree.get_n_leaves()
    accuracies_train[index] = tree.score(reduced_d2_X_train, y_train)
    accuracies_test[index] = tree.score(reduced_d2_X_test, y_test)

t = PrettyTable()
t.add_column("% information", info)
t.add_column("Tree depth", depths)
t.add_column("number of leaves", nleaves)
t.add_column("Accuracy on train set", accuracies_train)
t.add_column("Accuracy on test set", accuracies_test)
print(t)

res_model.append("Decision tree")
res_param.append("Unrestricted max depth \n66% of information")
res_train_acc.append(accuracies_train[0])
res_valid_acc.append("-")
res_test_acc.append(accuracies_test[0])
Example #16
X = data.values[:, :-1]
y = data.values[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

svm = svm.SVC(probability=True)
logreg = LogisticRegression()
tree = tree.DecisionTreeClassifier()

svm.fit(X_train, y_train)
logreg.fit(X_train, y_train)
tree.fit(X_train, y_train)

print("SVM Accuracy: %.2f%s" % (svm.score(X_test, y_test) * 100, '%'))
print("LogReg Accuracy: %.2f%s" % (logreg.score(X_test, y_test) * 100, '%'))
print("Tree Accuracy: %.2f%s" % (tree.score(X_test, y_test) * 100, '%'))

w = [
    svm.score(X_test, y_test),
    logreg.score(X_test, y_test),
    tree.score(X_test, y_test)
]

ensemble = VotingClassifier(estimators=[('svm', svm), ('logreg', logreg),
                                        ('tree', tree)],
                            voting='soft',
                            weights=w)
ensemble.fit(X_train, y_train)

print("Ensemble Accuracy: %.2f%s" %
      (ensemble.score(X_test, y_test) * 100, '%'))
Example #17
def evaluate_classifier(tree, X, Y):
    """
    Evaluates a classifier on the data passed, returning its mean accuracy
    """
    accuracy = tree.score(X, Y)
    return accuracy
Example #18
def decision_tree():
    # Decision Tree (use a distinct name so the `tree` module is not shadowed)
    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    tree_pred = clf.predict(X_test)
    print("Tree test set score: {:.5f}".format(clf.score(X_test, y_test)))
from scipy import misc
# from sklearn.metrics import accuracy_score


def show_tree(tree, features, path):
    f = io.StringIO()
    export_graphviz(tree,
                    out_file=f,
                    class_names=['malignant', 'benign'],
                    feature_names=features,
                    impurity=False)
    pydotplus.graph_from_dot_data(f.getvalue()).write_png(path)
    img = misc.imread(path)  # scipy.misc.imread was removed in SciPy >= 1.2; imageio.imread is the usual replacement

    plt.rcParams['figure.figsize'] = (20, 20)
    plt.imshow(img)


cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    stratify=cancer.target,
                                                    random_state=42)
tree = DecisionTreeClassifier(max_depth=6,
                              min_samples_split=10,
                              random_state=0)
tree.fit(X_train, y_train)

show_tree(tree, cancer.feature_names, "BreastCancer.png")
accuracy = tree.score(X_test, y_test)
print('Accuracy :', accuracy)
Example #20
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('tree.png')
"""
#########################################################################

prediction = tree.predict(test_features)

print("The prediction accuracy is: ",
      tree.score(test_features, test_targets) * 100, "%")

# x axis values
x = ['sun', 'mon', 'fri', 'sat', 'tue', 'wed', 'thu']

# y axis values
y = [5, 6.7, 4, 6, 2, 4.9, 1.8]
# plotting strip plot with seaborn
ax = sns.stripplot(x=x, y=y)  # seaborn >= 0.12 requires keyword arguments here

# giving labels to x-axis and y-axis
ax.set(xlabel='x', ylabel='y')

# giving title to the plot
plt.title('My first graph')
Example #21
    4,  # number of epochs with no improvement in the loss that the monitor waits before acting.
    verbose=1,
)

# making use of the callbacks created above
history = model.fit(X_train,
                    y_train_cat,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[reduce_lr, early_stopping],
                    verbose=1,
                    validation_data=(X_valid, y_valid_cat))

print("1-NN: ", 1 - neigh.score(X_valid, y_valid))
print("SVM: ", 1 - svm.score(X_valid, y_valid))
print('Tree: ', 1 - tree.score(X_valid, y_valid))
print("MLP:", 1 - model.evaluate(X_valid, y_valid_cat, verbose=0)[1])

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
Example #22
print("Acuracia de teste NAIVE BAYES: %0.3f" % te1)
#precision NAIVE BAYES
y_pred1 = gnb.predict(X_test)
macro_precision1 = precision_score(Y_test, y_pred1, average='macro')
print("Precision NAIVE BAYES: %0.3f" % macro_precision1)
#recall NAIVE BAYES
recall1 = recall_score(Y_test, y_pred1, average='macro')
print("Recall NAIVE BAYES: %0.3f" % recall1)

# create and train the decision tree
tree = tree.DecisionTreeClassifier(criterion='entropy', random_state=1)
t_tree = time()
tree = tree.fit(X_train, Y_train)
tf_tree = round(time() - t_tree, 3)
print("Training time Arvore de Decisao:", tf_tree, "s")
t2 = round(tree.score(X_train, Y_train), 3)
te2 = round(tree.score(X_test, Y_test), 3)
# training and test accuracy
print("Acuracia de treinamento ARVORE DE DECISAO: %0.3f" % t2)
print("Acuracia de teste ARVORE DE DECISAO: %0.3f" % te2)
# decision tree precision
y_pred2 = tree.predict(X_test)
macro_precision2 = precision_score(Y_test, y_pred2, average='macro')
print("Precision ARVORE DE DECISAO: %0.3f" % macro_precision2)
# decision tree recall
recall2 = recall_score(Y_test, y_pred2, average='macro')
print("Recall ARVORE DE DECISAO: %0.3f" % recall2)

# create and train the logistic regression
lr = LogisticRegression(random_state=1)
t_lr = time()
Example #23
list_min_leaves = [i for i in range(1, 10)]
for criterion in insert_criterion:
    for depth in range(2, 7):
        for min_leaf in list_min_leaves:

            tree = DecisionTreeClassifier(criterion=criterion,
                                          max_depth=depth,
                                          min_samples_leaf=min_leaf,
                                          random_state=42)

            tree.fit(x_train, y_train)

            train_result.append(tree.score(x_train, y_train))
            test_result.append(tree.score(x_test, y_test))

            criterions.append(criterion)

            max_depths.append(depth)

            min_leaves.append(min_leaf)

result = pd.DataFrame()
result["Criterion"] = criterions
result["Depth"] = max_depths
result["MinLeafSize"] = min_leaves
result["Train_Acc"] = train_result
result["Test_Acc"] = test_result
Example #24
forest = sklearn.ensemble.RandomForestRegressor(n_estimators=100,
                                                max_depth=10,
                                                random_state=4324)
# forest_pipe = sklearn.pipeline.Pipeline([("Scale", scaler), ("Model", forest)])
forest.fit(X_train, y_train)

tree = sklearn.tree.DecisionTreeRegressor(max_depth=3, random_state=575767)
tree.fit(X_train, y_train)

print("Linear Regression & {:.3f} & {:.3f} \\\\".format(
    lm.score(X_train, y_train), lm.score(X_test, y_test)))
print("Random Forest & {:.3f} & {:.3f} \\\\".format(
    forest.score(X_train, y_train), forest.score(X_test, y_test)))
print("Decision Tree & {:.3f} & {:.3f} \\\\".format(
    tree.score(X_train, y_train), tree.score(X_test, y_test)))

# fig, ax = plt.subplots(figsize=(15, 5))
# sklearn.tree.plot_tree(tree, filled=True, impurity=False, fontsize=10, ax=ax)
# plt.show()

vaasa_iter = [
    get_data_week_and_two_previous(col, "Vaasa", algae) +
    get_data_week_and_two_previous(col, "Vaasa", temps) +
    get_data_week_and_two_previous(col, "Vaasa", rains) +
    get_data_week_and_two_previous(col, "Vaasa", aqs)
    # + [population.loc["Vaasa"][str(col[0])]]
    + [visits.loc["Vaasa"][col] / population.loc["Vaasa"][str(col[0])]]
    for col in visits.columns
]
vaasa_data = np.stack(vaasa_iter)
Example #25
def classify(keyword2int):
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.misc
    from PIL import Image

    import pprint
    pp = pprint.PrettyPrinter(indent = 4)


    # In[18]:


    import sklearn
    from sklearn import datasets
    import skimage.io


    # ## Datasets preparation

    # In[19]:


    def png2vec(filename):
        img = Image.open(filename).convert('L')
        arr = np.array(img)
        return arr


    # In[20]:


    filesetNames = ["%01d" %x for x in range(0,2)]

    # In[21]:

    import os
    images = []
    tgt = []
    count = 0
    img_test = Image.open("./img/0/1.jpg")

    for curFileset in filesetNames:
        curPath = "./img/"+ curFileset + "/"
        for file in os.listdir(curPath):
            curImageVector = png2vec(curPath + file)
            images.append(curImageVector)
            tgt.append(curFileset)
            count += 1
    #end
    print (len(images))

    # In[22]:

    '''
    import random
    index = random.randint(0,count - 1)
    random_raw_image = images[index]
    random_im = Image.fromarray(random_raw_image)
    title = "No."+ "%04d"%index + " Tag:" + tgt[index]
    plt.title(title)
    plt.imshow(random_im)


    # In[23]:


    random_raw_image.flatten()
    '''

    # In[24]:


    from sklearn.model_selection import train_test_split
    from sklearn import model_selection, metrics
    images_np = np.array(images)
    img = images_np.reshape(images_np.shape[0],-1)

    xM, xT, yM, yT = train_test_split(img, tgt, test_size = 0.01)
    print(yT)
    xT = list(xT)

    #map(list,xT)
    xT.extend(xM[0:20])
    yT.extend(yM[0:20])
    print(yT)
    xT = np.array(xT)


    print(yT)

    # # Naive Bayes

    # In[25]:
    # parameter passed in from the search step: keyword2int
    #keyword2int = {'dog':0,'cat':1}

    from sklearn import metrics
    from sklearn import naive_bayes
    cls = naive_bayes.GaussianNB()
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(type(xT[0:1]))
    print(len(xT),len(xT[0]),'result',type(xT))
    print(res)
    print(metrics.confusion_matrix(yT, res),metrics.accuracy_score(yT, res),"GaussianNB")
    #plt.title('the image is ' + str(list(keyword2int.keys())[0]) + ' prediction is ' + str(list(keyword2int.keys())[int(res_test[0])]))
    #plt.imshow(img_test)
    #plt.show()
    # In[26]:


    cls = naive_bayes.BernoulliNB(binarize = 0.9)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res),metrics.accuracy_score(yT, res),"BernoulliNB")


    # In[27]:


    cls = naive_bayes.MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res),metrics.accuracy_score(yT, res),"naive_bayes.MultinomialNB")


    # # KNN

    # In[28]:


    from sklearn.neighbors import KNeighborsClassifier 
    cls = KNeighborsClassifier(n_neighbors=7)
    cls.fit(xM, yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT, res),metrics.accuracy_score(yT, res),' knn')

    '''
    # ## KNN Parameters

    # In[30]:


    scoreListUniform = []
    nListUniform = []
    stepCount = 10
    start = 1
    end = 10
    for curN in range(start,end):
        cls = KNeighborsClassifier(n_neighbors=curN,weights = 'uniform')
        cls.fit(xM, yM)
        res = cls.predict(xT)
        curScore = metrics.accuracy_score(yT, res)
        nListUniform.append(curN)
        scoreListUniform.append(curScore)
    scoreListUniform


    # In[75]:


    scoreListDistance = []
    nListDistance = []
    for curN in range(start,end):
        cls = KNeighborsClassifier(n_neighbors = curN,weights = 'distance')
        cls.fit(xM, yM)
        res = cls.predict(xT)
        curScore = metrics.accuracy_score(yT, res)
        nListDistance.append(curN)
        scoreListDistance.append(curScore)
    scoreListDistance


    # ## Plot

    # In[ ]:


    x = range(start,end)
    plt.figure(figsize = (8,4))
    plt.plot(x,scoreListUniform,"r-",label = "Uniform Weights",linewidth = 1)
    plt.plot(x,scoreListDistance,"b-",label = "Weights by Distance",linewidth = 1)
    plt.xlabel("Neighbours")
    plt.ylabel("Score")
    plt.title("KNN Parameters Scores")
    plt.legend()
    plt.savefig("KnnParameters.png")

    plt.show()


    # ## KNN Parameters: weights

    # In[ ]:


    scoreList = []
    nList = []
    stepCount = 10
    start = 1
    end = 10

    cls = KNeighborsClassifier(weights = 'uniform')
    cls.fit(xM, yM)
    res = cls.predict(xT)
    curScore = metrics.accuracy_score(yT, res)
    nList.append('uniform')
    scoreList.append(curScore)

    cls = KNeighborsClassifier(weights = 'distance')
    cls.fit(xM, yM)
    res = cls.predict(xT)
    curScore = metrics.accuracy_score(yT, res)
    nList.append('distance')
    scoreList.append(curScore)

    '''

    # # Decision Tree

    # In[33]:


    from sklearn import tree
    cls = tree.DecisionTreeClassifier()
    cls = cls.fit(xM,yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT,res), metrics.accuracy_score(yT,res),"Decision Tree")
    import random
    index = random.randint(0, 10)
    x_test = xT[index]
    y_tag = yT[index]
    DT_res = cls.predict([x_test])

    x_test_image_list = list(x_test)
    x_temp = []
    x_image = []
    for j in range(100):
        for i in range(100):
            x_temp.append(x_test[i+j*100])
        
        x_image.append(x_temp)
        x_temp = []

    np.array(x_image)
    plt.title('the image is '+str(list(keyword2int.keys())[int(y_tag)])+' res is '+ str(list(keyword2int.keys())[int(DT_res[0])]))
    print(DT_res)
    print(list(keyword2int.keys())[int(y_tag)],list(keyword2int.keys())[int(DT_res[0])])
    plt.imshow(x_image)
    plt.show()

    import sys
    sys.exit()

    # In[40]:


    from sklearn.tree import DecisionTreeClassifier


    tree = DecisionTreeClassifier(random_state=0, min_samples_split=2)
    tree.fit(xT,yT)

    print(tree.score(xT, yT),tree.score(xM, yM),"DecisionTreeClassifier")


    # In[37]:


    DecisionTreeClassifier()


    # # random forest 

    # In[56]:


    from sklearn.ensemble import RandomForestClassifier


    forest = RandomForestClassifier(n_estimators=150, random_state=0)
    forest.fit(xT, yT)


    forest.score(xT, yT),forest.score(xM, yM)


    # In[44]:


    forest


    # In[74]:


    scoreListUniform = []
    nListUniform = []
    stepCount = 10
    stepDist = 200
    start = 200
    end = 2001
    for curCount in range(start,end,stepDist):
        forest = RandomForestClassifier(n_estimators = curCount, random_state=0)
        forest.fit(xT, yT)
        nListUniform.append(curCount)
        scoreListUniform.append(forest.score(xM, yM))
    scoreListUniform


    # In[94]:

    # ### Gini Random

    # ### Entropy Best

    # ### Entropy Random

    # ## SVM

    # In[76]:

    '''
    '''
    from sklearn import svm
    cls = svm.SVC()
    cls = cls.fit(xM,yM)
    res = cls.predict(xT)
    print(metrics.confusion_matrix(yT,res), metrics.accuracy_score(yT,res), " svm")


    # In[77]:
    '''
Example #26
                           max_iter=500,
                           random_state=17,
                           n_jobs=4,
                           multi_class='multinomial')
logit_pipe = Pipeline([('scaler', StandardScaler()), ('logit', logit)])
logit_pipe.fit(X_train, y_train)
y_pred_lf = logit_pipe.predict(X_test)
score_lf = accuracy_score(y_test, y_pred_lf)

##### 0.41

##### Decision Tree
tree = tree.DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
score_tree = tree.score(X_test, y_test)

##### 0.38
##### Random Forest

forest = RandomForestClassifier(n_estimators=100, random_state=17, n_jobs=4)
forest.fit(X_train, y_train)
y_pred_forest = forest.predict(X_test)
S = accuracy_score(y_test, y_pred_forest)

##### 0.43
##### SVM
### Linear
from sklearn.svm import SVC

svc = OneVsRestClassifier(SVC()).fit(X_train, y_train)
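Example #27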
loans_info['orig_fico'] = loans_orig_info['nb_original_fico']
loans_info['nb_realized_loss'] = np.log(loans_info['nb_realized_loss'])
loans_info['orig_balance'] = np.log(loans_info['orig_balance'])
vals = 0
runs = 100
x1 = 0
x2 = 0
x3 = 0
x4 = 0
for x in range(0, runs):
    clf = DecisionTreeClassifier()
    data = loans_info.dropna(axis=0, how='any')
    X = data.drop(columns={'nb_loan_number', 'nb_realized_loss', 'caused_loss', 'age'})
    X_train, X_test, Y_train, Y_test = train_test_split(X, data['caused_loss'], test_size=.7, train_size=.3)
    tree = clf.fit(X_train, Y_train)
    vals = vals + tree.score(X_test, Y_test)
    #print(tree.feature_importances_)
    x1 += tree.feature_importances_[0]
    x2 += tree.feature_importances_[1]
    x3 += tree.feature_importances_[2]
    x4 += tree.feature_importances_[3]
print(vals / runs)
print('Average Importance of Loan Term, Loan Amount, FICO Score, and Loan to Value Ratio')
print(x1 / runs)
print(x2 / runs)
print(x3 / runs)
print(x4 / runs)  # print the last value too; a bare expression only displays in a notebook
Example #28
def decision_tree_regressor(x_train, y_train, x_test, y_test):
    from sklearn import tree
    tree = tree.DecisionTreeRegressor()
    tree.fit(x_train, y_train)
    value = tree.score(x_test, y_test)
    return "{0:.2f}".format(value)
Example #29
def getTreeImportanceAndScore(tree, features, target):
    importance = tree.feature_importances_
    score = tree.score(features, target)
    return importance, score
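A short usage sketch for getTreeImportanceAndScore, assuming a fitted DecisionTreeClassifier (the iris data is illustrative only):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
clf = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)
importance, score = getTreeImportanceAndScore(clf, iris.data, iris.target)
print(importance, score)

Example #30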
########################################################################################################################

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split  # some documents still show the cross_validation module, but it no longer exists as of version 0.18
import pylab as plt

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    stratify=cancer.target,
                                                    random_state=42)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

for i in range(2, 10):
    tree = DecisionTreeClassifier(max_depth=i, random_state=0)
    tree.fit(X_train, y_train)
    export_graphviz(tree,
                    out_file="tree" + str(i) + ".dot",
                    class_names=["malignant", "benign"],
                    feature_names=cancer.feature_names,
                    impurity=False,
                    filled=True)
    print("Accuracy on training set: {:.3f}".format(
        tree.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
Example #31
yList = [predY10, predY30, predY50, predY70, usedTrainY]
yListName = [
    '10% labeled', '30% labeled', '50% labeled', '70% labeled', 'All data'
]
semiRecord = []
for yIdx, y in enumerate(yList):
    CV = cross_val_score(BaggingClassifier(cartTree,
                                           max_samples=0.7,
                                           max_features=1.0),
                         usedTrainX,
                         y,
                         cv=5).mean()
    #    CV = cross_val_score(ridge, usedTrainX, y, cv=5).mean()
    tree = cartTree_bagging.fit(usedTrainX, y)
    testScore = tree.score(usedTestX, usedTestY)

    print(yListName[yIdx])
    print('CV:\t{}'.format(CV))
    print('Test:\t{}'.format(testScore))

    semiRecord.append({'name': yListName[yIdx], 'CV': CV, 'test': testScore})

#%% 4. Predict(batch data)
from sklearn.model_selection import train_test_split
x_batch_10 = usedTrainX[y_10 != -1]
y_batch_10 = usedTrainY[y_10 != -1]
x_batch_30 = usedTrainX[y_30 != -1]
y_batch_30 = usedTrainY[y_30 != -1]
x_batch_50 = usedTrainX[y_50 != -1]
y_batch_50 = usedTrainY[y_50 != -1]
Example #32
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

income_data = pd.read_csv('income.csv', header=0, delimiter=', ')
print(income_data.iloc[0])

labels = income_data[["income"]]
income_data['sex-int'] = income_data['sex'].apply(lambda x: 0 if x == 'Male' else 1)
# note: the dataset value is 'United-States'; 'United-State' would never match
income_data['country-int'] = income_data['native-country'].apply(lambda y: 0 if y == 'United-States' else 1)
data = income_data[['age', 'capital-gain', 'capital-loss', 'hours-per-week', 'sex-int', 'country-int']]

train_data, test_data, train_labels, test_labels = train_test_split(data, labels, random_state=1)

forest = RandomForestClassifier(random_state=1)
forest.fit(train_data, train_labels)
#print(forest.feature_importances_)
print(forest.score(test_data, test_labels))

#comparison with Decision Tree
tree = DecisionTreeClassifier(random_state=1)
tree.fit(train_data, train_labels)
print(tree.score(test_data, test_labels))

#print(income_data['native-country'].value_counts())
Example #33
    lambda occ: 0 if occ in [
        "Adm-clerical", "Armed-Forces", "Priv-house-serv", "Handlers-cleaners"
    ] else 1)

labels = income_data[["income"]]
data = income_data[[
    "capital-gain", "capital-loss", "education-num", "occupation-int"
]]

train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, random_state=1)

forest = RandomForestClassifier(n_estimators=150, random_state=2)
forest.fit(train_data, train_labels)

# Show importances of features
print("importances: ", forest.feature_importances_)
print(forest.score(test_data, test_labels))  # 84.45% correct

prediction = {}
for i in range(1, 20):
    tree = DecisionTreeClassifier(random_state=2, max_depth=i)
    tree.fit(train_data, train_labels)
    prediction.update({i: tree.score(test_data, test_labels)})
print("Decision Tree best: ",
      max(prediction.items(), key=operator.itemgetter(1)))  # 84.47% correct

me = np.array([0, 0, 13, 1]).reshape(1, -1)

print("Random Forest Prediction of me: ", forest.predict(me))