Example #1
def graph():
    # Render the fitted decision tree (tree() is assumed to return a
    # trained sklearn DecisionTreeClassifier) as a Graphviz image.
    dt_data = tree.export_graphviz(tree(),
                                   out_file=None,
                                   feature_names=iris.feature_names,
                                   class_names=iris.target_names,
                                   filled=True,
                                   rounded=True,
                                   special_characters=True)
    graph = graphviz.Source(dt_data)
    graph.render('output')
Example #2
def getSimEntDataset(f, words, task):
    # Parse a tab-separated file into (tree, tree, label) training examples.
    examples = []
    with open(f, 'r') as data:
        for line in data:
            line = line.strip()
            if len(line) > 0:
                fields = line.split('\t')
                if len(fields) == 3:
                    if task == "sim":
                        # Similarity task: the label is a real-valued score.
                        e = (tree(fields[0], words), tree(fields[1], words),
                             float(fields[2]))
                        examples.append(e)
                    elif task == "ent":
                        # Entailment task: the label stays a string.
                        e = (tree(fields[0], words), tree(fields[1], words),
                             fields[2])
                        examples.append(e)
                    else:
                        raise ValueError('Params.traintype not set correctly.')
                else:
                    print(fields)
    return examples
Example #3
def getSentimentDataset(f, words):
    # Parse a tab-separated file into (tree, label) training examples.
    examples = []
    with open(f, 'r') as data:
        for line in data:
            line = line.strip()
            if len(line) > 0:
                fields = line.split('\t')
                if len(fields) == 2:
                    e = (tree(fields[0], words), fields[1])
                    examples.append(e)
                else:
                    print(fields)
    return examples
Example #4
def main(argv):
    if FLAGS.dataset == 'toy':
        train_X, train_y, test_X, test_y, num_classes = get_toy_dataset()
    elif FLAGS.dataset == 'mnist':
        train_X, train_y, test_X, test_y, num_classes = get_mnist()
    else:
        raise ValueError('Unknown dataset: %s' % FLAGS.dataset)

    train_pred = None

    if FLAGS.method == 'knn':
        pred = knn(train_X, train_y, test_X)
    elif FLAGS.method == 'svm':
        train_pred, pred = svm(train_X, train_y, test_X)
    elif FLAGS.method == 'tree':
        pred = tree(train_X, train_y, test_X)
    elif FLAGS.method == 'boosting':
        pred = boosting(train_X, train_y, test_X)
    elif FLAGS.method == 'nn':
        train_pred, pred = nn(train_X, train_y, test_X, num_classes)
    else:
        raise ValueError('Unknown method: %s' % FLAGS.method)

    # svm and nn also return predictions on the training set.
    if train_pred is not None:
        print('Train Accuracy: %f' % compute_accuracy(train_pred, train_y))

    print('Accuracy: %f' % compute_accuracy(pred, test_y))
Example #5
# vote_est = [
#     ('knn', neighbors.KNeighborsClassifier()),

#     #SVM: http://scikit-learn.org/stable/modules/svm.html
#     ('svc', svm.SVC(probability=True)),

#     #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
#     ('xgb', XGBClassifier())

# ]

# vote_hard = ensemble.VotingClassifier(estimators = vote_est, voting='hard')
# vote_hard_cv = model_selection.cross_validate(vote_hard, data1[data1_x_bin], data1[Target], cv = cv_split)
# vote_hard.fit(data1[data1_x_bin], data1[Target])

# # print("VOTING_CLASSIFIER Parameters: ", dtree.get_params())
# # print("VOTING_CLASSIFIER Training w/bin score mean: {:.2f}", format(vote_hard_cv['train_score'].mean()))
# print("HARD_VOTING_CLASSIFIER Test w/bin score mean: {:.2f}", format(vote_hard_cv['test_score'].mean()))

# vote_soft = ensemble.VotingClassifier(estimators = vote_est, voting='soft')
# vote_soft_cv = model_selection.cross_validate(vote_soft, data1[data1_x_bin], data1[Target], cv = cv_split)
# vote_soft.fit(data1[data1_x_bin], data1[Target])

# # print("VOTING_CLASSIFIER Parameters: ", dtree.get_params())
# # print("VOTING_CLASSIFIER Training w/bin score mean: {:.2f}", format(vote_hard_cv['train_score'].mean()))
# print("SOFT_VOTING_CLASSIFIER Test w/bin score mean: {:.2f}", format(vote_soft_cv['test_score'].mean()))

# tree() is assumed here to return Survived predictions for data_val.
data_val['Survived'] = tree(data_val).astype(int)
submit = data_val[['PassengerId', 'Survived']]
submit.to_csv("submission2.csv", index=False)
print(submit.info())
Example #6
import pandas as pd
from collections import Counter

data = pd.read_pickle('/home/hudson/Downloads/prostate.df')
data.head(2)

#cell 5
y = data.values[:, -1]
print(y.shape, Counter(y.tolist()))
x = data.values[:, :-1]
print(x.shape)

#cell 6
## Task 1

(You can use the decision tree implementation from scikit-learn.)

Try a decision tree on the above dataset. Consider different values for the maximum depth of the tree ('max_depth') and the minimum number of samples required at a leaf node ('min_samples_leaf'). Conduct 10-fold cross-validation and:

    - plot training error and testing error vs. tree depth
    - plot training error and testing error vs. min. samples per leaf node

Error should be measured as the percentage of misclassifications (i.e., predicting 'normal' for 'tumor' and vice versa). A sketch of this sweep follows the next cell.

#cell 7
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import KFold

n_folds = 10
# 10-fold cross-validation (KFold moved from the deprecated
# sklearn.cross_validation module to sklearn.model_selection)
kf = KFold(n_splits=n_folds)
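
Building on that setup, here is the sketch promised above, assuming the x, y, kf, and tree names defined in the previous cells; the cv_error_curve helper and the parameter ranges are illustrative additions, not part of the original notebook.

import numpy as np
import matplotlib.pyplot as plt

def cv_error_curve(param_name, param_values):
    # Average train/test misclassification rates over the 10 folds
    # for each candidate value of the given hyperparameter.
    train_err, test_err = [], []
    for value in param_values:
        tr, te = [], []
        for train_idx, test_idx in kf.split(x):
            clf = tree.DecisionTreeClassifier(**{param_name: value})
            clf.fit(x[train_idx], y[train_idx])
            # score() returns accuracy, so 1 - score is the error rate.
            tr.append(1 - clf.score(x[train_idx], y[train_idx]))
            te.append(1 - clf.score(x[test_idx], y[test_idx]))
        train_err.append(np.mean(tr))
        test_err.append(np.mean(te))
    plt.plot(param_values, train_err, label='training error')
    plt.plot(param_values, test_err, label='testing error')
    plt.xlabel(param_name)
    plt.ylabel('misclassification rate')
    plt.legend()
    plt.show()

cv_error_curve('max_depth', list(range(1, 11)))
cv_error_curve('min_samples_leaf', list(range(1, 21)))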

Example #7
def predict(obj):
    # tree() is assumed to return a fitted classifier.
    p = tree().predict([obj])
    print('prediction:', p)
Example #8
        classifier.predict(testing_set.drop('class', axis=1)))


dataframe = pd.read_csv("diabetes.csv")
classes = list(dataframe['class'].unique())

# split dataframe (67% / 33%)
training_set = dataframe.sample(frac=0.67)
testing_set = dataframe[~dataframe.isin(training_set).all(1)]
print(training_set)
print(testing_set)

nn3 = knn(training_set, 3)
nn5 = knn(training_set, 5)
nn11 = knn(training_set, 11)
dt = tree(training_set)   # renamed so the tree() helper is not shadowed
nb = gnb(training_set)    # renamed so the gnb() helper is not shadowed

functions = [nn3, nn5, nn11, dt, nb]
classifiers = ["3NN", "5NN", "11NN", "tree", "naive_bayes"]
scores = [get_score(x, testing_set) for x in functions]

## confusion matrices
for classifier, function, score in zip(classifiers, functions, scores):
    print("\n" + classifier + " classifier")
    print("accuracy = " + str(round(score * 100, 2)) + "%")
    print(get_confusion_matrix(function, testing_set))

## bar chart
plt.bar(classifiers, scores, align='center')
plt.ylabel("score")
plt.show()
Example #9
df_test_tmp = df_test.replace("male", 0).replace("female", 1)
df_test_tmp = df_test_tmp.replace("C", 0).replace("Q", 1).replace("S", 2)
df_test_tmp["Age"].fillna(df_test_tmp["Age"].median(), inplace=True)
df_test_tmp["Fare"].fillna(df_test_tmp["Fare"].median(), inplace=True)
df_test_tmp["Embarked"].fillna(df_test_tmp["Embarked"].median(), inplace=True)
test_data = df_test_tmp.loc[:, [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"
]]

# In[76]:

max_acc = 0
max_depth = 0
max_model = None
# tree() is assumed to return (fitted_model, validation_accuracy)
# for a decision tree limited to depth i.
for i in range(1, 100):
    ret = tree(i, train_data, train_target, valid_data, valid_target)
    if max_acc < ret[1]:
        max_acc = ret[1]
        max_depth = i
        max_model = ret[0]
print(max_depth, ",", max_acc)

# In[79]:

import csv

predicted = max_model.predict(test_data)
with open("predict_result_data.csv", "w") as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(["PassengerId", "Survived"])
    for pid, survived in zip(df_test["PassengerId"], predicted):
        writer.writerow([pid, survived])
Example #10
    def diseases(node, depth):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            disease_name = feature_name[node]
            threshold = tree_.threshold[node]
            print("\n" + disease_name + "?\n")
            answer = input()
            # Asking the user if they have the symptoms displayed
            if answer == 'Yes':
                val = 1
            else:
                val = 0
            if val <= threshold:
                diseases(tree_.children_left[node], depth + 1)
            else:
                symptoms_present.append(disease_name)
                diseases(tree_.children_right[node], depth + 1)
        else:
            present_disease = probable_disease(tree_.value[node])
            print("Possible disease: " + present_disease)
            red_column = data.columns
            symptoms_given = red_column[
                data.loc[present_disease].values[0].nonzero()]
            print("\nPresent symtomp:  " + str(list(symptoms_present)))
            print("\nKnown symptomps of the disease: " +
                  str(list(symptoms_given)))

    diseases(0, 1)


tree(classifier, column)
Example #11
    return f1_score(y_test, clf.predict(X_test), average='macro')


def less_X():
    # X and y are assumed to be module-level globals; the reassignment
    # below would otherwise raise UnboundLocalError.
    global X, y
    # Keep only columns 0-24 and 1024-1048 of X.
    X = pd.merge(X.iloc[:, 0:25],
                 X.iloc[:, 1024:1049],
                 how='outer',
                 left_index=True,
                 right_index=True)
    X, y = shuffle(X, y, random_state=0)

    # 60/20/20 train/validation/test split.
    n = len(X)
    X_train, X_valid, X_test = X[:int(0.6 * n)], X[int(0.6 * n):int(0.8 * n)], X[int(0.8 * n):]
    y_train, y_valid, y_test = y[:int(0.6 * n)], y[int(0.6 * n):int(0.8 * n)], y[int(0.8 * n):]

    randomforest(100, 50)


if __name__ == "__main__":
    svm()
    randomforest(100, 100)
    randomforest(100, 50)
    NB()
    less_X()

    plot_data = []
    for i in range(0, 100):
        plot_data.append(tree())
    plt.plot(plot_data)
    plt.show()