Example No. 1
def main():
    X_train, X_test, y_train, y_test = prank_data_split(
        '../dataset/ratings.csv', 0.2)

    # cross-validation: pick the tree depth that scores best over 5 folds
    depth_array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
    depth = kfoldcv(X_train.to_numpy(), y_train.to_numpy(), 5, depth_array,
                    '../dataset/movies.csv')

    # prediction: retrain at the chosen depth and score the test set
    print('predict using decision tree with max depth', depth, ':')
    X_train, X_test, y_train, y_test = generate_matrix(
        X_train, X_test, y_train, y_test, '../dataset/movies.csv')
    regr = decision_tree(X_train, y_train, depth)
    y_predicted = predict(regr, X_test)

    rmse = get_RMSE(y_test, y_predicted)
    print('rmse:', rmse)
    mae = get_MAE(y_test, y_predicted)
    print('mae:', mae)

    # specificity, sensitivity, precision, accuracy
    spec, sens, prec, accu = get_spec_sens_prec_accu(y_test, y_predicted)
    print('spec:', spec)
    print('sens:', sens)
    print('prec:', prec)
    print('accu:', accu)
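
The two error metrics are the standard ones; a minimal sketch of how get_RMSE and get_MAE could look in NumPy (the project's own implementations live elsewhere and may differ):

import numpy as np

def get_RMSE(y_true, y_pred):
    # root mean squared error: sqrt(mean((y - y_hat)^2))
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))

def get_MAE(y_true, y_pred):
    # mean absolute error: mean(|y - y_hat|)
    return np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))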
Example No. 2
def performing_algorithm(X, y, X_test):
    """
    :param X: training feature matrix
    :param y: training target values
    :param X_test: test feature matrix
    :return: predictions of the algorithm selected via args.algorithm
    """
    if args.algorithm == "linear_regression":
        return linear_regression(X, y, X_test)
    elif args.algorithm == "decision_tree":
        return decision_tree(X, y, X_test)
    elif args.algorithm == "SVM":
        return SVM(X, y, X_test)
    raise ValueError("unknown algorithm: " + args.algorithm)
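
args is presumably a module-level argparse namespace; a minimal sketch of the setup it implies (the flag name, choices, and default here are assumptions that mirror the comparisons above):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--algorithm",
                    choices=["linear_regression", "decision_tree", "SVM"],
                    default="linear_regression",
                    help="which learning algorithm to run")
args = parser.parse_args()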
Example No. 3
def get_best_attribute(attributes, examples):
    # A parallel variant using worker processes is left here disabled:
    #   workers = parallel.Workers()
    #   workers.initialize_n_workers(4)
    #   workers.set_function(get_attribute_info_gain)
    #   workers.start()
    #   inputs = [[a, examples] for a in attributes]
    #   output = workers.run_over_data(inputs)
    best_gain = float("-inf")
    best_attr = None
    for a in attributes:
        gain, _ = get_attribute_info_gain(a, examples)
        # keep the attribute with the highest information gain
        if gain > best_gain:
            best_gain = gain
            best_attr = a

    return best_attr, decision_tree(best_attr,
                                    dt_util.get_goal_counts(examples),
                                    dt_util.GOAL_INDEX)
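
get_attribute_info_gain is not shown; for reference, information gain is the entropy drop from splitting on an attribute. A minimal sketch, assuming examples are rows with the class label at a fixed goal index (the real helper and the dt_util layout may differ):

import math
from collections import Counter, defaultdict

GOAL_INDEX = -1  # assumption: class label stored in the last column

def entropy(rows):
    # H(S) = -sum_c p(c) * log2 p(c) over class labels c
    counts = Counter(row[GOAL_INDEX] for row in rows)
    return -sum(n / len(rows) * math.log2(n / len(rows))
                for n in counts.values())

def get_attribute_info_gain(column, rows):
    # gain(S, A) = H(S) - sum_v |S_v|/|S| * H(S_v)
    partitions = defaultdict(list)
    for row in rows:
        partitions[row[column]].append(row)
    remainder = sum(len(part) / len(rows) * entropy(part)
                    for part in partitions.values())
    return entropy(rows) - remainder, column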
Example No. 5
def __init__(self, data, labels, num_trees, weights=None, randomized=False):
    self.data = data
    self.labels = labels
    self.trees = []
    num_obs = data.shape[0]
    num_features = data.shape[1]
    # uniformly weight data by default (can be modified for boosting)
    if weights is None:
        weights = 1.0 / num_obs * numpy.ones((num_obs, 1))
    # Split into num_trees training sets, each the same size as the
    # original set, with observations sampled with replacement
    # according to the weighting scheme.
    data_sets = numpy.zeros((num_trees, num_obs, num_features))
    label_sets = numpy.zeros((num_trees, num_obs, 1))
    for i in range(num_trees):
        for j in range(num_obs):
            sampled_obs_index = self.sample_index(weights)
            data_sets[i, j] = data[sampled_obs_index, :]
            label_sets[i, j] = labels[sampled_obs_index]

    # train one decision tree per bootstrap sample
    for i in range(num_trees):
        self.trees.append(decision_tree(data_sets[i, :, :],
                                        label_sets[i, :],
                                        randomized))
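
self.sample_index is not shown. A minimal sketch, assuming it draws a single observation index with probability proportional to its weight (numpy is already imported at module level in this class's file):

def sample_index(self, weights):
    # draw one index in [0, num_obs) with probability proportional to weights
    p = numpy.asarray(weights).ravel()
    return numpy.random.choice(len(p), p=p / p.sum())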
Example No. 6
    [90, 1, 0, 3.5],  # FIT
    [75, 1, 1, 3.1],  # FIT
    [85, 2, 1, 3.1],  # NOT_FIT
    [65, 0, 1, 2.1],  # NOT_FIT
    [70, 1, 0, 3.0]  # NOT_FIT
])

# class label for each candidate:
Y = np.array(
    [FIT, FIT, FIT, FIT, NOT_FIT, FIT, FIT, NOT_FIT, NOT_FIT, NOT_FIT])

# variable types of the training-set columns
scale = np.array([NUMERICAL, CATEGORICAL, CATEGORICAL, NUMERICAL])

# recursive construction of the decision tree
decision_tree(X, Y, scale)

# classify every example with the classifier
# built on top of the tree
y = np.array([clf.classify(X[i, :]) for i in range(len(X))])

# classification succeeds if every example is classified correctly
if np.all(y == Y):
    print('\nclassification success!\n')
else:
    print('\nclassification fail... :(\n')

# self-check with the classifier
# TODO: once you have built the decision tree and implemented the
# classify function on top of it, uncomment the code below and check
# whether you are fit to be Professor Bukowski's assistant :)
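
clf and its classify method are what the TODO asks for; a minimal sketch of the tree walk such a classifier might perform (the node layout here is an assumption, not the exercise's actual structure):

class Node:
    def __init__(self, column=None, threshold=None, children=None, label=None):
        self.column = column        # feature index tested at this node
        self.threshold = threshold  # split point for NUMERICAL features
        self.children = children    # maps a branch outcome to a subtree
        self.label = label          # class label if this node is a leaf

def classify(node, x):
    # walk from the root to a leaf, at each node following the branch
    # that matches the example's value for the tested feature
    while node.label is None:
        if node.threshold is not None:       # numerical split
            branch = x[node.column] <= node.threshold
        else:                                # categorical split
            branch = x[node.column]
        node = node.children[branch]
    return node.label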
Example No. 7
from decision_tree import *
from numpy import array
import scipy.io as sio

def max_depth(node):
    if node.is_leaf():
        return node.depth
    else:
        return max(max_depth(node.left_child), max_depth(node.right_child))

all_data = sio.loadmat('spam.mat')

dt = decision_tree(all_data['Xtrain'][0:3200, :], all_data['ytrain'][0:3200], True)
score = 0
# evaluate on the 250 held-out training rows (indices 3200-3449)
for i in range(3200, 3450):
    score += dt.classify(all_data['Xtrain'][i, :]) == all_data['ytrain'][i][0]

print('score =', score)
print('error =', 1 - float(score) / 250)
print('max depth =', max_depth(dt.root))

dt = decision_tree(all_data['Xtrain'], all_data['ytrain'], True)
score = 0
# training error: classify the same rows the tree was fit on
for i in range(3450):
    score += dt.classify(all_data['Xtrain'][i, :]) == all_data['ytrain'][i][0]

print('score =', score)
print('error =', 1 - float(score) / 3450)
print('max depth =', max_depth(dt.root))

#dt = decision_tree(all_data['Xtrain'], all_data['ytrain'])
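
The two runs above measure different things: the first holds out rows 3200-3449 (250 samples) as a validation set, the second reports training error on all 3450 rows. A small helper makes that comparison reusable, assuming classify takes one row at a time as above:

def error_rate(tree, X, y, start, stop):
    # fraction of rows in [start, stop) that the tree misclassifies
    hits = sum(tree.classify(X[i, :]) == y[i][0] for i in range(start, stop))
    return 1 - float(hits) / (stop - start)

With it, the held-out error is error_rate(dt, all_data['Xtrain'], all_data['ytrain'], 3200, 3450) and the training error is error_rate(dt, all_data['Xtrain'], all_data['ytrain'], 0, 3450).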