def remove_features(removal_order, train_file, test_file, attr_file, max_features):
    """Iteratively remove features and track train/test accuracy after each removal."""
    train_accs = []
    test_accs = []
    remove_columns = []
    for col in removal_order:
        print(col)
        remove_columns.append(col)
        if len(remove_columns) == max_features:
            break
        print(remove_columns)
        # re-read the data with the current set of columns removed
        train_data, train_attr = read_data(train_file, attr_file, remove_columns=remove_columns)
        test_data, test_attr = read_data(test_file, attr_file, remove_columns=remove_columns)
        tree = decision_tree.DecisionTreeLearning(train_data, train_attr, "normal", "class")
        decision_tree.print_tree(tree)
        y_pred, y_true = decision_tree.predict(train_data, tree)
        train_acc = decision_tree.accuracy_score(y_pred, y_true)
        print('Accuracy on Training Data: {0}'.format(train_acc * 100))
        y_pred, y_true = decision_tree.predict(test_data, tree)
        test_acc = decision_tree.accuracy_score(y_pred, y_true)
        print('Accuracy on Test Data: {0}'.format(test_acc * 100))
        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return train_accs, test_accs

def map_predict_voting(interface, state, label, inp):
    import decision_tree
    out = interface.output(0)
    fill_in_values = state["fill_in_values"]
    for row in inp:
        row = row.strip().split(state["delimiter"])
        predicted = False
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]
            # substitute missing values and cast continuous features to float;
            # fill_in_values is built per feature position (see map_fit), so it
            # is indexed by i rather than by the raw column index j
            x = [(fill_in_values[i] if row[j] in state["missing_vals"]
                  else float(row[j]) if state["X_meta"][i] == "c"
                  else row[j])
                 for i, j in enumerate(state["X_indices"])]
            tallies = {}
            for tree in state["forest"]:
                pred = decision_tree.predict(tree, x)
                tallies[pred] = tallies.get(pred, 0) + 1
                # stop early once one label holds an absolute majority
                if any(e > int(len(state["forest"]) / 2.) for e in tallies.values()):
                    prediction = max(tallies, key=tallies.get)
                    out.add(x_id, (prediction, tallies[prediction]))
                    predicted = True
                    break
            if not predicted:
                prediction = max(tallies, key=tallies.get)
                out.add(x_id, (prediction, tallies[prediction]))

def CalculatePoints(crop_points, crop_prop, currentCondition):
    for crop in crop_points:
        humidity = HumidityInRange(crop_prop[crop][crop_properties.HUMIDITY],
                                   currentCondition[crop_properties.HUMIDITY])
        crop_points[crop] += humidity
        temperature = TemperatureInRange(crop_prop[crop][crop_properties.TEMPERATURE],
                                         currentCondition[crop_properties.TEMPERATURE])
        crop_points[crop] += temperature
        rainfall = RainfallInRange(crop_prop[crop][crop_properties.RAINFALL],
                                   currentCondition[crop_properties.RAINFALL])
        crop_points[crop] += rainfall
        locationDensityScore = LocationScore(crop)
        crop_points[crop] += locationDensityScore
    # let the decision tree cast one extra vote for its predicted crop
    dtParameters = [
        currentCondition[crop_properties.TEMPERATURE],
        currentCondition[crop_properties.HUMIDITY],
        currentCondition[crop_properties.RAINFALL],
    ]
    dtCrop = decision_tree.predict(dtParameters)
    print(dtCrop)
    crop_points[dtCrop] += 1
    return crop_points

def map_predict_voting(interface, state, label, inp):
    import decision_tree
    out = interface.output(0)
    half_ensemble = round(len(state["forest"]) / 2.)
    fill_in_values = state["fill_in_values"]
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]
            # substitute missing values and cast continuous features to float
            x = [(fill_in_values[i] if row[j] in state["missing_vals"]
                  else float(row[j]) if state["X_meta"][i] == "c"
                  else row[j])
                 for i, j in enumerate(state["X_indices"])]
            predictions = {}
            for i, tree in enumerate(state["forest"]):
                pred = decision_tree.predict(tree, x)
                predictions[pred] = predictions.get(pred, 0) + 1
                # once half of the trees have voted, stop as soon as one
                # label has collected half_ensemble votes
                if i >= half_ensemble - 1:
                    prediction = max(predictions, key=predictions.get)
                    value = predictions[prediction]
                    if value == half_ensemble:
                        break
            out.add(x_id, (prediction, i + 1))

def map_predict_dist(interface, state, label, inp):
    import numpy as np
    import decision_tree
    out = interface.output(0)
    ensemble_size = len(state["forest"])
    fill_in_values = state["fill_in_values"]
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]
            x = [(fill_in_values[i] if row[j] in state["missing_vals"]
                  else float(row[j]) if state["X_meta"][i] == "c"
                  else row[j])
                 for i, j in enumerate(state["X_indices"])]
            # each tree returns a label distribution; summing and dividing by
            # the ensemble size yields the averaged class distribution
            pred_dist = [decision_tree.predict(tree, x, dist=True)
                         for tree in state["forest"]]
            y_dist = {k: v / float(ensemble_size)
                      for k, v in np.sum(pred_dist).iteritems()}
            prediction = max(y_dist, key=y_dist.get)
            out.add(x_id, (prediction, y_dist[prediction]))

def map_predict(interface, state, label, inp):
    import decision_tree
    import numpy as np
    out = interface.output(0)
    fill_in_values = state["fill_in_values"]
    coeff = state["coeff"]
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]
            x, cont, disc = [], [], []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    row[j] = fill_in_values[i]
                if state["X_meta"][i] == "c":
                    x.append(float(row[j]))
                    cont.append(float(row[j]))
                else:
                    x.append(row[j])
                    disc.append(row[j])
            cont, disc = np.array(cont), np.array(disc)
            # Gower similarity of the sample to every medoid group; j indexes
            # the medoids within group i
            similarities = []
            for i, medoids in enumerate(state["medoids"]):
                gower = 0 if len(cont) == 0 else np.sum(
                    1 - np.true_divide(np.abs(cont - medoids[0]),
                                       state["gower_ranges"][i]), axis=1)
                gower += 0 if len(disc) == 0 else np.sum(disc == medoids[1], axis=1)
                similarities += zip(np.round(1 - gower / float(len(x)), 4),
                                    [(i, j) for j in range(len(gower))])
            similarities = sorted(similarities)
            # keep all medoids whose dissimilarity is within coeff of the best
            threshold = similarities[0][0] * (1 + coeff)
            similar_medoids = [similarities[0][1]]
            pos = 1
            while pos < len(similarities) and similarities[pos][0] <= threshold:
                similar_medoids.append(similarities[pos][1])
                pos += 1
            predictions = {}
            num_trees = 0
            for forest in state["forest"]:
                for tree in forest:
                    pred = decision_tree.predict(tree, x)
                    predictions[pred] = predictions.get(pred, []) + [1]
                    num_trees += 1
            for k, v in predictions.iteritems():
                predictions[k] = np.average(v) * len(v)
            max_pred = max(predictions, key=predictions.get)
            out.add(x_id, (max_pred, num_trees))

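# A self-contained sketch of the Gower computation used in map_predict above,
# with made-up values (the medoid arrays, ranges, and feature values are
# illustrative assumptions, not data from the original state dictionary):
# continuous features contribute 1 - |difference| / range, discrete features
# contribute an exact-match indicator, averaged over all features.
import numpy as np

cont = np.array([5.0, 100.0])                  # continuous part of one sample
cont_medoids = np.array([[4.0, 90.0],          # continuous parts of two medoids
                         [9.0, 10.0]])
gower_ranges = np.array([10.0, 100.0])         # per-feature ranges (np.ptp) from training
disc = np.array(["red"])                       # discrete part of the sample
disc_medoids = np.array([["red"], ["blue"]])

sims = np.sum(1 - np.abs(cont - cont_medoids) / gower_ranges, axis=1)
sims += np.sum(disc == disc_medoids, axis=1)   # 1 per matching discrete feature
sims /= 3.0                                    # average over all 3 features
print(np.round(sims, 4))                       # -> [0.9333 0.2333]; medoid 0 is closer
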
def predict(testX, forest):
    """Majority vote over a forest of binary (0/1) trees."""
    result = []
    for tree in forest:
        result.append(decision_tree.predict(testX, tree))
    count0 = result.count(0)
    count1 = result.count(1)
    # print(result)
    if count0 > count1:
        return 0
    else:
        return 1

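# A hedged generalization of the binary vote in predict() above: for more than
# two classes, collections.Counter gives the same majority vote in one step.
# The prediction list below is illustrative, not output from decision_tree.
from collections import Counter

def majority_vote(predictions):
    # predictions holds one label per tree, e.g. collected as in predict() above
    return Counter(predictions).most_common(1)[0][0]

print(majority_vote([0, 1, 1, 2, 1]))  # -> 1
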
def classify_boosting(train, test, n_rounds):
    weight = np.full(len(train), 1 / len(train))  # initial weight for all records
    boosting_sum = np.zeros((n_rounds, len(test)))  # per-round weighted votes
    restart = False
    for i in range(n_rounds):
        # for each round, sample the training set with replacement according to weight
        bootstrap_train_x, bootstrap_train_y, bootstrap_weight = sample(train, weight)
        # generate decision tree
        dt = decision_tree.create_tree(bootstrap_train_x, 0, 3)
        # apply decision tree on the training dataset
        result_train = train.apply(lambda r: decision_tree.predict(r, dt), axis=1)
        # apply decision tree on the testing dataset
        result_test = test.apply(lambda r: decision_tree.predict(r, dt), axis=1)
        # miss = 1 if misclassified else 0
        miss = np.logical_xor(result_train.values, bootstrap_train_y)
        # error = sum(miss(i) * weight(i)) / sum(weight(i))
        error = np.sum(np.multiply(weight, miss)) / np.sum(weight)
        # if error > 0.5 then start over
        if error > 0.5:
            restart = True
            break
        # alpha (classifier weight) = 1/2 * ln((1 - error) / error)
        alpha = 0.5 * np.log((1 - error) / error)
        # record alpha * h(x) for this round, with predictions mapped to +1/-1
        boosting_sum[i, :] = np.multiply([float(1 if r > 0 else -1) for r in result_test], alpha)
        # update weight: new weight = old weight * e^(+/- alpha), up-weighting misses
        weight = np.multiply(weight, np.exp([float(1 if m > 0 else -1) * alpha for m in miss]))
        # normalize weight
        weight = [float(w) / sum(weight) for w in weight]
    if not restart:
        # final prediction is the sign of the summed weighted votes over all rounds
        classification = np.sign(boosting_sum.sum(axis=0))
        classification = [1 if c > 0 else 0 for c in classification]  # convert -1s to 0s
        return classification
    else:
        return classify_boosting(train, test, n_rounds)

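# A small numeric illustration of the final aggregation in classify_boosting():
# each round stores alpha * (+1/-1) per test row; the sign of the column sum is
# the ensemble vote, and -1 is mapped back to class 0. The numbers are made up.
import numpy as np

boosting_sum = np.array([[0.4, -0.4],
                         [-0.2, -0.2],
                         [0.1, -0.1]])              # 3 rounds x 2 test rows
classification = np.sign(boosting_sum.sum(axis=0))  # -> [ 1. -1.]
print([1 if c > 0 else 0 for c in classification])  # -> [1, 0]
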
def get_predict(trees_result, trees_fiture, data_train):
    m_tree = len(trees_result)
    m = np.shape(data_train)[0]
    result = []
    for i in xrange(m_tree):
        clf = trees_result[i]
        feature = trees_fiture[i]
        data = split_data(data_train, feature)
        result_i = []
        # iterate over samples with j so the tree index i is not shadowed
        for j in xrange(m):
            result_i.append((predict(data[j][0:-1], clf).keys())[0])
        result.append(result_i)
    final_predict = np.sum(result, axis=0)
    return final_predict

def predict(self, data):
    """
    Predict the class of a single data vector.

    Data should be a 1x(m+1) numpy matrix where m is the number of
    features (recall that the first element of the vector is the label).

    I recommend implementing the specific algorithms in a separate module
    and then determining which method to call based on classifier_type.

    This method should return the predicted label.
    """
    if self.classifier_type == 'decision_tree':
        import decision_tree
        return decision_tree.predict(self.params, data)
    if self.classifier_type == 'naive_bayes':
        import naive_bayes
        return naive_bayes.predict(self.params, data)
    if self.classifier_type == 'neural_net':
        import neural_nets
        return neural_nets.predict(self.params, data)

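# Hedged usage sketch for the dispatcher above; the constructor signature and
# the training step are assumptions about the surrounding class, not shown here.
# clf = Classifier(classifier_type='decision_tree')  # hypothetical constructor
# clf.train(training_matrix)                         # hypothetical: sets clf.params
# label = clf.predict(test_vector)                   # dispatches on classifier_type
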
def train(itr_num, training, sample_num):
    BDT_list = list()
    BDT_alpha_list = list()
    N = len(training)
    # use a numpy array so the in-place update and normalization below work
    weights = np.array([1.0 / N for i in range(N)])
    y = [row[-1] for row in training]
    for i in range(itr_num):
        cur_training = resample(training, weights, sample_num)
        cur_tree = build_BDT(cur_training)
        y_head = DT.predict(training, cur_tree)
        errors = np.array([1 if a != b else 0 for a, b in zip(y_head, y)])
        epsilon = sum(errors * weights)
        # classifier weight: alpha = 1/2 * ln((1 - epsilon) / epsilon)
        alpha = np.log((1 - epsilon) * 1.0 / epsilon) / 2
        # C[i] = 1 if y_head == y, else C[i] = -1
        C = [-1 if error == 1 else 1 for error in errors]
        for j in range(N):
            weights[j] *= np.exp(-1 * alpha * C[j])
        weights /= sum(weights)
        BDT_list.append(cur_tree)
        BDT_alpha_list.append(alpha)
    return BDT_list, BDT_alpha_list

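# A hedged sketch (not part of the original) of how the trees and alphas
# returned by train() could be combined, assuming binary +1/-1 labels and the
# same DT.predict(dataset, tree) API used inside train() above.
def boosted_predict(dataset, BDT_list, BDT_alpha_list):
    scores = np.zeros(len(dataset))
    for tree, alpha in zip(BDT_list, BDT_alpha_list):
        # each tree votes +1/-1 per row, weighted by its alpha
        scores += alpha * np.array(DT.predict(dataset, tree))
    return np.sign(scores)
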
# # 3. Evaluate decision tree that uses information gain
# tree = DecisionTreeClassifier(max_depth=3)
# tree.fit(X, y)
# y_pred = tree.predict(X)
# error = np.mean(y_pred != y)
# print("Error: %.3f" % error)

for maxDepth in range(2, 15):
    print "******* MAX DEPTH =", maxDepth, "***********"

    # 2. Evaluate decision tree
    model = decision_tree.fit(X, y, maxDepth=maxDepth)
    # print model
    y_pred = decision_tree.predict(model, X)
    error = np.mean(y_pred != y)
    print("Error: %.3f" % error)

    # 3. Evaluate decision tree that uses information gain
    tree = DecisionTreeClassifier(max_depth=maxDepth + 1)
    tree.fit(X, y)
    y_pred = tree.predict(X)
    error = np.mean(y_pred != y)
    print("Error: %.3f" % error)

elif question == "3.1":
    dataset = utils.load_dataset("citiesSmall")

def predict(self, X):
    y = dt.predict(X, self.tree)
    return y

import pickle
import argparse
import pandas as pd
from decision_tree import predict, preprocess_dataframe

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--decision_tree', required=True)
    parser.add_argument('-t', '--test_data', required=True)
    parser.add_argument('-o', '--output', required=True)
    args = parser.parse_args()

    # load the pickled tree and the test set
    with open(args.decision_tree, 'rb') as t:
        tree = pickle.load(t)
    with open(args.test_data) as f:
        test_df = pd.read_csv(f, na_values=["?"])

    test_df = preprocess_dataframe(test_df, handle_continuous=False)
    predict(tree, test_df, 'winner')
    test_df.to_csv(args.output)

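# Hedged usage example; the script and file names below are hypothetical:
#   python predict_script.py -d tree.pkl -t test.csv -o predictions.csv
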
def map_fit(interface, state, label, inp):
    # grow state["trees_per_chunk"] trees on bootstrap samples of this chunk
    # and measure each tree's margins on a held-out set of medoid samples
    import numpy as np
    import decision_tree, measures, random
    from collections import Counter

    out = interface.output(0)
    margins, forest, medoids, medoids_y = [], [], [], []
    missing_vals_attr = set()
    num_test_samples = state["num_medoids"]
    num_samples = sum([1 for row in inp
                       if len(row.strip().split(state["delimiter"])) > 1])
    test_indices = set(random.sample([i for i in range(num_samples)],
                                     num_test_samples))

    for counter in range(state["trees_per_chunk"]):
        bag_indices = Counter(np.random.randint(num_samples, size=(num_samples)))
        _ = [bag_indices.pop(test_id) for test_id in test_indices
             if test_id in bag_indices]
        x, y, fill_in_values = [], [], []
        attr_mapping, y_mapping = {}, {}
        row_num = -1
        for row in inp:
            row_num += 1
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:
                if row_num in test_indices:
                    # held-out rows become the medoid (test) samples
                    if counter == 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                new_row.append(row[j])
                        medoids.append(new_row)
                        medoids_y.append(row[state["y_index"]])
                    else:
                        continue
                else:
                    # add the row once per bootstrap draw
                    while bag_indices[row_num] > 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if row[j] in state["missing_vals"]:
                                new_row.append(row[j])
                                missing_vals_attr.add(i)
                            elif state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                if row[j] not in attr_mapping:
                                    attr_mapping[row[j]] = len(attr_mapping)
                                new_row.append(attr_mapping[row[j]])
                        x.append(new_row)
                        if row[state["y_index"]] not in y_mapping:
                            y_mapping[row[state["y_index"]]] = len(y_mapping)
                        y.append(y_mapping[row[state["y_index"]]])
                        bag_indices[row_num] -= 1
        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}
        if len(y_mapping) == 1:
            print "Warning: Only one class in the subset!"
            return

        if len(missing_vals_attr) > 0:
            # impute missing values with the attribute average (continuous)
            # or the most frequent value (discrete)
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    value = np.average([sample[i] for sample in x
                                        if type(sample[i]) == float])
                    fill_in_values.append(value)
                else:
                    value = np.bincount([sample[i] for sample in x
                                         if type(sample[i]) == int]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value

        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)
        tree = decision_tree.fit(
            x=x,
            y=y,
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain if state["measure"] == "info_gain"
            else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"])
        print "Tree was built"
        if len(tree) < 2:
            print "Tree was removed"
            continue

        # map internal attribute/class ids back to the original values
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])]) \
                    if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)

        tree_margins = []
        for ti in range(num_test_samples):
            leaf, margin = decision_tree.predict(tree_mapped, medoids[ti],
                                                 medoids_y[ti])
            tree_margins.append(margin)
        margins.append(tree_margins)

    # split the medoids into their continuous and discrete parts
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([medoids[i][j] for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "c"])
        disc.append([medoids[i][j] for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]
    gower_range = np.array([np.ptp(x[:, i]) for i in range(len(state["X_meta"]))
                            if state["X_meta"][i] == "c"])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, margins, medoids, gower_range))
    out.add("fill_in_values", fill_in_values)

# part 3: plot classification boundaries for k=1

if question == '2.1':
    dataset = utils.load_dataset('vowel')
    X = dataset['X']
    y = dataset['y']
    Xtest = dataset['Xtest']
    ytest = dataset['ytest']

    # part 1: plot decision_tree as depth varies from 1 to 15
    train_errors = np.zeros(15)
    test_errors = np.zeros(15)
    for i in range(1, 16):
        model = decision_tree.fit(X, y, i)
        y_pred = decision_tree.predict(model, X)
        training_error = np.sum(y_pred != y) / float(X.shape[0])
        # print "Training error:", training_error, "at depth", i
        y_pred = decision_tree.predict(model, Xtest)
        test_error = np.sum(y_pred != ytest) / float(Xtest.shape[0])
        # print "Test error:", test_error, "at depth", i
        train_errors[i - 1] = training_error
        test_errors[i - 1] = test_error

    x_vals = np.arange(1, 16)
    plt.title("Tree depth vs. training and test error")
    plt.plot(x_vals, train_errors, label="Training error")
    plt.plot(x_vals, test_errors, label="Testing error")
    plt.xlabel("Depth")

def map_fit(interface, state, label, inp):
    # grow trees on bootstrap samples, group out-of-bag samples by the leaf
    # they reach to build a similarity matrix, then pick cluster medoids with
    # k-medoids
    import numpy as np
    from itertools import permutations
    import decision_tree, measures, k_medoids

    out = interface.output(0)
    x, y, margins, forest = [], [], [], []
    attr_mapping, y_mapping, similarity_mat = {}, {}, {}
    missing_vals_attr = set()

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            new_row = []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    new_row.append(row[j])
                    missing_vals_attr.add(i)
                elif state["X_meta"][i] == "c":
                    new_row.append(float(row[j]))
                else:
                    if row[j] not in attr_mapping:
                        attr_mapping[row[j]] = len(attr_mapping)
                    new_row.append(attr_mapping[row[j]])
            x.append(new_row)
            if row[state["y_index"]] not in y_mapping:
                y_mapping[row[state["y_index"]]] = len(y_mapping)
            y.append(y_mapping[row[state["y_index"]]])

    if len(y_mapping) == 1:
        print "Warning: Only one class in the subset!"
        return

    fill_in_values = []
    attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
    y_mapping = {v: k for k, v in y_mapping.iteritems()}
    if len(missing_vals_attr) > 0:
        # impute missing values with the attribute average (continuous) or
        # the most frequent value (discrete)
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                value = np.average([sample[i] for sample in x
                                    if type(sample[i]) == float])
                fill_in_values.append(value)
            else:
                value = np.bincount([sample[i] for sample in x
                                     if type(sample[i]) == int]).argmax()
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                for j in range(len(x)):
                    if x[j][i] in state["missing_vals"]:
                        x[j][i] = value

    x, y = np.array(x), np.array(y)
    iteration = 0
    while len(forest) < state["trees_per_chunk"]:
        if iteration == state["trees_per_chunk"] * 2:
            return
        bag_indices = np.random.randint(len(x), size=(len(x)))
        unique = set(bag_indices)
        out_of_bag_indices = [i for i in range(len(x)) if i not in unique][:500]
        iteration += 1
        if len(np.unique(y[bag_indices])) == 1:
            continue
        tree = decision_tree.fit(
            x=x[bag_indices],
            y=y[bag_indices],
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain if state["measure"] == "info_gain"
            else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"])
        if len(tree) < 2:
            continue

        # calculate margins on the out-of-bag samples and group them by leaf
        tree_margins, leafs_grouping = {}, {}
        for j in out_of_bag_indices:
            leaf, margin = decision_tree.predict(tree, x[j], y[j])
            tree_margins[j] = margin
            if leaf in leafs_grouping:
                leafs_grouping[leaf].append(j)
            else:
                leafs_grouping[leaf] = [j]
        margins.append(tree_margins)

        # samples that land in the same leaf become more similar
        for k, v in leafs_grouping.iteritems():
            for cx, cy in permutations(v, 2):
                if cx in similarity_mat:
                    similarity_mat[cx][cy] = similarity_mat[cx].get(cy, 0) - 1
                else:
                    similarity_mat[cx] = {cy: -1}

        # map internal attribute/class ids back to the original values
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])]) \
                    if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)

    # seed k-medoids with the most similar sample pairs
    min_elements = []
    for k, v in similarity_mat.iteritems():
        min_id = min(similarity_mat[k], key=similarity_mat[k].get)
        min_elements.append((similarity_mat[k][min_id], min_id))
    min_elements = sorted(min_elements)

    if state["k"] == "sqrt":
        k = int(np.sqrt(len(x[0]))) + 1
    elif state["k"] == "square":
        k = len(np.unique(y)) * len(np.unique(y))
    cidx = set()
    counter = 0
    while counter < len(min_elements) and len(cidx) < k:
        cidx.add(min_elements[counter][1])
        counter += 1

    inds, medoids_i = k_medoids.fit(similarity_mat, len(x), list(cidx))
    sample_ids = np.array(similarity_mat.keys())
    medoids_i = [sample_ids[i] for i in medoids_i]
    clusters = [sample_ids[np.where(inds == i)[0]] for i in np.unique(inds)]
    medoids = x[medoids_i].tolist()  # set medoids without sample identifier
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([medoids[i][j] for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "c"])
        disc.append([attr_mapping[int(medoids[i][j])]
                     for j in range(len(medoids[i]))
                     if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]

    stats = [[] for i in range(len(medoids_i))]
    for i in range(len(forest)):  # for every tree in the forest
        for num, cluster in enumerate(clusters):
            # calculate the average margin of the tree on each cluster
            values = [margins[i][sample_id] for sample_id in cluster
                      if int(sample_id) in margins[i]]
            if values != []:
                avg = np.average(values)
                forest[i]["margin" + str(num)] = avg
                stats[num].append(avg)
    stats = [np.median(value) for value in stats]

    gower_range = np.array([np.ptp(x[:, i]) for i in range(len(state["X_meta"]))
                            if state["X_meta"][i] == "c"])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, medoids, stats, gower_range))
    out.add("fill_in_values", fill_in_values)

def map_predict(interface, state, label, inp):
    import decision_tree
    import numpy as np
    out = interface.output(0)
    fill_in_values = state["fill_in_values"]
    coeff = state["coeff"]
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]
            x, cont, disc = [], [], []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    row[j] = fill_in_values[i]
                if state["X_meta"][i] == "c":
                    x.append(float(row[j]))
                    cont.append(float(row[j]))
                else:
                    x.append(row[j])
                    disc.append(row[j])
            cont, disc = np.array(cont), np.array(disc)
            # Gower similarity of the sample to every medoid group; j indexes
            # the medoids within group i
            similarities = []
            for i, medoids in enumerate(state["medoids"]):
                gower = 0 if len(cont) == 0 else np.sum(
                    1 - np.true_divide(np.abs(cont - medoids[0]),
                                       state["gower_ranges"][i]), axis=1)
                gower += 0 if len(disc) == 0 else np.sum(disc == medoids[1], axis=1)
                similarities += zip(np.round(1 - gower / float(len(x)), 4),
                                    [(i, j) for j in range(len(gower))])
            similarities = sorted(similarities)
            # keep all medoids whose dissimilarity is within coeff of the best
            threshold = similarities[0][0] * (1 + coeff)
            similar_medoids = [similarities[0][1]]
            pos = 1
            while pos < len(similarities) and similarities[pos][0] <= threshold:
                similar_medoids.append(similarities[pos][1])
                pos += 1
            global_predictions = {}
            for i, j in similar_medoids:
                predictions = {}
                margin = "margin" + str(j)
                # only count trees whose margin on this cluster is at least
                # the cluster's median margin
                for tree in state["forest"][i]:
                    if margin in tree and tree[margin] >= state["stats"][i][j]:
                        pred = decision_tree.predict(tree, x)
                        predictions[pred] = predictions.get(pred, []) + [tree[margin]]
                for k, v in predictions.iteritems():
                    predictions[k] = np.average(v) * len(v)
                max_pred = max(predictions, key=predictions.get)
                if max_pred not in global_predictions:
                    global_predictions[max_pred] = predictions[max_pred]
                elif predictions[max_pred] > global_predictions[max_pred]:
                    global_predictions[max_pred] = predictions[max_pred]
            out.add(x_id, (max(global_predictions, key=global_predictions.get), ))

def predict(forest, sample):
    # average the per-tree label distributions and return the most likely label
    predictions = [decision_tree.predict(tree, sample, dist=True) for tree in forest]
    y_dist = {k: v / float(len(forest)) for k, v in np.sum(predictions).iteritems()}
    return max(y_dist, key=y_dist.get)

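# A self-contained illustration of the aggregation in predict() above: np.sum
# reduces the per-tree distributions with "+", which works when each one is a
# collections.Counter. That decision_tree.predict(..., dist=True) returns a
# Counter-like mapping is an assumption; the distributions here are made up.
import numpy as np
from collections import Counter

dists = [Counter({"a": 2, "b": 1}), Counter({"a": 1}), Counter({"b": 3})]
total = np.sum(dists)                     # Counter({'b': 4, 'a': 3})
y_dist = {k: v / float(len(dists)) for k, v in total.items()}
print(max(y_dist, key=y_dist.get))        # -> 'b'
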
def score_model(model, features, labels, depth):
    # note: depth is accepted for bookkeeping but not used here
    labels_pred = predict(model, features)
    correct = 0.0
    for i, _ in enumerate(labels_pred):
        correct += int(labels_pred[i] == labels[i])
    print(correct / len(labels))