Beispiel #1
0
def remove_features(removal_order, train_file, test_file, attr_file,
                    max_features):
    """Track decision-tree accuracy while features are removed one by one.

    Columns are taken from ``removal_order`` and accumulated; after each
    addition the train and test data are re-read without the accumulated
    columns, a tree is learned on the training data and its accuracy on both
    sets is recorded.  The loop stops (without evaluating) as soon as
    ``max_features`` columns have been accumulated.

    Args:
        removal_order: iterable of column identifiers, in removal order.
        train_file: training data source passed to ``read_data``.
        test_file: test data source passed to ``read_data``.
        attr_file: attribute description passed to ``read_data``.
        max_features: number of removed columns at which to stop.

    Returns:
        Tuple ``(train_accs, test_accs)`` with one accuracy per evaluated
        step.
    """
    train_accs = []
    test_accs = []
    remove_columns = []
    for col in removal_order:
        print(col)
        remove_columns.append(col)
        # Stop before evaluating once the removal budget is reached.
        if len(remove_columns) == max_features:
            break
        print(remove_columns)

        # Use the function's own parameters; the original body referred to
        # the undefined names `train`, `test` and `attr`.
        train_data, train_attr = read_data(train_file,
                                           attr_file,
                                           remove_columns=remove_columns)
        test_data, test_attr = read_data(test_file,
                                         attr_file,
                                         remove_columns=remove_columns)
        tree = decision_tree.DecisionTreeLearning(train_data, train_attr,
                                                  "normal", "class")
        decision_tree.print_tree(tree)

        y_pred, y_true = decision_tree.predict(train_data, tree)
        train_acc = decision_tree.accuracy_score(y_pred, y_true)
        print('Accuracy on Training Data: {0}'.format(train_acc * 100))

        y_pred, y_true = decision_tree.predict(test_data, tree)
        test_acc = decision_tree.accuracy_score(y_pred, y_true)
        # This line reports the test accuracy; it was mislabelled "Training".
        print('Accuracy on Test Data: {0}'.format(test_acc * 100))

        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return train_accs, test_accs
def map_predict_voting(interface, state, label, inp):
    """Classify every input row by majority vote over ``state["forest"]``.

    Each row is stripped and split on ``state["delimiter"]``; rows that yield
    a single field are skipped.  Values listed in ``state["missing_vals"]``
    are replaced from ``state["fill_in_values"]`` and features marked ``"c"``
    in ``state["X_meta"]`` are converted to float.  Trees are consulted in
    order and voting stops as soon as one label holds more than half of the
    ensemble's votes.

    For every classified row ``(x_id, (label, votes))`` is added to
    ``interface.output(0)``, where ``x_id`` is ``""`` when
    ``state["id_index"]`` is -1.  ``label`` (the parameter) is unused.
    """
    import decision_tree

    out = interface.output(0)
    fill_in_values = state["fill_in_values"]
    forest = state["forest"]
    # A label wins early once it has more than this many votes.
    majority = int(len(forest) / 2.)

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) <= 1:
            continue

        x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

        x = []
        for pos, col in enumerate(state["X_indices"]):
            if row[col] in state["missing_vals"]:
                # Indexed by feature position, not by raw column: this
                # assumes fill_in_values has one entry per feature in
                # X_indices order, which is how map_fit in this file
                # builds it -- TODO confirm for this model.
                x.append(fill_in_values[pos])
            elif state["X_meta"][pos] == "c":
                x.append(float(row[col]))
            else:
                x.append(row[col])

        tallies = {}
        for tree in forest:
            pred = decision_tree.predict(tree, x)
            tallies[pred] = tallies.get(pred, 0) + 1
            # Only the label that was just incremented can newly cross the
            # threshold, so checking it alone is sufficient.
            if tallies[pred] > majority:
                break

        prediction = max(tallies, key=tallies.get)
        out.add(x_id, (prediction, tallies[prediction]))
def map_predict_voting(interface, state, label, inp):
    """Classify every input row by majority vote over ``state["forest"]``.

    Each row is stripped and split on ``state["delimiter"]``; rows that yield
    a single field are skipped.  Values listed in ``state["missing_vals"]``
    are replaced from ``state["fill_in_values"]`` and features marked ``"c"``
    in ``state["X_meta"]`` are converted to float.  Trees are consulted in
    order and voting stops as soon as one label holds more than half of the
    ensemble's votes.

    For every classified row ``(x_id, (label, votes))`` is added to
    ``interface.output(0)``, where ``x_id`` is ``""`` when
    ``state["id_index"]`` is -1.  ``label`` (the parameter) is unused.
    """
    import decision_tree

    out = interface.output(0)
    fill_in_values = state["fill_in_values"]
    forest = state["forest"]
    # A label wins early once it has more than this many votes.
    majority = int(len(forest) / 2.)

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) <= 1:
            continue

        x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

        x = []
        for pos, col in enumerate(state["X_indices"]):
            if row[col] in state["missing_vals"]:
                # Indexed by feature position, not by raw column: this
                # assumes fill_in_values has one entry per feature in
                # X_indices order, which is how map_fit in this file
                # builds it -- TODO confirm for this model.
                x.append(fill_in_values[pos])
            elif state["X_meta"][pos] == "c":
                x.append(float(row[col]))
            else:
                x.append(row[col])

        tallies = {}
        for tree in forest:
            pred = decision_tree.predict(tree, x)
            tallies[pred] = tallies.get(pred, 0) + 1
            # Only the label that was just incremented can newly cross the
            # threshold, so checking it alone is sufficient.
            if tallies[pred] > majority:
                break

        prediction = max(tallies, key=tallies.get)
        out.add(x_id, (prediction, tallies[prediction]))
Beispiel #4
0
def CalculatePoints(crop_points, crop_prop, currentCondition):
    """Add suitability points to every crop in ``crop_points`` and return it.

    For each crop the humidity, temperature, rainfall and location scores
    are added to its entry.  In the same iteration the crop returned by
    ``decision_tree.predict`` for the current conditions gets one extra
    point.

    Args:
        crop_points: dict mapping crop -> running score; updated in place.
        crop_prop: dict mapping crop -> properties indexed by the
            ``crop_properties`` constants.
        currentCondition: current readings indexed by the same constants.

    Returns:
        The same ``crop_points`` dict.
    """
    for crop in crop_points:
        properties = crop_prop[crop]

        humidity = HumidityInRange(properties[crop_properties.HUMIDITY],
                                   currentCondition[crop_properties.HUMIDITY])
        crop_points[crop] += humidity

        temperature = TemperatureInRange(
            properties[crop_properties.TEMPERATURE],
            currentCondition[crop_properties.TEMPERATURE])
        crop_points[crop] += temperature

        # Score rainfall against the RAINFALL entries; the original passed
        # the HUMIDITY entries here (copy-paste error).
        rainfall = RainfallInRange(properties[crop_properties.RAINFALL],
                                   currentCondition[crop_properties.RAINFALL])
        crop_points[crop] += rainfall

        locationDesnsityScore = LocationScore(crop)
        crop_points[crop] += locationDesnsityScore

        # NOTE(review): these inputs do not depend on `crop`, yet the vote is
        # cast once per crop -- confirm that this weighting is intended.
        dtParameters = [
            currentCondition[crop_properties.TEMPERATURE],
            currentCondition[crop_properties.HUMIDITY],
            currentCondition[crop_properties.RAINFALL]
        ]
        dtCrop = decision_tree.predict(dtParameters)
        print(dtCrop)
        crop_points[dtCrop] += 1

    return crop_points
Beispiel #5
0
def map_predict_voting(interface, state, label, inp):
    """Classify every input row by voting over ``state["forest"]``.

    Each row is stripped and split on ``state["delimiter"]``; rows that yield
    a single field are skipped.  Values listed in ``state["missing_vals"]``
    are replaced from ``state["fill_in_values"]`` and features marked ``"c"``
    in ``state["X_meta"]`` are converted to float.  Trees are consulted in
    order; once at least ``round(len(forest) / 2)`` trees have voted, the
    loop stops as soon as the leading label has exactly that many votes.

    For every classified row ``(x_id, (label, trees_consulted))`` is added to
    ``interface.output(0)``.  ``label`` (the parameter) is unused.
    """
    import decision_tree

    out = interface.output(0)
    half_ensemble = round(len(state["forest"]) / 2.)
    fill_in_values = state["fill_in_values"]

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

            # Distinct names (pos/col) keep this loop from clobbering the
            # tree counter `i` used below.
            x = []
            for pos, col in enumerate(state["X_indices"]):
                if row[col] in state["missing_vals"]:
                    # Indexed by feature position, not by raw column: this
                    # assumes fill_in_values has one entry per feature in
                    # X_indices order, which is how map_fit in this file
                    # builds it -- TODO confirm for this model.
                    x.append(fill_in_values[pos])
                elif state["X_meta"][pos] == "c":
                    x.append(float(row[col]))
                else:
                    x.append(row[col])

            predictions = {}
            for i, tree in enumerate(state["forest"]):
                pred = decision_tree.predict(tree, x)
                predictions[pred] = predictions.get(pred, 0) + 1

                # Only start checking once enough trees have voted for a
                # label to be able to reach half of the ensemble.
                if i >= half_ensemble - 1:
                    prediction = max(predictions, key=predictions.get)
                    value = predictions[prediction]
                    if value == half_ensemble:
                        break
            # i + 1 is the number of trees that were consulted.
            out.add(x_id, (prediction, i + 1))
Beispiel #6
0
def map_predict_dist(interface, state, label, inp):
    """Classify every input row from the averaged tree distributions.

    Each row is stripped and split on ``state["delimiter"]``; rows that yield
    a single field are skipped.  Values listed in ``state["missing_vals"]``
    are replaced from ``state["fill_in_values"]`` and features marked ``"c"``
    in ``state["X_meta"]`` are converted to float.  Every tree in
    ``state["forest"]`` is asked for a distribution (``dist=True``); the
    distributions are summed with ``np.sum`` and divided by the ensemble
    size, and the label with the largest share wins.

    For every classified row ``(x_id, (label, share))`` is added to
    ``interface.output(0)``.  ``label`` (the parameter) is unused.
    """
    import numpy as np
    import decision_tree

    out = interface.output(0)
    forest = state["forest"]
    ensemble_size = float(len(forest))
    fill_in_values = state["fill_in_values"]

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) <= 1:
            continue

        x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

        x = []
        for pos, col in enumerate(state["X_indices"]):
            if row[col] in state["missing_vals"]:
                # Indexed by feature position, not by raw column: this
                # assumes fill_in_values has one entry per feature in
                # X_indices order, which is how map_fit in this file
                # builds it -- TODO confirm for this model.
                x.append(fill_in_values[pos])
            elif state["X_meta"][pos] == "c":
                x.append(float(row[col]))
            else:
                x.append(row[col])

        pred_dist = [
            decision_tree.predict(tree, x, dist=True) for tree in forest
        ]
        # presumably each distribution is a Counter-like mapping that
        # supports `+`; .items() works on Python 2 and 3 alike.
        summed = np.sum(pred_dist)
        y_dist = {k: v / ensemble_size for k, v in summed.items()}
        prediction = max(y_dist, key=y_dist.get)
        out.add(x_id, (prediction, y_dist[prediction]))
def map_predict_voting(interface, state, label, inp):
    """Classify every input row by voting over ``state["forest"]``.

    Each row is stripped and split on ``state["delimiter"]``; rows that yield
    a single field are skipped.  Values listed in ``state["missing_vals"]``
    are replaced from ``state["fill_in_values"]`` and features marked ``"c"``
    in ``state["X_meta"]`` are converted to float.  Trees are consulted in
    order; once at least ``round(len(forest) / 2)`` trees have voted, the
    loop stops as soon as the leading label has exactly that many votes.

    For every classified row ``(x_id, (label, trees_consulted))`` is added to
    ``interface.output(0)``.  ``label`` (the parameter) is unused.
    """
    import decision_tree

    out = interface.output(0)
    half_ensemble = round(len(state["forest"]) / 2.)
    fill_in_values = state["fill_in_values"]

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

            # Distinct names (pos/col) keep this loop from clobbering the
            # tree counter `i` used below.
            x = []
            for pos, col in enumerate(state["X_indices"]):
                if row[col] in state["missing_vals"]:
                    # Indexed by feature position, not by raw column: this
                    # assumes fill_in_values has one entry per feature in
                    # X_indices order, which is how map_fit in this file
                    # builds it -- TODO confirm for this model.
                    x.append(fill_in_values[pos])
                elif state["X_meta"][pos] == "c":
                    x.append(float(row[col]))
                else:
                    x.append(row[col])

            predictions = {}
            for i, tree in enumerate(state["forest"]):
                pred = decision_tree.predict(tree, x)
                predictions[pred] = predictions.get(pred, 0) + 1

                # Only start checking once enough trees have voted for a
                # label to be able to reach half of the ensemble.
                if i >= half_ensemble - 1:
                    prediction = max(predictions, key=predictions.get)
                    value = predictions[prediction]
                    if value == half_ensemble:
                        break
            # i + 1 is the number of trees that were consulted.
            out.add(x_id, (prediction, i + 1))
def map_predict(interface, state, label, inp):
    """Classify every input row by counting votes from all trees in ``state["forest"]``.

    Each row is stripped and split on ``state["delimiter"]``; rows that yield
    a single field are skipped.  Missing values are replaced from
    ``state["fill_in_values"]`` (indexed by feature position), features
    marked ``"c"`` in ``state["X_meta"]`` are converted to float, and a
    similarity ranking against ``state["medoids"]`` is computed.

    For every classified row ``(x_id, (label, num_trees))`` is added to
    ``interface.output(0)``.  ``label`` (the parameter) is unused.
    """
    import decision_tree
    import numpy as np

    out = interface.output(0)
    fill_in_values = state["fill_in_values"]
    coeff = state["coeff"]

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

            # x holds all features; cont/disc hold the continuous and the
            # remaining features separately for the similarity computation.
            x, cont, disc = [], [], []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    row[j] = fill_in_values[i]

                if state["X_meta"][i] == "c":
                    x.append(float(row[j]))
                    cont.append(float(row[j]))
                else:
                    x.append(row[j])
                    disc.append(row[j])
            cont, disc = np.array(cont), np.array(disc)

            similarities = []
            for i, medoids in enumerate(state["medoids"]):
                # Continuous part: 1 - |difference| / range, summed per medoid.
                gower = 0 if len(cont) == 0 else np.sum(
                    1 - np.true_divide(np.abs(cont - medoids[0]), state["gower_ranges"][i]), axis=1)
                # Discrete part: number of exactly matching values per medoid.
                gower += 0 if len(disc) == 0 else np.sum(disc == medoids[1], axis=1)

            # NOTE(review): this statement sits outside the loop above, so only
            # the `gower` and `i` of the last medoid group contribute -- confirm
            # whether it was meant to run once per group.
            similarities += zip(np.round(1 - gower / float(len(x)), 4), [(i, j) for j in range(len(gower))])
            similarities = sorted(similarities)

            # Collect every medoid whose score is within (1 + coeff) of the
            # smallest one.
            threshold = similarities[0][0] * (1 + coeff)
            similar_medoids = [similarities[0][1]]

            pos = 1
            while pos < len(similarities) and similarities[pos][0] <= threshold:
                similar_medoids.append(similarities[pos][1])
                pos += 1

            # NOTE(review): similar_medoids is not used below; the vote always
            # runs over every tree of every forest.
            predictions = {}
            num_trees = 0

            # Always true here, because predictions was just created empty.
            if len(predictions) == 0:
                for forest in state["forest"]:
                    for tree in forest:
                        pred = decision_tree.predict(tree, x)
                        predictions[pred] = predictions.get(pred, []) + [1]
                        num_trees += 1

            # Every list contains only 1s, so average * length is the vote count.
            for k, v in predictions.iteritems():
                predictions[k] = np.average(v) * len(v)
            max_pred = max(predictions, key=predictions.get)
            out.add(x_id, (max_pred, num_trees))
def predict(testX, forest):
    """Return the binary majority vote of ``forest`` for ``testX``.

    Every tree is queried through ``decision_tree.predict(testX, tree)``.
    The result is 0 only when label 0 strictly outnumbers label 1; ties
    (including an empty forest) yield 1.
    """
    votes = [decision_tree.predict(testX, tree) for tree in forest]
    zeros = votes.count(0)
    ones = votes.count(1)
    return 0 if zeros > ones else 1
def classify_boosting(train, test, n_rounds):
    """Classify ``test`` with a boosted ensemble of decision trees.

    Each round draws a weighted sample of ``train`` via ``sample``, builds a
    tree on it, applies the tree to ``train`` and ``test`` row by row, and
    re-weights the training records from the misclassification mask.  If a
    round's weighted error exceeds 0.5 the whole procedure is restarted.

    Args:
        train: training records; inside the loop it must support
            ``.apply(func, axis=1)`` (presumably a pandas DataFrame).
        test: test records with the same interface.
        n_rounds: number of boosting rounds.

    Returns:
        List with one 0/1 label per test record.
    """
    # Uniform starting weights; 1.0 keeps the division float on Python 2.
    weight = np.full(len(train), 1.0 / len(train))
    # One row of alpha-weighted +/-1 test predictions per round.
    boosting_sum = np.zeros((n_rounds, len(test)))
    restart = False

    for i in range(n_rounds):
        # Sample the training set with replacement according to the weights.
        bootstrap_train_x, bootstrap_train_y, bootstrap_weight = sample(train, weight)
        dt = decision_tree.create_tree(bootstrap_train_x, 0, 3)
        result_train = train.apply(lambda r: decision_tree.predict(r, dt), axis=1)
        result_test = test.apply(lambda r: decision_tree.predict(r, dt), axis=1)

        # miss is True where the prediction and the label differ.
        miss = np.logical_xor(result_train.values, bootstrap_train_y)
        # error = sum(miss(i) * weight(i)) / sum(weight(i))
        error = np.sum(np.multiply(weight, miss)) / np.sum(weight)
        if error > 0.5:
            # Worse than chance: abandon this run and start over.
            restart = True
            break

        # Classifier weight: alpha = 1/2 * ln((1 - error) / error).
        alpha = 0.5 * np.log((1 - error) / error)
        boosting_sum[i, :] = np.multiply(
            [float(1 if r > 0 else -1) for r in result_test], alpha)

        # new weight = weight * e^(+alpha) when missed, e^(-alpha) otherwise.
        weight = np.multiply(
            weight, np.exp([float(1 if m > 0 else -1) * alpha for m in miss]))
        # Normalise with a single sum; the original recomputed sum(weight)
        # for every element.
        weight = weight / np.sum(weight)

    if restart:
        return classify_boosting(train, test, n_rounds)

    # Final prediction: sign of the summed weighted votes, with -1/0 -> 0.
    classification = np.sign(boosting_sum.sum(axis=0))
    return [1 if c > 0 else 0 for c in classification]
Beispiel #11
0
def get_predict(trees_result, trees_fiture, data_train):
    """Sum the per-tree predictions for every row of ``data_train``.

    For each tree the data is first reduced with
    ``split_data(data_train, feature)``.  Every row except its last element
    is then passed to ``predict`` and the first key of the returned mapping
    is taken as that tree's prediction.

    Args:
        trees_result: sequence of trained trees.
        trees_fiture: sequence of feature selections, parallel to
            ``trees_result``.
        data_train: 2-D data; its first dimension is the number of samples.

    Returns:
        Element-wise sum of the predictions over all trees
        (``np.sum(..., axis=0)``).
    """
    n_samples = np.shape(data_train)[0]

    result = []
    for tree_idx, clf in enumerate(trees_result):
        feature = trees_fiture[tree_idx]
        data = split_data(data_train, feature)
        # A separate name for the row index avoids reusing the tree index;
        # range() and list(...keys()) work on Python 2 and 3 alike.
        result_i = [
            list(predict(data[row_idx][0:-1], clf).keys())[0]
            for row_idx in range(n_samples)
        ]
        result.append(result_i)
    final_predict = np.sum(result, axis=0)
    return final_predict
Beispiel #12
0
    def predict(self, data):
        """
        Predict the class of a single data vector.

        Data should be a 1x(m+1) numpy matrix where m is the number of
        features (recall that the first element of the vector is the label).

        The work is delegated to the module that matches
        ``self.classifier_type`` ('decision_tree', 'naive_bayes' or
        'neural_net'); that module's ``predict`` is called with
        ``self.params`` and ``data``.

        Returns whatever the selected module's ``predict`` returns (expected
        to be the predicted label), or None when ``self.classifier_type`` is
        not one of the supported values.
        """
        # The original called the backends but dropped their results, so the
        # method always returned None.
        if self.classifier_type == 'decision_tree':
            import decision_tree
            return decision_tree.predict(self.params, data)

        if self.classifier_type == 'naive_bayes':
            import naive_bayes
            return naive_bayes.predict(self.params, data)

        if self.classifier_type == 'neural_net':
            import neural_nets
            return neural_nets.predict(self.params, data)

        # Unknown classifier type: keep the previous behaviour.
        return None
	def predict(self, data):
		"""
		Predict the class of a single data vector.

		Data should be a 1x(m+1) numpy matrix where m is the number of
		features (recall that the first element of the vector is the label).

		The work is delegated to the module that matches
		``self.classifier_type`` ('decision_tree', 'naive_bayes' or
		'neural_net'); that module's ``predict`` is called with
		``self.params`` and ``data``.

		Returns whatever the selected module's ``predict`` returns (expected
		to be the predicted label), or None when ``self.classifier_type`` is
		not one of the supported values.
		"""
		# The original called the backends but dropped their results, so the
		# method always returned None.
		if self.classifier_type == 'decision_tree':
			import decision_tree
			return decision_tree.predict(self.params, data)

		if self.classifier_type == 'naive_bayes':
			import naive_bayes
			return naive_bayes.predict(self.params, data)

		if self.classifier_type == 'neural_net':
			import neural_nets
			return neural_nets.predict(self.params, data)

		# Unknown classifier type: keep the previous behaviour.
		return None
def map_predict_dist(interface, state, label, inp):
    """Classify every input row from the averaged tree distributions.

    Each row is stripped and split on ``state["delimiter"]``; rows that yield
    a single field are skipped.  Values listed in ``state["missing_vals"]``
    are replaced from ``state["fill_in_values"]`` and features marked ``"c"``
    in ``state["X_meta"]`` are converted to float.  Every tree in
    ``state["forest"]`` is asked for a distribution (``dist=True``); the
    distributions are summed with ``np.sum`` and divided by the ensemble
    size, and the label with the largest share wins.

    For every classified row ``(x_id, (label, share))`` is added to
    ``interface.output(0)``.  ``label`` (the parameter) is unused.
    """
    import numpy as np
    import decision_tree

    out = interface.output(0)
    forest = state["forest"]
    ensemble_size = float(len(forest))
    fill_in_values = state["fill_in_values"]

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) <= 1:
            continue

        x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

        x = []
        for pos, col in enumerate(state["X_indices"]):
            if row[col] in state["missing_vals"]:
                # Indexed by feature position, not by raw column: this
                # assumes fill_in_values has one entry per feature in
                # X_indices order, which is how map_fit in this file
                # builds it -- TODO confirm for this model.
                x.append(fill_in_values[pos])
            elif state["X_meta"][pos] == "c":
                x.append(float(row[col]))
            else:
                x.append(row[col])

        pred_dist = [
            decision_tree.predict(tree, x, dist=True) for tree in forest
        ]
        # presumably each distribution is a Counter-like mapping that
        # supports `+`; .items() works on Python 2 and 3 alike.
        summed = np.sum(pred_dist)
        y_dist = {k: v / ensemble_size for k, v in summed.items()}
        prediction = max(y_dist, key=y_dist.get)
        out.add(x_id, (prediction, y_dist[prediction]))
def train(itr_num, training, sample_num):
    """Train a boosted ensemble of trees and return them with their weights.

    Each round resamples ``training`` according to the current record
    weights, builds a tree on the resample, predicts the full training set
    and re-weights the records so that misclassified ones gain weight.

    Args:
        itr_num: number of boosting rounds.
        training: sequence of rows whose last element is the label.
        sample_num: sample size passed to ``resample``.

    Returns:
        Tuple ``(BDT_list, BDT_alpha_list)`` with the trees and their alpha
        coefficients, one per round.
    """
    BDT_list = []
    BDT_alpha_list = []
    N = len(training)
    weights = [1.0 / N for _ in range(N)]
    y = [row[-1] for row in training]

    for _ in range(itr_num):
        cur_training = resample(training, weights, sample_num)
        cur_tree = build_BDT(cur_training)
        y_head = DT.predict(training, cur_tree)

        # errors[k] is 1 where the prediction differs from the label.
        errors = np.array([1 if a != b else 0 for a, b in zip(y_head, y)])
        epsilon = sum(errors * weights)
        alpha = np.log((1 - epsilon) * 1.0 / epsilon) / 2

        # C[k] = 1 when y_head == y, else -1; a miss scales its weight by
        # e^(+alpha), a hit by e^(-alpha).
        C = [-1 if error == 1 else 1 for error in errors]
        weights = [w * np.exp(-1 * alpha * c) for w, c in zip(weights, C)]

        # Normalise explicitly: `weights` is a plain list, so the original
        # in-place `weights /= sum(weights)` raised TypeError.
        total = sum(weights)
        weights = [w / total for w in weights]

        BDT_list.append(cur_tree)
        BDT_alpha_list.append(alpha)
    return BDT_list, BDT_alpha_list
        # # 3. Evaluate decision tree that uses information gain
        # tree = DecisionTreeClassifier(max_depth=3)
        # tree.fit(X, y)

        # y_pred = tree.predict(X)
        # error = np.mean(y_pred != y)

        # print("Error: %.3f" % error)
      
        for maxDepth in range(2,15):
            print "******* MAX DEPTH =", maxDepth, "***********"
            # 2. Evaluate decision tree 
            model = decision_tree.fit(X, y, maxDepth=maxDepth)
            # print model
            y_pred = decision_tree.predict(model, X)
            error = np.mean(y_pred != y)
            # print model
            print("Error: %.3f" % error)

            # 3. Evaluate decision tree that uses information gain
            tree = DecisionTreeClassifier(max_depth=maxDepth+1)
            tree.fit(X, y)

            y_pred = tree.predict(X)
            error = np.mean(y_pred != y)

            print("Error: %.3f" % error)

    elif question == "3.1":
        dataset = utils.load_dataset("citiesSmall")
 def predict(self, X):
     """Return the result of ``dt.predict`` for ``X`` using ``self.tree``."""
     return dt.predict(X, self.tree)
import pickle
import argparse
import pandas as pd
from decision_tree import predict, preprocess_dataframe


def main():
    """Load a pickled decision tree, run it on a CSV file and save the result.

    Command-line arguments:
        -d/--decision_tree: path to the pickled tree.
        -t/--test_data: path to the CSV file to predict on ("?" marks a
            missing value).
        -o/--output: path the resulting data frame is written to as CSV.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--decision_tree', required=True)
    parser.add_argument('-t', '--test_data', required=True)
    parser.add_argument('-o', '--output', required=True)

    args = parser.parse_args()

    # SECURITY: pickle.load can execute arbitrary code; only pass tree files
    # that come from a trusted source.
    with open(args.decision_tree, 'rb') as t:
        tree = pickle.load(t)

    with open(args.test_data) as f:
        test_df = pd.read_csv(f, na_values=["?"])
    test_df = preprocess_dataframe(test_df, handle_continuous=False)

    # The return value is not used; presumably predict() stores its output
    # in test_df (target column 'winner') -- TODO confirm.
    predict(tree, test_df, 'winner')
    test_df.to_csv(args.output)


# Keep all work behind the guard: previously only the argument parsing was
# guarded, so importing this module failed with NameError on `args`.
if __name__ == "__main__":
    main()
def map_fit(interface, state, label, inp):
    """Build randomized trees on bootstrap samples of ``inp`` and emit the model.

    ``state["num_medoids"]`` randomly chosen rows are held out as medoids.
    For each of ``state["trees_per_chunk"]`` rounds a bag is drawn with
    replacement from the row numbers, the held-out rows are dropped from it,
    a tree is fitted with ``decision_tree.fit`` and mapped back to the
    original attribute and class labels.

    Adds ``("model", (forest, margins, medoids, gower_range))`` and
    ``("fill_in_values", fill_in_values)`` to ``interface.output(0)``.
    Returns early without output when a bag contains a single class.
    ``label`` (the parameter) is unused.

    NOTE(review): ``inp`` is iterated once for counting and once more per
    tree; this assumes it can be re-iterated -- confirm against the caller.
    """
    import numpy as np
    import decision_tree, measures, random
    from collections import Counter

    out = interface.output(0)
    margins, forest, medoids, medoids_y = [], [], [], []
    missing_vals_attr = set()

    num_test_samples = state["num_medoids"]
    # Count the rows that split into more than one field.
    num_samples = sum([1 for row in inp if len(row.strip().split(state["delimiter"])) > 1])
    # Row numbers held out of every bag; these rows become the medoids.
    test_indices = set(random.sample([i for i in range(num_samples)], num_test_samples))

    for counter in range(state["trees_per_chunk"]):
        # Bootstrap: how many times each row number was drawn.
        bag_indices = Counter(np.random.randint(num_samples, size=(num_samples)))
        # Remove the held-out rows from the bag.
        _ = [bag_indices.pop(test_id) for test_id in test_indices if test_id in bag_indices]

        x, y, fill_in_values = [], [], []
        attr_mapping, y_mapping = {}, {}

        # NOTE(review): row_num advances for every row, including rows with a
        # single field, while num_samples counts only multi-field rows.
        row_num = -1
        for row in inp:
            row_num += 1
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:
                if row_num in test_indices:
                    # Held-out rows are collected only during the first round.
                    if counter == 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                new_row.append(row[j])
                        medoids.append(new_row)
                        medoids_y.append(row[state["y_index"]])
                    else:
                        continue
                else:
                    # Add the row once for every time it was drawn into the bag.
                    while bag_indices[row_num] > 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if row[j] in state["missing_vals"]:
                                # Keep the marker for now; it is filled in below.
                                new_row.append(row[j])
                                missing_vals_attr.add(i)
                            elif state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                # Encode discrete values as consecutive ints.
                                if row[j] not in attr_mapping:
                                    attr_mapping[row[j]] = len(attr_mapping)
                                new_row.append(attr_mapping[row[j]])
                        x.append(new_row)
                        if row[state["y_index"]] not in y_mapping:
                            y_mapping[row[state["y_index"]]] = len(y_mapping)
                        y.append(y_mapping[row[state["y_index"]]])
                        bag_indices[row_num] -= 1

        # Invert both mappings: encoded int -> original value.
        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}

        if len(y_mapping) == 1:
            print "Warning: Only one class in the subset!"
            return

        if len(missing_vals_attr) > 0:
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    # Continuous: mean of the float values in this column.
                    value = np.average([sample[i] for sample in x if type(sample[i]) == float])
                    fill_in_values.append(value)
                else:
                    # Discrete: most frequent encoded value, stored decoded.
                    value = np.bincount([sample[i] for sample in x if type(sample[i]) == int]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value

        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)

        tree = decision_tree.fit(
            x=x,
            y=y,
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain if state["measure"] == "info_gain" else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"])
        print "Tree was built"
        # Trees with fewer than two entries are discarded.
        if len(tree) < 2:
            print "tree was removed"
            continue


        # Translate encoded class labels and discrete split values back to
        # their original values.
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq) for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map, node[4], node[5])
        forest.append(tree_mapped)

        tree_margins = []
        for ti in range(num_test_samples):
            leaf, margin = decision_tree.predict(tree_mapped, medoids[ti], medoids_y[ti])
        # NOTE(review): this append is outside the loop above, so only the
        # margin of the last medoid is recorded -- confirm the intent.
        tree_margins.append(margin)
        margins.append(tree_margins)
    print "tree was build"

    # Split every medoid into its continuous and discrete features.
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([medoids[i][j] for j in range(len(medoids[i])) if state["X_meta"][j] == "c"])
        disc.append([medoids[i][j] for j in range(len(medoids[i])) if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]

    # Value range of each continuous column, taken from the x of the last
    # round; zero ranges are replaced by a small constant.
    gower_range = np.array([np.ptp(x[:, i]) for i in range(len(state["X_meta"])) if state["X_meta"][i] == "c"])
    gower_range[gower_range == 0] = 1e-9

    out.add("model", (forest, margins, medoids, gower_range))
    out.add("fill_in_values", fill_in_values)
Beispiel #20
0
        # part 3: plot classification boundaries for k=1

    if question == '2.1':
        dataset = utils.load_dataset('vowel')
        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']

        # # part 1: plot decision_tree as depth varies from 1 to 15
        train_errors = np.zeros(15)
        test_errors = np.zeros(15)

        for i in range(1, 16):
            model = decision_tree.fit(X, y, i)
            y_pred = decision_tree.predict(model, X)
            training_error = np.sum(y_pred != y) / float(X.shape[0])

            # print "Training error:", training_error, "at depth", i
            y_pred = decision_tree.predict(model, Xtest)
            test_error = np.sum(y_pred != ytest) / float(Xtest.shape[0])
            # print "Test error:", test_error, "at depth", i

            train_errors[i - 1] = training_error
            test_errors[i - 1] = test_error
        x_vals = np.arange(1, 16)

        plt.title("Tree depth vs. training and test error")
        plt.plot(x_vals, train_errors, label="Training error")
        plt.plot(x_vals, test_errors, label="Testing error")
        plt.xlabel("Depth")
def map_fit(interface, state, label, inp):
    """Build randomized trees on bootstrap samples of ``inp`` and emit the model.

    ``state["num_medoids"]`` randomly chosen rows are held out as medoids.
    For each of ``state["trees_per_chunk"]`` rounds a bag is drawn with
    replacement from the row numbers, the held-out rows are dropped from it,
    a tree is fitted with ``decision_tree.fit`` and mapped back to the
    original attribute and class labels.

    Adds ``("model", (forest, margins, medoids, gower_range))`` and
    ``("fill_in_values", fill_in_values)`` to ``interface.output(0)``.
    Returns early without output when a bag contains a single class.
    ``label`` (the parameter) is unused.

    NOTE(review): ``inp`` is iterated once for counting and once more per
    tree; this assumes it can be re-iterated -- confirm against the caller.
    """
    import numpy as np
    import decision_tree, measures, random
    from collections import Counter

    out = interface.output(0)
    margins, forest, medoids, medoids_y = [], [], [], []
    missing_vals_attr = set()

    num_test_samples = state["num_medoids"]
    # Count the rows that split into more than one field.
    num_samples = sum(
        [1 for row in inp if len(row.strip().split(state["delimiter"])) > 1])
    # Row numbers held out of every bag; these rows become the medoids.
    test_indices = set(
        random.sample([i for i in range(num_samples)], num_test_samples))

    for counter in range(state["trees_per_chunk"]):
        # Bootstrap: how many times each row number was drawn.
        bag_indices = Counter(
            np.random.randint(num_samples, size=(num_samples)))
        # Remove the held-out rows from the bag.
        _ = [
            bag_indices.pop(test_id) for test_id in test_indices
            if test_id in bag_indices
        ]

        x, y, fill_in_values = [], [], []
        attr_mapping, y_mapping = {}, {}

        # NOTE(review): row_num advances for every row, including rows with a
        # single field, while num_samples counts only multi-field rows.
        row_num = -1
        for row in inp:
            row_num += 1
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:
                if row_num in test_indices:
                    # Held-out rows are collected only during the first round.
                    if counter == 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                new_row.append(row[j])
                        medoids.append(new_row)
                        medoids_y.append(row[state["y_index"]])
                    else:
                        continue
                else:
                    # Add the row once for every time it was drawn into the bag.
                    while bag_indices[row_num] > 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if row[j] in state["missing_vals"]:
                                # Keep the marker for now; it is filled in below.
                                new_row.append(row[j])
                                missing_vals_attr.add(i)
                            elif state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                # Encode discrete values as consecutive ints.
                                if row[j] not in attr_mapping:
                                    attr_mapping[row[j]] = len(attr_mapping)
                                new_row.append(attr_mapping[row[j]])
                        x.append(new_row)
                        if row[state["y_index"]] not in y_mapping:
                            y_mapping[row[state["y_index"]]] = len(y_mapping)
                        y.append(y_mapping[row[state["y_index"]]])
                        bag_indices[row_num] -= 1

        # Invert both mappings: encoded int -> original value.
        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}

        if len(y_mapping) == 1:
            print "Warning: Only one class in the subset!"
            return

        if len(missing_vals_attr) > 0:
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    # Continuous: mean of the float values in this column.
                    value = np.average([
                        sample[i] for sample in x if type(sample[i]) == float
                    ])
                    fill_in_values.append(value)
                else:
                    # Discrete: most frequent encoded value, stored decoded.
                    value = np.bincount([
                        sample[i] for sample in x if type(sample[i]) == int
                    ]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value

        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)

        tree = decision_tree.fit(x=x,
                                 y=y,
                                 t=state["X_meta"],
                                 randomized=True,
                                 max_tree_nodes=state["max_tree_nodes"],
                                 min_samples_leaf=state["min_samples_leaf"],
                                 min_samples_split=state["min_samples_split"],
                                 class_majority=state["class_majority"],
                                 measure=measures.info_gain if state["measure"]
                                 == "info_gain" else measures.mdl,
                                 accuracy=state["accuracy"],
                                 separate_max=state["separate_max"])
        print "Tree was built"
        # Trees with fewer than two entries are discarded.
        if len(tree) < 2:
            print "tree was removed"
            continue

        # Translate encoded class labels and discrete split values back to
        # their original values.
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])
                                 ]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)

        tree_margins = []
        for ti in range(num_test_samples):
            leaf, margin = decision_tree.predict(tree_mapped, medoids[ti],
                                                 medoids_y[ti])
        # NOTE(review): this append is outside the loop above, so only the
        # margin of the last medoid is recorded -- confirm the intent.
        tree_margins.append(margin)
        margins.append(tree_margins)
    print "tree was build"

    # Split every medoid into its continuous and discrete features.
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "c"
        ])
        disc.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "d"
        ])
    medoids = [np.array(cont), np.array(disc)]

    # Value range of each continuous column, taken from the x of the last
    # round; zero ranges are replaced by a small constant.
    gower_range = np.array([
        np.ptp(x[:, i]) for i in range(len(state["X_meta"]))
        if state["X_meta"][i] == "c"
    ])
    gower_range[gower_range == 0] = 1e-9

    out.add("model", (forest, margins, medoids, gower_range))
    out.add("fill_in_values", fill_in_values)
Beispiel #22
0
def map_fit(interface, state, label, inp):
    """Train this chunk's trees and the medoids used to weight them.

    Rows of ``inp`` are split on ``state["delimiter"]``; rows that yield a
    single field are ignored. Features marked ``"c"`` in ``state["X_meta"]``
    are cast to float, every other feature is replaced by an integer code,
    and values listed in ``state["missing_vals"]`` are imputed with the column
    mean (``"c"``) or the most frequent code (otherwise).

    Trees are fitted on bootstrap samples until ``state["trees_per_chunk"]``
    usable trees exist. Each tree is evaluated on up to 500 out-of-bag
    samples: their margins are recorded, and every ordered pair of samples
    that ends in the same leaf has its entry in a similarity table decreased
    by one (more negative means "together more often"). The table is clustered
    with ``k_medoids.fit``; every tree then receives the average margin it
    achieved on each cluster under the key ``"margin<cluster number>"``.

    Args:
        interface: object whose ``output(0)`` returns a sink with
            ``add(key, value)``.
        state: dict of job parameters. Keys read here: ``delimiter``,
            ``X_indices``, ``X_meta``, ``y_index``, ``missing_vals``,
            ``trees_per_chunk``, ``max_tree_nodes``, ``min_samples_leaf``,
            ``min_samples_split``, ``class_majority``, ``measure``,
            ``accuracy``, ``separate_max`` and ``k``.
        label: unused.
        inp: iterable of text rows.

    Returns:
        None. On success two records are emitted: ``"model"`` with the tuple
        ``(forest, medoids, stats, gower_range)`` and ``"fill_in_values"``.
        Nothing is emitted when the chunk holds a single class or when the
        forest could not be completed within ``2 * trees_per_chunk`` attempts.

    Raises:
        ValueError: if ``state["k"]`` is neither ``"sqrt"`` nor ``"square"``.
    """
    import numpy as np
    from itertools import permutations
    import decision_tree, measures, k_medoids

    # Validate the medoid-count rule before doing any work. Without this
    # check an unrecognised value fell through both branches further down and
    # the medoid count silently became whatever an earlier
    # ``for k, v in ...`` loop had last bound to ``k``.
    if state["k"] not in ("sqrt", "square"):
        raise ValueError('state["k"] must be "sqrt" or "square", got %r' %
                         (state["k"], ))

    out = interface.output(0)
    x, y, margins, forest = [], [], [], []
    attr_mapping, y_mapping, similarity_mat = {}, {}, {}
    missing_vals_attr = set()

    # ---- parse the chunk -------------------------------------------------
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) <= 1:
            continue

        new_row = []
        for i, j in enumerate(state["X_indices"]):
            raw = row[j]
            if raw in state["missing_vals"]:
                # Keep the raw marker for now; it is replaced during imputation.
                new_row.append(raw)
                missing_vals_attr.add(i)
            elif state["X_meta"][i] == "c":
                new_row.append(float(raw))
            else:
                if raw not in attr_mapping:
                    attr_mapping[raw] = len(attr_mapping)
                new_row.append(attr_mapping[raw])
        x.append(new_row)

        target = row[state["y_index"]]
        if target not in y_mapping:
            y_mapping[target] = len(y_mapping)
        y.append(y_mapping[target])

    if len(y_mapping) == 1:
        print("Warning: Only one class in the subset!")
        return

    # From here on both mappings translate integer codes back to raw values.
    attr_mapping = {code: raw for raw, code in attr_mapping.items()}
    y_mapping = {code: raw for raw, code in y_mapping.items()}

    # ---- impute missing values -------------------------------------------
    # Fill-in values are only computed when this chunk actually contains a
    # missing value; otherwise the emitted list stays empty.
    fill_in_values = []
    if missing_vals_attr:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                value = np.average(
                    [sample[i] for sample in x if isinstance(sample[i], float)])
                fill_in_values.append(value)
            else:
                value = np.bincount(
                    [sample[i] for sample in x if isinstance(sample[i], int)]
                ).argmax()
                # The model stores raw attribute values, the matrix stores codes.
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                for sample in x:
                    if sample[i] in state["missing_vals"]:
                        sample[i] = value
    x, y = np.array(x), np.array(y)

    # ---- grow the forest -------------------------------------------------
    n_samples = len(x)
    max_attempts = state["trees_per_chunk"] * 2
    if state["measure"] == "info_gain":
        measure = measures.info_gain
    else:
        measure = measures.mdl

    attempts = 0
    while len(forest) < state["trees_per_chunk"]:
        if attempts == max_attempts:
            # Too many degenerate bags/trees: give up without emitting a model.
            return
        bag_indices = np.random.randint(n_samples, size=n_samples)
        in_bag = set(bag_indices)
        out_of_bag_indices = [
            i for i in range(n_samples) if i not in in_bag
        ][:500]
        attempts += 1

        if len(np.unique(y[bag_indices])) == 1:
            continue

        tree = decision_tree.fit(x=x[bag_indices],
                                 y=y[bag_indices],
                                 t=state["X_meta"],
                                 randomized=True,
                                 max_tree_nodes=state["max_tree_nodes"],
                                 min_samples_leaf=state["min_samples_leaf"],
                                 min_samples_split=state["min_samples_split"],
                                 class_majority=state["class_majority"],
                                 measure=measure,
                                 accuracy=state["accuracy"],
                                 separate_max=state["separate_max"])

        if len(tree) < 2:
            continue

        # Margins of the out-of-bag samples, grouped by the leaf they reach.
        tree_margins, leafs_grouping = {}, {}
        for j in out_of_bag_indices:
            leaf, margin = decision_tree.predict(tree, x[j], y[j])
            tree_margins[j] = margin
            leafs_grouping.setdefault(leaf, []).append(j)
        margins.append(tree_margins)

        # Every ordered pair sharing a leaf moves one step closer.
        for members in leafs_grouping.values():
            for cx, cy in permutations(members, 2):
                neighbours = similarity_mat.setdefault(cx, {})
                neighbours[cy] = neighbours.get(cy, 0) - 1

        # Translate class and attribute codes back to their raw values so the
        # stored tree can be applied to unparsed rows.
        tree_mapped = {}
        for parent, children in tree.items():
            mapped_children = [None, None]
            for pos, node in enumerate(children):
                dist_map = {
                    y_mapping[cls]: freq
                    for cls, freq in node[3].items()
                }
                if node[5] == "d":
                    split_map = {attr_mapping[int(s)] for s in node[2]}
                else:
                    split_map = node[2]
                mapped_children[pos] = (node[0], node[1], split_map, dist_map,
                                        node[4], node[5])
            tree_mapped[parent] = mapped_children
        forest.append(tree_mapped)

    # ---- choose initial medoids ------------------------------------------
    # For every sample take its closest neighbour; the neighbours with the
    # strongest (most negative) ties are tried first as seeds.
    min_elements = []
    for neighbours in similarity_mat.values():
        min_id = min(neighbours, key=neighbours.get)
        min_elements.append((neighbours[min_id], min_id))
    min_elements.sort()

    if state["k"] == "sqrt":
        n_medoids = int(np.sqrt(len(x[0]))) + 1
    else:  # "square", guaranteed by the validation at the top
        n_classes = len(np.unique(y))
        n_medoids = n_classes * n_classes

    cidx = set()
    for _, candidate in min_elements:
        if len(cidx) >= n_medoids:
            break
        cidx.add(candidate)

    # ---- cluster and summarise -------------------------------------------
    # ``inds`` and ``medoids_i`` are used below as positions into the key list
    # of ``similarity_mat``.
    inds, medoids_i = k_medoids.fit(similarity_mat, len(x), list(cidx))
    sample_ids = np.array(list(similarity_mat.keys()))
    medoids_i = [sample_ids[m] for m in medoids_i]

    clusters = [sample_ids[np.where(inds == c)[0]] for c in np.unique(inds)]
    medoids = x[medoids_i].tolist()  # set medoids without sample identifier

    # Split each medoid into its continuous part and its (decoded) discrete part.
    cont, disc = [], []
    for medoid in medoids:
        cont.append([
            val for j, val in enumerate(medoid) if state["X_meta"][j] == "c"
        ])
        disc.append([
            attr_mapping[int(val)] for j, val in enumerate(medoid)
            if state["X_meta"][j] == "d"
        ])
    medoids = [np.array(cont), np.array(disc)]

    # One margins dict was appended per accepted tree, so the lists line up.
    stats = [[] for _ in medoids_i]
    for tree_mapped, tree_margins in zip(forest, margins):
        for num, cluster in enumerate(clusters):
            # Average margin of this tree over the cluster members it has seen.
            values = [
                tree_margins[sample_id] for sample_id in cluster
                if int(sample_id) in tree_margins
            ]
            if values:
                avg = np.average(values)
                tree_mapped["margin" + str(num)] = avg
                stats[num].append(avg)

    stats = [np.median(value) for value in stats]
    gower_range = np.array([
        np.ptp(x[:, i]) for i, kind in enumerate(state["X_meta"])
        if kind == "c"
    ])
    # A constant column would otherwise lead to a division by zero later on.
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, medoids, stats, gower_range))
    out.add("fill_in_values", fill_in_values)
Beispiel #23
0
def map_predict(interface, state, label, inp):
    """Predict a class for every input row with the medoid-weighted forests.

    Each row of ``inp`` is split on ``state["delimiter"]`` (rows yielding a
    single field are ignored) and missing values are replaced using
    ``state["fill_in_values"]``. The sample is compared with every medoid of
    every model: continuous features contribute ``1 - |diff| / range``,
    discrete features contribute 1 on equality, and the sum is turned into a
    distance ``1 - sum / n_features`` rounded to four decimals. All medoids
    whose distance is within ``(1 + state["coeff"])`` times the smallest
    distance are kept.

    For each kept medoid only the trees whose stored margin for that medoid
    is at least ``state["stats"][model][medoid]`` vote; a class scores the sum
    of the margins of the trees voting for it. The best class of each medoid
    competes with the others and the overall best one is emitted.

    Args:
        interface: object whose ``output(0)`` returns a sink with
            ``add(key, value)``.
        state: dict of job parameters. Keys read here: ``fill_in_values``,
            ``coeff``, ``delimiter``, ``id_index``, ``X_indices``, ``X_meta``,
            ``missing_vals``, ``medoids``, ``gower_ranges``, ``forest`` and
            ``stats``.
        label: unused.
        inp: iterable of text rows.

    Returns:
        None. One record ``(x_id, (prediction,))`` is emitted per parsed row;
        ``x_id`` is ``""`` when ``state["id_index"]`` is -1.

    Raises:
        ValueError: if none of the selected medoids has a qualifying tree.
    """
    import decision_tree
    import numpy as np

    out = interface.output(0)
    fill_in_values = state["fill_in_values"]
    coeff = state["coeff"]

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) <= 1:
            continue
        x_id = "" if state["id_index"] == -1 else row[state["id_index"]]

        x, cont, disc = [], [], []
        for i, j in enumerate(state["X_indices"]):
            if row[j] in state["missing_vals"]:
                row[j] = fill_in_values[i]

            if state["X_meta"][i] == "c":
                number = float(row[j])
                x.append(number)
                cont.append(number)
            else:
                x.append(row[j])
                disc.append(row[j])
        cont, disc = np.array(cont), np.array(disc)

        similarities = []
        for i, medoids in enumerate(state["medoids"]):
            gower = 0 if len(cont) == 0 else np.sum(
                1 - np.true_divide(np.abs(cont - medoids[0]),
                                   state["gower_ranges"][i]),
                axis=1)
            gower += 0 if len(disc) == 0 else np.sum(disc == medoids[1],
                                                     axis=1)
            distances = np.round(1 - gower / float(len(x)), 4)
            # Index the medoids by the distance vector itself. The medoid
            # indices used to be generated with range(len(x)), i.e. the
            # number of features, so zip() silently dropped every medoid
            # beyond that count.
            similarities += [(distance, (i, j))
                             for j, distance in enumerate(distances)]

        similarities.sort()
        threshold = similarities[0][0] * (1 + coeff)
        similar_medoids = [similarities[0][1]]
        pos = 1
        while pos < len(
                similarities) and similarities[pos][0] <= threshold:
            similar_medoids.append(similarities[pos][1])
            pos += 1

        global_predictions = {}
        for i, j in similar_medoids:
            margin = "margin" + str(j)
            predictions = {}
            for tree in state["forest"][i]:
                if margin in tree and tree[margin] >= state["stats"][i][j]:
                    pred = decision_tree.predict(tree, x)
                    predictions.setdefault(pred, []).append(tree[margin])

            if not predictions:
                # No tree qualifies for this medoid (for example none of them
                # carries a margin for it); max() below would raise on the
                # empty dict, so let the remaining medoids decide.
                continue

            # average * count, i.e. the total margin behind each class
            for pred in list(predictions):
                tree_margins = predictions[pred]
                predictions[pred] = np.average(tree_margins) * len(
                    tree_margins)

            max_pred = max(predictions, key=predictions.get)
            if (max_pred not in global_predictions
                    or predictions[max_pred] > global_predictions[max_pred]):
                global_predictions[max_pred] = predictions[max_pred]

        out.add(x_id,
                (max(global_predictions, key=global_predictions.get), ))
def predict(forest, sample):
    """Return the class with the largest averaged distribution over ``forest``.

    Every tree is asked for a class distribution for ``sample``
    (``decision_tree.predict(..., dist=True)``); the distributions are added
    up with ``np.sum`` and each entry is divided by the number of trees.
    """
    tree_dists = []
    for tree in forest:
        tree_dists.append(decision_tree.predict(tree, sample, dist=True))

    n_trees = float(len(forest))
    y_dist = {}
    for cls, total in np.sum(tree_dists).iteritems():
        y_dist[cls] = total / n_trees

    return max(y_dist, key=y_dist.get)
def map_fit(interface, state, label, inp):
    """Train this chunk's trees and the medoids used to weight them.

    Rows of ``inp`` are split on ``state["delimiter"]``; rows that yield a
    single field are ignored. Features marked ``"c"`` in ``state["X_meta"]``
    are cast to float, every other feature is replaced by an integer code,
    and values listed in ``state["missing_vals"]`` are imputed with the column
    mean (``"c"``) or the most frequent code (otherwise).

    Trees are fitted on bootstrap samples until ``state["trees_per_chunk"]``
    usable trees exist. Each tree is evaluated on up to 500 out-of-bag
    samples: their margins are recorded, and every ordered pair of samples
    that ends in the same leaf has its entry in a similarity table decreased
    by one (more negative means "together more often"). The table is clustered
    with ``k_medoids.fit``; every tree then receives the average margin it
    achieved on each cluster under the key ``"margin<cluster number>"``.

    Args:
        interface: object whose ``output(0)`` returns a sink with
            ``add(key, value)``.
        state: dict of job parameters. Keys read here: ``delimiter``,
            ``X_indices``, ``X_meta``, ``y_index``, ``missing_vals``,
            ``trees_per_chunk``, ``max_tree_nodes``, ``min_samples_leaf``,
            ``min_samples_split``, ``class_majority``, ``measure``,
            ``accuracy``, ``separate_max`` and ``k``.
        label: unused.
        inp: iterable of text rows.

    Returns:
        None. On success two records are emitted: ``"model"`` with the tuple
        ``(forest, medoids, stats, gower_range)`` and ``"fill_in_values"``.
        Nothing is emitted when the chunk holds a single class or when the
        forest could not be completed within ``2 * trees_per_chunk`` attempts.

    Raises:
        ValueError: if ``state["k"]`` is neither ``"sqrt"`` nor ``"square"``.
    """
    import numpy as np
    from itertools import permutations
    import decision_tree, measures, k_medoids

    # Validate the medoid-count rule before doing any work. Without this
    # check an unrecognised value fell through both branches further down and
    # the medoid count silently became whatever an earlier
    # ``for k, v in ...`` loop had last bound to ``k``.
    if state["k"] not in ("sqrt", "square"):
        raise ValueError('state["k"] must be "sqrt" or "square", got %r' % (state["k"],))

    out = interface.output(0)
    x, y, margins, forest = [], [], [], []
    attr_mapping, y_mapping, similarity_mat = {}, {}, {}
    missing_vals_attr = set()

    # ---- parse the chunk -------------------------------------------------
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) <= 1:
            continue

        new_row = []
        for i, j in enumerate(state["X_indices"]):
            raw = row[j]
            if raw in state["missing_vals"]:
                # Keep the raw marker for now; it is replaced during imputation.
                new_row.append(raw)
                missing_vals_attr.add(i)
            elif state["X_meta"][i] == "c":
                new_row.append(float(raw))
            else:
                if raw not in attr_mapping:
                    attr_mapping[raw] = len(attr_mapping)
                new_row.append(attr_mapping[raw])
        x.append(new_row)

        target = row[state["y_index"]]
        if target not in y_mapping:
            y_mapping[target] = len(y_mapping)
        y.append(y_mapping[target])

    if len(y_mapping) == 1:
        print("Warning: Only one class in the subset!")
        return

    # From here on both mappings translate integer codes back to raw values.
    attr_mapping = {code: raw for raw, code in attr_mapping.items()}
    y_mapping = {code: raw for raw, code in y_mapping.items()}

    # ---- impute missing values -------------------------------------------
    # Fill-in values are only computed when this chunk actually contains a
    # missing value; otherwise the emitted list stays empty.
    fill_in_values = []
    if missing_vals_attr:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                value = np.average([sample[i] for sample in x if isinstance(sample[i], float)])
                fill_in_values.append(value)
            else:
                value = np.bincount([sample[i] for sample in x if isinstance(sample[i], int)]).argmax()
                # The model stores raw attribute values, the matrix stores codes.
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                for sample in x:
                    if sample[i] in state["missing_vals"]:
                        sample[i] = value
    x, y = np.array(x), np.array(y)

    # ---- grow the forest -------------------------------------------------
    n_samples = len(x)
    max_attempts = state["trees_per_chunk"] * 2
    if state["measure"] == "info_gain":
        measure = measures.info_gain
    else:
        measure = measures.mdl

    attempts = 0
    while len(forest) < state["trees_per_chunk"]:
        if attempts == max_attempts:
            # Too many degenerate bags/trees: give up without emitting a model.
            return
        bag_indices = np.random.randint(n_samples, size=n_samples)
        in_bag = set(bag_indices)
        out_of_bag_indices = [i for i in range(n_samples) if i not in in_bag][:500]
        attempts += 1

        if len(np.unique(y[bag_indices])) == 1:
            continue

        tree = decision_tree.fit(
            x=x[bag_indices],
            y=y[bag_indices],
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measure,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"],
        )

        if len(tree) < 2:
            continue

        # Margins of the out-of-bag samples, grouped by the leaf they reach.
        tree_margins, leafs_grouping = {}, {}
        for j in out_of_bag_indices:
            leaf, margin = decision_tree.predict(tree, x[j], y[j])
            tree_margins[j] = margin
            leafs_grouping.setdefault(leaf, []).append(j)
        margins.append(tree_margins)

        # Every ordered pair sharing a leaf moves one step closer.
        for members in leafs_grouping.values():
            for cx, cy in permutations(members, 2):
                neighbours = similarity_mat.setdefault(cx, {})
                neighbours[cy] = neighbours.get(cy, 0) - 1

        # Translate class and attribute codes back to their raw values so the
        # stored tree can be applied to unparsed rows.
        tree_mapped = {}
        for parent, children in tree.items():
            mapped_children = [None, None]
            for pos, node in enumerate(children):
                dist_map = {y_mapping[cls]: freq for cls, freq in node[3].items()}
                if node[5] == "d":
                    split_map = {attr_mapping[int(s)] for s in node[2]}
                else:
                    split_map = node[2]
                mapped_children[pos] = (node[0], node[1], split_map, dist_map, node[4], node[5])
            tree_mapped[parent] = mapped_children
        forest.append(tree_mapped)

    # ---- choose initial medoids ------------------------------------------
    # For every sample take its closest neighbour; the neighbours with the
    # strongest (most negative) ties are tried first as seeds.
    min_elements = []
    for neighbours in similarity_mat.values():
        min_id = min(neighbours, key=neighbours.get)
        min_elements.append((neighbours[min_id], min_id))
    min_elements.sort()

    if state["k"] == "sqrt":
        n_medoids = int(np.sqrt(len(x[0]))) + 1
    else:  # "square", guaranteed by the validation at the top
        n_classes = len(np.unique(y))
        n_medoids = n_classes * n_classes

    cidx = set()
    for _, candidate in min_elements:
        if len(cidx) >= n_medoids:
            break
        cidx.add(candidate)

    # ---- cluster and summarise -------------------------------------------
    # ``inds`` and ``medoids_i`` are used below as positions into the key list
    # of ``similarity_mat``.
    inds, medoids_i = k_medoids.fit(similarity_mat, len(x), list(cidx))
    sample_ids = np.array(list(similarity_mat.keys()))
    medoids_i = [sample_ids[m] for m in medoids_i]

    clusters = [sample_ids[np.where(inds == c)[0]] for c in np.unique(inds)]
    medoids = x[medoids_i].tolist()  # set medoids without sample identifier

    # Split each medoid into its continuous part and its (decoded) discrete part.
    cont, disc = [], []
    for medoid in medoids:
        cont.append([val for j, val in enumerate(medoid) if state["X_meta"][j] == "c"])
        disc.append([attr_mapping[int(val)] for j, val in enumerate(medoid) if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]

    # One margins dict was appended per accepted tree, so the lists line up.
    stats = [[] for _ in medoids_i]
    for tree_mapped, tree_margins in zip(forest, margins):
        for num, cluster in enumerate(clusters):
            # Average margin of this tree over the cluster members it has seen.
            values = [tree_margins[sample_id] for sample_id in cluster if int(sample_id) in tree_margins]
            if values:
                avg = np.average(values)
                tree_mapped["margin" + str(num)] = avg
                stats[num].append(avg)

    stats = [np.median(value) for value in stats]
    gower_range = np.array([np.ptp(x[:, i]) for i, kind in enumerate(state["X_meta"]) if kind == "c"])
    # A constant column would otherwise lead to a division by zero later on.
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, medoids, stats, gower_range))
    out.add("fill_in_values", fill_in_values)
Beispiel #26
0
def score_model(model, features, labels, depth):
    """Print the fraction of predictions that agree with ``labels``.

    Predictions come from ``predict(model, features)`` and are compared with
    ``labels`` position by position.

    NOTE(review): ``depth`` is not used in the visible part of this function.
    """
    labels_pred = predict(model, features)
    # Float accumulator, so the final division is never an integer division.
    correct = 0.0
    for i, _ in enumerate(labels_pred):
        correct += int(labels_pred[i] == labels[i])
    # The denominator is len(labels), not len(labels_pred).
    print(correct/len(labels))