def fit(X, y, max_depth=np.inf, n_bootstrap=50):
    """Fit a bagged ensemble of decision trees (a simple random forest).

    X           -- 2-D array of samples (rows) x features (columns)
    y           -- 1-D array of labels, aligned with X's rows
    max_depth   -- depth limit passed through to decision_tree.fit
    n_bootstrap -- number of trees / bootstrap resamples

    Returns a dict with keys 'trees' (list of fitted trees) and
    'predict' (the module-level predict function).

    Bug fix: the original rebound X and y to the resampled arrays on
    every iteration, so tree m trained on a resample of a resample
    (and the caller's arrays were shadowed). Each bag must be drawn
    from the ORIGINAL data.
    """
    n = X.shape[0]
    trees = []
    for m in range(n_bootstrap):
        # Draw n row indices with replacement from the original data.
        idx = np.random.choice(n, n)
        trees.append(decision_tree.fit(X[idx], y[idx], max_depth))

    model = dict()
    model['trees'] = trees
    model['predict'] = predict  # module-level predict, defined elsewhere
    return model
def fit(x, y, t, num_trees, max_tree_nodes, min_samples_leaf, min_samples_split, class_majority, measure, separate_max):
    """Grow a forest of `num_trees` randomized decision trees.

    Each tree is trained on a fresh bootstrap sample of (x, y); the
    out-of-bag portion returned by bootstrap() is discarded here.
    Returns the list of fitted trees.
    """
    forest = []
    for _ in range(num_trees):
        x_bag, y_bag, _x_oob, _y_oob = bootstrap(x, y)
        forest.append(decision_tree.fit(
            x=x_bag,
            y=y_bag,
            t=t,
            randomized=True,
            max_tree_nodes=max_tree_nodes,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            class_majority=class_majority,
            measure=measure,
            separate_max=separate_max))
    return forest
# Esempio n. 3  (scraper artifact: example separator, commented out so the file parses)
# 0
 def fit(self, X, y, dist, max_depth):
     """Train a decision tree via dt.fit and keep it on the instance."""
     self.tree = dt.fit(X, y, dist, max_depth)
# Esempio n. 4  (scraper artifact: example separator, commented out so the file parses)
# 0
def map_fit(interface, state, label, inp):
    """Map task: parse one chunk of delimited text rows, impute missing
    values, fit a single (non-randomized) decision tree and emit it.

    interface -- framework object; .output(0) yields the result sink
                 (presumably a Disco/discomll-style job — TODO confirm)
    state     -- dict of job parameters: delimiter, X_indices, X_meta
                 ("c" continuous / "d" discrete per attribute), y_index,
                 missing_vals, and tree hyper-parameters
    label     -- unused here
    inp       -- iterable of raw text rows

    NOTE: Python 2 code (print statement, dict.iteritems).
    """
    import numpy as np
    import decision_tree, measures

    # attr_mapping / y_mapping assign dense integer ids to discrete
    # attribute values and to class labels, respectively.
    attr_mapping, y_mapping = {}, {}
    x, y, fill_in_values = [], [], []
    out = interface.output(0)
    missing_vals_attr = set()  # column indices that contained missing values

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            new_row = []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    # keep the raw missing marker for now; imputed below
                    new_row.append(row[j])
                    missing_vals_attr.add(i)
                elif state["X_meta"][i] == "c":
                    # continuous attribute
                    new_row.append(float(row[j]))
                else:
                    # discrete attribute -> dense integer id
                    if row[j] not in attr_mapping:
                        attr_mapping[row[j]] = len(attr_mapping)
                    new_row.append(attr_mapping[row[j]])
            x.append(new_row)

            if row[state["y_index"]] not in y_mapping:
                y_mapping[row[state["y_index"]]] = len(y_mapping)
            y.append(y_mapping[row[state["y_index"]]])

    # A single-class chunk cannot train a meaningful tree; emit nothing.
    if len(y_mapping) == 1:
        print "Warning: Only one class in the subset!"
        return

    # Invert the mappings: integer id -> original string value.
    attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
    y_mapping = {v: k for k, v in y_mapping.iteritems()}

    if len(missing_vals_attr) > 0:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                # continuous: impute with the column mean over parsed floats
                value = np.average(
                    [sample[i] for sample in x if type(sample[i]) == float])
                fill_in_values.append(value)
            else:
                # discrete: impute with the most frequent integer id
                value = np.bincount([
                    sample[i] for sample in x if type(sample[i]) == int
                ]).argmax()
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                for j in range(len(x)):
                    if x[j][i] in state["missing_vals"]:
                        x[j][i] = value

    tree = decision_tree.fit(x=np.array(x, dtype=np.float32),
                             y=np.array(y, dtype=np.uint16),
                             t=state["X_meta"],
                             randomized=False,
                             max_tree_nodes=state["max_tree_nodes"],
                             min_samples_leaf=state["min_samples_leaf"],
                             min_samples_split=state["min_samples_split"],
                             class_majority=state["class_majority"],
                             measure=measures.info_gain if state["measure"]
                             == "info_gain" else measures.mdl,
                             accuracy=state["accuracy"],
                             separate_max=state["separate_max"])

    # Translate integer ids inside every node back to the original string
    # values so trees built from different chunks are comparable.
    # Node layout (by position): 0/1 unknown here, 2 split values,
    # 3 class distribution, 4 unknown, 5 attribute type flag ("d"/"c").
    tree_mapped = {}
    for k, v in tree.iteritems():
        tree_mapped[k] = [None for i in range(2)]
        for i, node in enumerate(v):
            dist_map = dict([(y_mapping[label], freq)
                             for label, freq in node[3].iteritems()])
            split_map = set([attr_mapping[int(s)] for s in list(node[2])
                             ]) if node[5] == "d" else node[2]
            tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                 node[4], node[5])
    out.add("tree", tree_mapped)
    out.add("fill_in_values", fill_in_values)
# Esempio n. 5  (scraper artifact: example separator, commented out so the file parses)
# 0
def map_fit(interface, state, label, inp):
    """Map task: parse a chunk, grow bootstrap trees, compute out-of-bag
    margins and a tree-similarity matrix, then cluster samples with
    k-medoids and emit the forest together with per-cluster margin stats.

    interface -- framework object; .output(0) yields the result sink
    state     -- dict of job parameters (delimiter, X_indices, X_meta,
                 y_index, missing_vals, trees_per_chunk, k, tree
                 hyper-parameters, ...)
    label     -- unused here
    inp       -- iterable of raw text rows

    NOTE: Python 2 code (print statement, dict.iteritems, dict.keys()
    treated as a list).
    """
    import numpy as np
    from itertools import permutations
    import decision_tree, measures, k_medoids

    out = interface.output(0)
    x, y, margins, forest = [], [], [], []
    attr_mapping, y_mapping, similarity_mat = {}, {}, {}
    missing_vals_attr = set()

    # --- parse rows; map discrete values / labels to dense integer ids ---
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            new_row = []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    # keep raw marker; imputed below
                    new_row.append(row[j])
                    missing_vals_attr.add(i)
                elif state["X_meta"][i] == "c":
                    new_row.append(float(row[j]))
                else:
                    if row[j] not in attr_mapping:
                        attr_mapping[row[j]] = len(attr_mapping)
                    new_row.append(attr_mapping[row[j]])
            x.append(new_row)

            if row[state["y_index"]] not in y_mapping:
                y_mapping[row[state["y_index"]]] = len(y_mapping)
            y.append(y_mapping[row[state["y_index"]]])
    if len(y_mapping) == 1:
        print "Warning: Only one class in the subset!"
        return

    # Invert mappings: integer id -> original string value.
    fill_in_values = []
    attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
    y_mapping = {v: k for k, v in y_mapping.iteritems()}
    # --- impute missing values (mean for continuous, mode for discrete) ---
    if len(missing_vals_attr) > 0:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                value = np.average(
                    [sample[i] for sample in x if type(sample[i]) == float])
                fill_in_values.append(value)
            else:
                value = np.bincount([
                    sample[i] for sample in x if type(sample[i]) == int
                ]).argmax()
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                for j in range(len(x)):
                    if x[j][i] in state["missing_vals"]:
                        x[j][i] = value
    x, y = np.array(x), np.array(y)

    # --- grow trees until trees_per_chunk succeed, capped at 2x attempts ---
    iteration = 0
    while len(forest) < state["trees_per_chunk"]:
        if iteration == state["trees_per_chunk"] * 2:
            return
        bag_indices = np.random.randint(len(x), size=(len(x)))
        unique = set(bag_indices)
        # at most 500 out-of-bag samples are evaluated per tree
        out_of_bag_indices = [i for i in range(len(x))
                              if i not in unique][:500]
        iteration += 1

        # a single-class bag cannot produce a split
        if len(np.unique(y[bag_indices])) == 1:
            continue

        tree = decision_tree.fit(x=x[bag_indices],
                                 y=y[bag_indices],
                                 t=state["X_meta"],
                                 randomized=True,
                                 max_tree_nodes=state["max_tree_nodes"],
                                 min_samples_leaf=state["min_samples_leaf"],
                                 min_samples_split=state["min_samples_split"],
                                 class_majority=state["class_majority"],
                                 measure=measures.info_gain if state["measure"]
                                 == "info_gain" else measures.mdl,
                                 accuracy=state["accuracy"],
                                 separate_max=state["separate_max"])

        # a tree with fewer than 2 nodes never split; discard it
        if len(tree) < 2:
            continue
        # calculate margins
        tree_margins, leafs_grouping = {}, {}
        for j in out_of_bag_indices:
            leaf, margin = decision_tree.predict(tree, x[j], y[j])
            tree_margins[j] = margin
            if leaf in leafs_grouping:
                leafs_grouping[leaf].append(j)
            else:
                leafs_grouping[leaf] = [j]
        margins.append(tree_margins)

        # samples landing in the same leaf become more similar; similarity
        # is accumulated as a NEGATIVE count (smaller = more similar)
        for k, v in leafs_grouping.iteritems():
            for cx, cy in permutations(v, 2):
                if cx in similarity_mat:
                    similarity_mat[cx][cy] = similarity_mat[cx].get(cy, 0) - 1
                else:
                    similarity_mat[cx] = {cy: -1}

        # map integer ids in the tree nodes back to original string values
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])
                                 ]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)

    # --- pick initial medoid candidates: most-similar pairs first ---
    min_elements = []
    for k, v in similarity_mat.iteritems():
        min_id = min(similarity_mat[k], key=similarity_mat[k].get)
        min_elements.append((similarity_mat[k][min_id], min_id))
    min_elements = sorted(min_elements)

    # NOTE(review): if state["k"] is neither "sqrt" nor "square", `k` is
    # left unbound and the `while` below raises NameError — confirm the
    # allowed values upstream.
    if state["k"] == "sqrt":
        k = int(np.sqrt(len(x[0]))) + 1
    elif state["k"] == "square":
        k = len(np.unique(y)) * len(np.unique(y))

    cidx = set()
    counter = 0
    while counter < len(min_elements) and len(cidx) < k:
        cidx.add(min_elements[counter][1])
        counter += 1

    inds, medoids_i = k_medoids.fit(similarity_mat, len(x), list(cidx))
    sample_ids = np.array(similarity_mat.keys())
    medoids_i = [sample_ids[i] for i in medoids_i]

    clusters = [sample_ids[np.where(inds == i)[0]] for i in np.unique(inds)]
    medoids = x[medoids_i].tolist()  # set medoids without sample identifier

    # split each medoid into its continuous and discrete parts; discrete
    # ids are mapped back to the original string values
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "c"
        ])
        disc.append([
            attr_mapping[int(medoids[i][j])] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "d"
        ])
    medoids = [np.array(cont), np.array(disc)]

    stats = [[] for i in range(len(medoids_i))]
    for i in range(len(forest)):  # for every tree in forest
        for num, cluster in enumerate(clusters):
            # calculate average margin for cluster
            values = [
                margins[i][sample_id] for sample_id in cluster
                if int(sample_id) in margins[i]
            ]
            if values != []:
                avg = np.average(values)
                forest[i]["margin" + str(num)] = avg
                stats[num].append(avg)

    # per-cluster median of average margins, plus per-feature value ranges
    # for Gower distance (zero ranges replaced to avoid division by zero)
    stats = [np.median(value) for value in stats]
    gower_range = np.array([
        np.ptp(x[:, i]) for i in range(len(state["X_meta"]))
        if state["X_meta"][i] == "c"
    ])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, medoids, stats, gower_range))
    out.add("fill_in_values", fill_in_values)
def map_fit(interface, state, label, inp):
    import numpy as np
    import decision_tree, measures
    from collections import Counter

    out = interface.output(0)
    num_samples = sum([1 for row in inp if len(row.strip().split(state["delimiter"])) > 1])
    missing_vals_attr = set()

    for counter in range(state["trees_per_chunk"]):
        bag_indices = Counter(np.random.randint(num_samples, size=(num_samples)))
        attr_mapping, y_mapping = {}, {}
        x, y, fill_in_values = [], [], []
        row_num = 0
        for row in inp:
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:
                while bag_indices[row_num] > 0:
                    new_row = []
                    for i, j in enumerate(state["X_indices"]):
                        if row[j] in state["missing_vals"]:
                            new_row.append(row[j])
                            missing_vals_attr.add(i)
                        elif state["X_meta"][i] == "c":
                            new_row.append(row[j])
                        else:
                            if row[j] not in attr_mapping:
                                attr_mapping[row[j]] = len(attr_mapping)
                            new_row.append(attr_mapping[row[j]])
                    x.append(new_row)
                    if row[state["y_index"]] not in y_mapping:
                        y_mapping[row[state["y_index"]]] = len(y_mapping)
                    y.append(y_mapping[row[state["y_index"]]])
                    bag_indices[row_num] -= 1
                row_num += 1

        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}

        if len(y_mapping) == 1:
            print "Warning: Only one class in the subset!"
            return

        if len(missing_vals_attr) > 0:
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    value = np.average([sample[i] for sample in x if type(sample[i]) == float])
                    fill_in_values.append(value)
                else:
                    value = np.bincount([sample[i] for sample in x if type(sample[i]) == int]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value
        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)

        tree = decision_tree.fit(
            x=x,
            y=y,
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain if state["measure"] == "info_gain" else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"])

        if len(tree) < 2:
            continue
        print "tree was build"
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq) for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map, node[4], node[5])
        out.add("tree", tree_mapped)
        out.add("fill_in_values", fill_in_values)
        # print("Error: %.3f" % error)

        # # 3. Evaluate decision tree that uses information gain
        # tree = DecisionTreeClassifier(max_depth=3)
        # tree.fit(X, y)

        # y_pred = tree.predict(X)
        # error = np.mean(y_pred != y)

        # print("Error: %.3f" % error)
      
        # NOTE(review): orphan fragment — `X`, `y`, `decision_tree`, and
        # DecisionTreeClassifier are not defined in this file's visible
        # scope; this looks pasted in from a separate evaluation script.
        # Confirm its intended home before relying on it.
        # Sweep tree depth and compare a custom tree against sklearn's.
        for maxDepth in range(2,15):
            print "******* MAX DEPTH =", maxDepth, "***********"
            # 2. Evaluate decision tree 
            model = decision_tree.fit(X, y, maxDepth=maxDepth)
            # print model
            y_pred = decision_tree.predict(model, X)
            # training error = fraction of misclassified training samples
            error = np.mean(y_pred != y)
            # print model
            print("Error: %.3f" % error)

            # 3. Evaluate decision tree that uses information gain
            tree = DecisionTreeClassifier(max_depth=maxDepth+1)
            tree.fit(X, y)

            y_pred = tree.predict(X)
            error = np.mean(y_pred != y)

            print("Error: %.3f" % error)
# Esempio n. 8  (scraper artifact: example separator, commented out so the file parses)
# 0
        # NOTE(review): fragment of a larger CLI script — `question`,
        # `utils`, `plt`, and `decision_tree` are defined elsewhere
        # (the enclosing function/main is not visible here); confirm.
        # part 2: print training/test errors as well as number of examples for k=1
        # part 3: plot classification boundaries for k=1

    if question == '2.1':
        dataset = utils.load_dataset('vowel')
        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']

        # # part 1: plot decision_tree as depth varies from 1 to 15
        train_errors = np.zeros(15)
        test_errors = np.zeros(15)

        for i in range(1, 16):
            model = decision_tree.fit(X, y, i)
            y_pred = decision_tree.predict(model, X)
            # training error = fraction of misclassified training rows
            training_error = np.sum(y_pred != y) / float(X.shape[0])

            # print "Training error:", training_error, "at depth", i
            y_pred = decision_tree.predict(model, Xtest)
            test_error = np.sum(y_pred != ytest) / float(Xtest.shape[0])
            # print "Test error:", test_error, "at depth", i

            train_errors[i - 1] = training_error
            test_errors[i - 1] = test_error
        x_vals = np.arange(1, 16)

        plt.title("Tree depth vs. training and test error")
        plt.plot(x_vals, train_errors, label="Training error")
        plt.plot(x_vals, test_errors, label="Testing error")
def map_fit(interface, state, label, inp):
    """Map task (duplicate of the k-medoids variant above, reformatted):
    parse a chunk, grow bootstrap trees, compute out-of-bag margins and a
    leaf-co-occurrence similarity matrix, cluster with k-medoids, and emit
    the forest with per-cluster margin statistics.

    interface -- framework object; .output(0) yields the result sink
    state     -- dict of job parameters (delimiter, X_indices, X_meta,
                 y_index, missing_vals, trees_per_chunk, k, tree
                 hyper-parameters, ...)
    label     -- unused here
    inp       -- iterable of raw text rows

    NOTE: Python 2 code (print statement, dict.iteritems, dict.keys()
    treated as a list).
    """
    import numpy as np
    from itertools import permutations
    import decision_tree, measures, k_medoids

    out = interface.output(0)
    x, y, margins, forest = [], [], [], []
    attr_mapping, y_mapping, similarity_mat = {}, {}, {}
    missing_vals_attr = set()

    # --- parse rows; map discrete values / labels to dense integer ids ---
    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:
            new_row = []
            for i, j in enumerate(state["X_indices"]):
                if row[j] in state["missing_vals"]:
                    # keep raw marker; imputed below
                    new_row.append(row[j])
                    missing_vals_attr.add(i)
                elif state["X_meta"][i] == "c":
                    new_row.append(float(row[j]))
                else:
                    if row[j] not in attr_mapping:
                        attr_mapping[row[j]] = len(attr_mapping)
                    new_row.append(attr_mapping[row[j]])
            x.append(new_row)

            if row[state["y_index"]] not in y_mapping:
                y_mapping[row[state["y_index"]]] = len(y_mapping)
            y.append(y_mapping[row[state["y_index"]]])
    if len(y_mapping) == 1:
        print "Warning: Only one class in the subset!"
        return

    # Invert mappings: integer id -> original string value.
    fill_in_values = []
    attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
    y_mapping = {v: k for k, v in y_mapping.iteritems()}
    # --- impute missing values (mean for continuous, mode for discrete) ---
    if len(missing_vals_attr) > 0:
        for i in range(len(state["X_indices"])):
            if state["X_meta"][i] == "c":
                value = np.average([sample[i] for sample in x if type(sample[i]) == float])
                fill_in_values.append(value)
            else:
                value = np.bincount([sample[i] for sample in x if type(sample[i]) == int]).argmax()
                fill_in_values.append(attr_mapping[value])
            if i in missing_vals_attr:
                for j in range(len(x)):
                    if x[j][i] in state["missing_vals"]:
                        x[j][i] = value
    x, y = np.array(x), np.array(y)

    # --- grow trees until trees_per_chunk succeed, capped at 2x attempts ---
    iteration = 0
    while len(forest) < state["trees_per_chunk"]:
        if iteration == state["trees_per_chunk"] * 2:
            return
        bag_indices = np.random.randint(len(x), size=(len(x)))
        unique = set(bag_indices)
        # at most 500 out-of-bag samples are evaluated per tree
        out_of_bag_indices = [i for i in range(len(x)) if i not in unique][:500]
        iteration += 1

        # a single-class bag cannot produce a split
        if len(np.unique(y[bag_indices])) == 1:
            continue

        tree = decision_tree.fit(
            x=x[bag_indices],
            y=y[bag_indices],
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain if state["measure"] == "info_gain" else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"],
        )

        # a tree with fewer than 2 nodes never split; discard it
        if len(tree) < 2:
            continue
        # calculate margins
        tree_margins, leafs_grouping = {}, {}
        for j in out_of_bag_indices:
            leaf, margin = decision_tree.predict(tree, x[j], y[j])
            tree_margins[j] = margin
            if leaf in leafs_grouping:
                leafs_grouping[leaf].append(j)
            else:
                leafs_grouping[leaf] = [j]
        margins.append(tree_margins)

        # samples landing in the same leaf become more similar; similarity
        # is accumulated as a NEGATIVE count (smaller = more similar)
        for k, v in leafs_grouping.iteritems():
            for cx, cy in permutations(v, 2):
                if cx in similarity_mat:
                    similarity_mat[cx][cy] = similarity_mat[cx].get(cy, 0) - 1
                else:
                    similarity_mat[cx] = {cy: -1}

        # map integer ids in the tree nodes back to original string values
        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq) for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map, node[4], node[5])
        forest.append(tree_mapped)

    # --- pick initial medoid candidates: most-similar pairs first ---
    min_elements = []
    for k, v in similarity_mat.iteritems():
        min_id = min(similarity_mat[k], key=similarity_mat[k].get)
        min_elements.append((similarity_mat[k][min_id], min_id))
    min_elements = sorted(min_elements)

    # NOTE(review): if state["k"] is neither "sqrt" nor "square", `k` is
    # left unbound and the `while` below raises NameError — confirm the
    # allowed values upstream.
    if state["k"] == "sqrt":
        k = int(np.sqrt(len(x[0]))) + 1
    elif state["k"] == "square":
        k = len(np.unique(y)) * len(np.unique(y))

    cidx = set()
    counter = 0
    while counter < len(min_elements) and len(cidx) < k:
        cidx.add(min_elements[counter][1])
        counter += 1

    inds, medoids_i = k_medoids.fit(similarity_mat, len(x), list(cidx))
    sample_ids = np.array(similarity_mat.keys())
    medoids_i = [sample_ids[i] for i in medoids_i]

    clusters = [sample_ids[np.where(inds == i)[0]] for i in np.unique(inds)]
    medoids = x[medoids_i].tolist()  # set medoids without sample identifier

    # split each medoid into continuous and discrete parts; discrete ids
    # are mapped back to the original string values
    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([medoids[i][j] for j in range(len(medoids[i])) if state["X_meta"][j] == "c"])
        disc.append([attr_mapping[int(medoids[i][j])] for j in range(len(medoids[i])) if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]

    stats = [[] for i in range(len(medoids_i))]
    for i in range(len(forest)):  # for every tree in forest
        for num, cluster in enumerate(clusters):
            # calculate average margin for cluster
            values = [margins[i][sample_id] for sample_id in cluster if int(sample_id) in margins[i]]
            if values != []:
                avg = np.average(values)
                forest[i]["margin" + str(num)] = avg
                stats[num].append(avg)

    # per-cluster median of average margins, plus per-feature value ranges
    # for Gower distance (zero ranges replaced to avoid division by zero)
    stats = [np.median(value) for value in stats]
    gower_range = np.array([np.ptp(x[:, i]) for i in range(len(state["X_meta"])) if state["X_meta"][i] == "c"])
    gower_range[gower_range == 0] = 1e-9
    out.add("model", (forest, medoids, stats, gower_range))
    out.add("fill_in_values", fill_in_values)
def map_fit(interface, state, label, inp):
    import numpy as np
    import decision_tree, measures, random
    from collections import Counter

    out = interface.output(0)
    margins, forest, medoids, medoids_y = [], [], [], []
    missing_vals_attr = set()

    num_test_samples = state["num_medoids"]
    num_samples = sum(
        [1 for row in inp if len(row.strip().split(state["delimiter"])) > 1])
    test_indices = set(
        random.sample([i for i in range(num_samples)], num_test_samples))

    for counter in range(state["trees_per_chunk"]):
        bag_indices = Counter(
            np.random.randint(num_samples, size=(num_samples)))
        _ = [
            bag_indices.pop(test_id) for test_id in test_indices
            if test_id in bag_indices
        ]

        x, y, fill_in_values = [], [], []
        attr_mapping, y_mapping = {}, {}

        row_num = -1
        for row in inp:
            row_num += 1
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:
                if row_num in test_indices:
                    if counter == 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                new_row.append(row[j])
                        medoids.append(new_row)
                        medoids_y.append(row[state["y_index"]])
                    else:
                        continue
                else:
                    while bag_indices[row_num] > 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if row[j] in state["missing_vals"]:
                                new_row.append(row[j])
                                missing_vals_attr.add(i)
                            elif state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                if row[j] not in attr_mapping:
                                    attr_mapping[row[j]] = len(attr_mapping)
                                new_row.append(attr_mapping[row[j]])
                        x.append(new_row)
                        if row[state["y_index"]] not in y_mapping:
                            y_mapping[row[state["y_index"]]] = len(y_mapping)
                        y.append(y_mapping[row[state["y_index"]]])
                        bag_indices[row_num] -= 1

        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}

        if len(y_mapping) == 1:
            print "Warning: Only one class in the subset!"
            return

        if len(missing_vals_attr) > 0:
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    value = np.average([
                        sample[i] for sample in x if type(sample[i]) == float
                    ])
                    fill_in_values.append(value)
                else:
                    value = np.bincount([
                        sample[i] for sample in x if type(sample[i]) == int
                    ]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value

        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)

        tree = decision_tree.fit(x=x,
                                 y=y,
                                 t=state["X_meta"],
                                 randomized=True,
                                 max_tree_nodes=state["max_tree_nodes"],
                                 min_samples_leaf=state["min_samples_leaf"],
                                 min_samples_split=state["min_samples_split"],
                                 class_majority=state["class_majority"],
                                 measure=measures.info_gain if state["measure"]
                                 == "info_gain" else measures.mdl,
                                 accuracy=state["accuracy"],
                                 separate_max=state["separate_max"])
        print "Tree was built"
        if len(tree) < 2:
            print "tree was removed"
            continue

        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq)
                                 for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])
                                 ]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map,
                                     node[4], node[5])
        forest.append(tree_mapped)

        tree_margins = []
        for ti in range(num_test_samples):
            leaf, margin = decision_tree.predict(tree_mapped, medoids[ti],
                                                 medoids_y[ti])
        tree_margins.append(margin)
        margins.append(tree_margins)
    print "tree was build"

    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "c"
        ])
        disc.append([
            medoids[i][j] for j in range(len(medoids[i]))
            if state["X_meta"][j] == "d"
        ])
    medoids = [np.array(cont), np.array(disc)]

    gower_range = np.array([
        np.ptp(x[:, i]) for i in range(len(state["X_meta"]))
        if state["X_meta"][i] == "c"
    ])
    gower_range[gower_range == 0] = 1e-9

    out.add("model", (forest, margins, medoids, gower_range))
    out.add("fill_in_values", fill_in_values)
def map_fit(interface, state, label, inp):
    import numpy as np
    import decision_tree, measures, random
    from collections import Counter

    out = interface.output(0)
    margins, forest, medoids, medoids_y = [], [], [], []
    missing_vals_attr = set()

    num_test_samples = state["num_medoids"]
    num_samples = sum([1 for row in inp if len(row.strip().split(state["delimiter"])) > 1])
    test_indices = set(random.sample([i for i in range(num_samples)], num_test_samples))

    for counter in range(state["trees_per_chunk"]):
        bag_indices = Counter(np.random.randint(num_samples, size=(num_samples)))
        _ = [bag_indices.pop(test_id) for test_id in test_indices if test_id in bag_indices]

        x, y, fill_in_values = [], [], []
        attr_mapping, y_mapping = {}, {}

        row_num = -1
        for row in inp:
            row_num += 1
            row = row.strip().split(state["delimiter"])
            if len(row) > 1:
                if row_num in test_indices:
                    if counter == 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                new_row.append(row[j])
                        medoids.append(new_row)
                        medoids_y.append(row[state["y_index"]])
                    else:
                        continue
                else:
                    while bag_indices[row_num] > 0:
                        new_row = []
                        for i, j in enumerate(state["X_indices"]):
                            if row[j] in state["missing_vals"]:
                                new_row.append(row[j])
                                missing_vals_attr.add(i)
                            elif state["X_meta"][i] == "c":
                                new_row.append(float(row[j]))
                            else:
                                if row[j] not in attr_mapping:
                                    attr_mapping[row[j]] = len(attr_mapping)
                                new_row.append(attr_mapping[row[j]])
                        x.append(new_row)
                        if row[state["y_index"]] not in y_mapping:
                            y_mapping[row[state["y_index"]]] = len(y_mapping)
                        y.append(y_mapping[row[state["y_index"]]])
                        bag_indices[row_num] -= 1

        attr_mapping = {v: k for k, v in attr_mapping.iteritems()}
        y_mapping = {v: k for k, v in y_mapping.iteritems()}

        if len(y_mapping) == 1:
            print "Warning: Only one class in the subset!"
            return

        if len(missing_vals_attr) > 0:
            for i in range(len(state["X_indices"])):
                if state["X_meta"][i] == "c":
                    value = np.average([sample[i] for sample in x if type(sample[i]) == float])
                    fill_in_values.append(value)
                else:
                    value = np.bincount([sample[i] for sample in x if type(sample[i]) == int]).argmax()
                    fill_in_values.append(attr_mapping[value])
                if i in missing_vals_attr:
                    for j in range(len(x)):
                        if x[j][i] in state["missing_vals"]:
                            x[j][i] = value

        x = np.array(x, dtype=np.float32)
        y = np.array(y, dtype=np.uint16)

        tree = decision_tree.fit(
            x=x,
            y=y,
            t=state["X_meta"],
            randomized=True,
            max_tree_nodes=state["max_tree_nodes"],
            min_samples_leaf=state["min_samples_leaf"],
            min_samples_split=state["min_samples_split"],
            class_majority=state["class_majority"],
            measure=measures.info_gain if state["measure"] == "info_gain" else measures.mdl,
            accuracy=state["accuracy"],
            separate_max=state["separate_max"])
        print "Tree was built"
        if len(tree) < 2:
            print "tree was removed"
            continue


        tree_mapped = {}
        for k, v in tree.iteritems():
            tree_mapped[k] = [None for i in range(2)]
            for i, node in enumerate(v):
                dist_map = dict([(y_mapping[label], freq) for label, freq in node[3].iteritems()])
                split_map = set([attr_mapping[int(s)] for s in list(node[2])]) if node[5] == "d" else node[2]
                tree_mapped[k][i] = (node[0], node[1], split_map, dist_map, node[4], node[5])
        forest.append(tree_mapped)

        tree_margins = []
        for ti in range(num_test_samples):
            leaf, margin = decision_tree.predict(tree_mapped, medoids[ti], medoids_y[ti])
        tree_margins.append(margin)
        margins.append(tree_margins)
    print "tree was build"

    cont, disc = [], []
    for i in range(len(medoids)):
        cont.append([medoids[i][j] for j in range(len(medoids[i])) if state["X_meta"][j] == "c"])
        disc.append([medoids[i][j] for j in range(len(medoids[i])) if state["X_meta"][j] == "d"])
    medoids = [np.array(cont), np.array(disc)]

    gower_range = np.array([np.ptp(x[:, i]) for i in range(len(state["X_meta"])) if state["X_meta"][i] == "c"])
    gower_range[gower_range == 0] = 1e-9

    out.add("model", (forest, margins, medoids, gower_range))
    out.add("fill_in_values", fill_in_values)