def __init__(self, data, model, sess):
        """
        Compute evaluation metrics (accuracy, precision, recall, F1, AUC) for a trained model.
        :param data: list of tree roots to evaluate on
        :param model: an initialized and trained model
        :param sess: the TensorFlow session the model lives in
        """

        roots_size = [tree_util.size_of_tree(root) for root in data]
        data = helper.sort_by(data, roots_size)

        probs, labels = model.predict_and_label(data, sess)
        labels = get_prediction(labels)
        predictions = get_prediction(probs)

        self.acc = get_accuracy(labels, predictions)
        # Sanity check against the model's own accuracy on smaller datasets
        if len(data) < 1500:
            print(model.accuracy(data, sess))

        self.TP, self.FP, self.TN, self.FN = get_confusion_matrix(labels, predictions)

        # Guard against zero denominators when a class is never predicted or never present
        self.precision = self.TP / (self.TP + self.FP) if self.TP + self.FP > 0 else 0
        self.recall = self.TP / (self.TP + self.FN) if self.TP + self.FN > 0 else 0
        self.F1 = (2 * self.precision * self.recall / (self.precision + self.recall)
                   if self.precision + self.recall > 0 else 0)

        self.TPR_list, self.FPR_list = get_roc_values(labels, probs)
        self.auc = metrics.auc(self.FPR_list, self.TPR_list)
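For reference, the metric arithmetic used above in one self-contained sketch. get_confusion_matrix and get_accuracy are project helpers not shown in this excerpt; this standalone version assumes binary labels and predictions given as 0/1 integers:

import numpy as np

def binary_metrics(labels, predictions):
    labels, predictions = np.asarray(labels), np.asarray(predictions)
    tp = int(np.sum((predictions == 1) & (labels == 1)))  # true positives
    fp = int(np.sum((predictions == 1) & (labels == 0)))  # false positives
    fn = int(np.sum((predictions == 0) & (labels == 1)))  # false negatives
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1

print(binary_metrics([1, 0, 1, 1], [1, 0, 0, 1]))  # (1.0, 0.666..., 0.8)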
Example #2
    def build_feed_dict(self, roots, sort=True, train=False):
        if sort:
            roots_size = [tree_util.size_of_tree(root) for root in roots]
            roots = helper.sort_by(roots, roots_size)
        # Recompute sizes in the (possibly re-sorted) order before bin packing
        roots_size = [tree_util.size_of_tree(root) for root in roots]
        roots_list, permutation = helper.greedy_bin_packing(roots, roots_size, np.max(roots_size))

        node_list_list = []
        root_indices = []
        internal_nodes_array = []
        lstm_prev_list = []
        for i, packed_roots in enumerate(roots_list):  # renamed to avoid shadowing 'roots'
            node_list = []
            root_index = 0
            leaf_index = 0
            lstm_prev = [0]
            lstm_prev_count = 0
            for root in packed_roots:
                tree_util.depth_first_traverse(root, node_list, lambda node, node_list: node_list.append(node))
                leaf_count = tree_util.leafs_in_tree(root)
                root_index += leaf_count
                root_indices.append([i, root_index])
                for _ in range(leaf_count):
                    leaf_index += 1
                    internal_nodes_array.append([i, leaf_index])

                # Chain each leaf to the previous leaf's LSTM state;
                # the first leaf of every tree starts from state 0
                for x in range(leaf_count):
                    if x == 0:
                        lstm_prev.append(0)
                    else:
                        lstm_prev.append(lstm_prev_count)
                    lstm_prev_count += 1

            node_list_list.append(node_list)
            lstm_prev_list.append(lstm_prev)

        feed_dict = {
            self.dropout_rate: FLAGS.dropout_prob if train else 0,
            self.lstm_prev_array: helper.lists_pad(lstm_prev_list, 0),
            self.leaf_word_array: helper.lists_pad(
                [[0] + [self.word_embed.get_idx(node.value)
                        for node in node_list if node.is_leaf]
                 for node_list in node_list_list], 0),
            self.loss_array: root_indices if self.use_root_loss else internal_nodes_array,
            self.root_array: root_indices,
            self.label_array: helper.lists_pad(
                [[[0, 0]] + [node.label for node in node_list if node.is_leaf]
                 for node_list in node_list_list], [0, 0])
        }

        return feed_dict, permutation
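The batching above relies on helper.greedy_bin_packing, which is not shown in this excerpt. A hypothetical first-fit version, to make the (roots_list, permutation) return value concrete; the project's actual helper may differ:

def greedy_bin_packing(items, sizes, capacity):
    # First-fit: place each item into the first bin with room, else open a new bin.
    bins, bin_sizes, bin_indices = [], [], []
    for idx, (item, size) in enumerate(zip(items, sizes)):
        for b, used in enumerate(bin_sizes):
            if used + size <= capacity:
                bins[b].append(item)
                bin_indices[b].append(idx)
                bin_sizes[b] += size
                break
        else:
            bins.append([item])
            bin_indices.append([idx])
            bin_sizes.append(size)
    # permutation[i] is the original position of the i-th item after flattening
    # the bins, letting callers restore the input order of per-item results
    permutation = [idx for indices in bin_indices for idx in indices]
    return bins, permutation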
Example #3
from utils import data_util, tree_util, helper

_data_util = data_util.DataUtil()
data = _data_util.get_data()
roots_size = [tree_util.size_of_tree(root) for root in data.train_trees]
roots = helper.sort_by(data.train_trees, roots_size)

for root in roots[-5:]:  # the five largest trees, given the ascending sort
    print(root.label)
    print(root.to_sentence())
    print()
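helper.sort_by, used throughout these snippets, reorders one list by a parallel list of keys. The actual helper is not shown here; a plausible minimal version:

def sort_by(data, keys):
    # Reorder `data` ascending by the parallel `keys`; the explicit key function
    # avoids comparing the data items themselves on ties.
    return [item for _, item in sorted(zip(keys, data), key=lambda pair: pair[0])]

print(sort_by(['c', 'a', 'b'], [3, 1, 2]))  # ['a', 'b', 'c']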
    def build_feed_dict(self, roots, sort=True, train=False):
        if sort:
            roots_size = [tree_util.size_of_tree(root) for root in roots]
            roots = helper.sort_by(roots, roots_size)
        # Recompute sizes in the (possibly re-sorted) order before bin packing
        roots_size = [tree_util.size_of_tree(root) for root in roots]
        roots_list, permutation = helper.greedy_bin_packing(
            roots, roots_size, np.max(roots_size))

        node_list_list = []
        node_to_index_list = []
        root_indices = []
        lstm_idx_list = []
        internal_nodes_array = []
        for i, packed_roots in enumerate(roots_list):  # renamed to avoid shadowing 'roots'
            node_list = []
            lstm_idx = [0]
            root_index = 0
            start = 0
            for root in packed_roots:
                tree_util.depth_first_traverse(
                    root, node_list,
                    lambda node, node_list: node_list.append(node))

                _, start = tree_util.get_preceding_lstm_index(
                    root, start, start, lstm_idx)

                root_index += tree_util.size_of_tree(root)
                root_indices.append([i, root_index])
            node_list_list.append(node_list)
            node_to_index = helper.reverse_dict(node_list)
            node_to_index_list.append(node_to_index)
            lstm_idx_list.append(lstm_idx)
            for node in node_list:
                if not node.is_leaf:
                    internal_nodes_array.append([i, node_to_index[node] + 1])

        # Fall back to a dummy index when there are no internal nodes
        if len(internal_nodes_array) == 0:
            internal_nodes_array = [[0, 0]]

        feed_dict = {
            self.dropout_rate: FLAGS.dropout_prob if train else 0,
            self.leaf_word_array: helper.lists_pad(
                [[0] + [self.word_embed.get_idx(node.value)
                        for node in node_list if node.is_leaf]
                 for node_list in node_list_list], 0),
            self.lstm_index_array: helper.lists_pad(lstm_idx_list, 0),
            self.loss_array: root_indices if self.use_root_loss else internal_nodes_array,
            self.root_array: root_indices,
            self.is_leaf_array: helper.lists_pad(
                [[0] + helper.to_int([node.is_leaf for node in node_list])
                 for node_list in node_list_list], 0),
            self.word_index_array: helper.lists_pad(
                [[0] + [self.word_embed.get_idx(node.value) for node in node_list]
                 for node_list in node_list_list],
                self.word_embed.get_idx("ZERO")),
            self.left_child_array: helper.lists_pad(
                [[0] + helper.add_one(
                    [node_to_index[node.left_child] if node.left_child is not None else -1
                     for node in node_list])
                 for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
            self.right_child_array: helper.lists_pad(
                [[0] + helper.add_one(
                    [node_to_index[node.right_child] if node.right_child is not None else -1
                     for node in node_list])
                 for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
            self.label_array: helper.lists_pad(
                [[[0, 0]] + [node.label for node in node_list]
                 for node_list in node_list_list], [0, 0])
        }

        return feed_dict, permutation
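To make the left/right child arrays concrete: every node gets its position in the depth-first node list, and children are referenced by that position shifted by one so index 0 can act as padding (the role of helper.add_one above). A small illustration with a hypothetical Node class:

class Node:
    def __init__(self, left=None, right=None):
        self.left_child, self.right_child = left, right

def child_index_arrays(node_list):
    # Map each node to its list position, then point at children by position + 1;
    # 0 means "no child" / padding.
    node_to_index = {node: i for i, node in enumerate(node_list)}
    left = [node_to_index[n.left_child] + 1 if n.left_child is not None else 0
            for n in node_list]
    right = [node_to_index[n.right_child] + 1 if n.right_child is not None else 0
             for n in node_list]
    return left, right

leaf_a, leaf_b = Node(), Node()
root = Node(leaf_a, leaf_b)
print(child_index_arrays([leaf_a, leaf_b, root]))  # ([0, 0, 1], [0, 0, 2])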
def get_data():
    if not os.path.exists(directories.CLASSIFIER_DATA_DIR):
        os.mkdir(directories.CLASSIFIER_DATA_DIR)
    if not os.path.exists(directories.CLASSIFIER_DATA(FLAGS.model_name)):
        os.mkdir(directories.CLASSIFIER_DATA(FLAGS.model_name))
    if os.path.exists(
            directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_train.npy'):
        x_train = np.load(
            directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_train.npy')
        y_train = np.load(
            directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_train.npy')
        x_val = np.load(
            directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_val.npy')
        y_val = np.load(
            directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_val.npy')
        x_test = np.load(
            directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_test.npy')
        y_test = np.load(
            directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_test.npy')
    else:
        _data_util = data_util.DataUtil()
        data = _data_util.get_data()

        roots_size = [
            tree_util.size_of_tree(root) for root in data.train_trees
        ]
        data.train_trees = helper.sort_by(data.train_trees, roots_size)

        roots_size = [tree_util.size_of_tree(root) for root in data.val_trees]
        data.val_trees = helper.sort_by(data.val_trees, roots_size)

        roots_size = [tree_util.size_of_tree(root) for root in data.test_trees]
        data.test_trees = helper.sort_by(data.test_trees, roots_size)

        if FLAGS.use_gpu:
            config = None
        else:
            # Hide the GPU from TensorFlow when running on CPU
            config = tf.ConfigProto(device_count={'GPU': 0})

        if FLAGS.word_embed_model == constants.WORD2VEC:
            word_embeddings = Word2Vec(mode=FLAGS.word_embed_mode,
                                       dimensions=FLAGS.word_embedding_size)
        elif FLAGS.word_embed_model == constants.FASTTEXT:
            word_embeddings = FastText(mode=FLAGS.word_embed_mode,
                                       dimensions=FLAGS.word_embedding_size)
        else:  # FLAGS.word_embed_model == constants.GLOVE
            word_embeddings = GloVe(mode=FLAGS.word_embed_mode,
                                    dimensions=FLAGS.word_embedding_size)

        g_tree = tf.Graph()
        with g_tree.as_default():
            model = None
            if FLAGS.model == constants.DEEP_RNN:
                model = deepRNN(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.BATCH_TREE_RNN:
                model = treeRNN_batch(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.NEERBEK_TREE_RNN:
                model = treeRNN_neerbek(data, word_embeddings,
                                        FLAGS.model_name)
            elif FLAGS.model == constants.TREE_LSTM:
                model = treeLSTM(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.TRACKER_TREE_RNN:
                model = treeRNN_tracker(data, word_embeddings,
                                        FLAGS.model_name)
            elif FLAGS.model == constants.TRACKER_TREE_LSTM:
                model = treeLSTM_tracker(data, word_embeddings,
                                         FLAGS.model_name)
            elif FLAGS.model == constants.LSTM:
                model = LSTM(data, word_embeddings, FLAGS.model_name)
            else:
                raise ValueError(f'Unknown model: {FLAGS.model}')

            # Reuse the device config chosen above rather than always forcing CPU
            with tf.Session(config=config) as sess:
                saver = tf.train.Saver()
                model.load_best(sess, saver, "validation")
                x_train = np.array(
                    model.get_representation(data.train_trees, sess))
                y_train = np.array(get_labels(data.train_trees))
                x_val = np.array(model.get_representation(
                    data.val_trees, sess))
                y_val = np.array(get_labels(data.val_trees))
                x_test = np.array(
                    model.get_representation(data.test_trees, sess))
                y_test = np.array(get_labels(data.test_trees))
            np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_train', x_train)
            np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_train', y_train)
            np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_val', x_val)
            np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_val', y_val)
            np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'x_test', x_test)
            np.save(directories.CLASSIFIER_DATA(FLAGS.model_name) + 'y_test', y_test)
    return {
        'train': (x_train, y_train),
        'val': (x_val, y_val),
        'test': (x_test, y_test)
    }
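get_data follows a load-or-compute caching pattern: the expensive tree representations are computed once and persisted as .npy files. The same idea in isolation; the path and the compute function here are illustrative, not the project's:

import os
import numpy as np

def load_or_compute(path, compute):
    # Load a cached array if present; otherwise compute it and persist the result.
    if os.path.exists(path):
        return np.load(path)
    result = np.asarray(compute())
    np.save(path, result)
    return result

# e.g. x_train = load_or_compute('x_train.npy', compute_representations)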
Example #6
    def select_data(self, data, cut_off, cluster_predictions=None):
        roots_size = [tree_util.size_of_tree(root) for root in data]
        data = np.array(helper.sort_by(data, roots_size))

        t = time()
        if cluster_predictions is None:

            # Get representations
            representations, predictions, labels, permutations = [], [], [], []
            batch_size = 500
            batches = helper.batches(data, batch_size, perm=False)
            pbar = tqdm(
                bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} (batches: {n_fmt}/{total_fmt}) ',
                total=len(batches))

            for i, batch in enumerate(batches):
                feed_dict, permuts = self.model.build_feed_dict(batch,
                                                                sort=True)
                reps, labs = self.session.run(
                    [self.model.sentence_representations, self.model.labels],
                    feed_dict=feed_dict)
                representations.extend(reps)
                labels.extend(labs)
                permutations.extend(list(i * batch_size + np.array(permuts)))
                pbar.update(1)
            pbar.close()
            print()

            self.representations = np.array(representations)[permutations]
            self.labels = np.array(performance.get_prediction(
                np.array(labels)))[permutations]

            # Get clusters

            # Re-cluster until no single cluster holds more than 80% of the
            # points, giving up after 10 tries
            try_cluster = True
            tries = 10
            while try_cluster:
                tries -= 1
                self.cluster_predictions = self.cluster_model.cluster(
                    self.representations)
                if (np.bincount(self.cluster_predictions).max()
                        <= 0.8 * len(self.representations) or tries <= 0):
                    try_cluster = False

        else:
            self.cluster_predictions = cluster_predictions
            self.labels = tree_util.get_labels(data)

        # Get acc of clusters
        cluster_mfo = []
        cluster_mfo_labels = []
        for i in range(self.num_clusters):
            mfo, l = self.mfo(i)
            cluster_mfo.append((i, mfo))
            cluster_mfo_labels.append((i, l))

        # Return data
        cluster_mfo.sort(key=lambda el: el[1], reverse=True)
        mfo_labels = dict(cluster_mfo_labels)  # look labels up by cluster id after sorting
        helper._print('Cluster MFO scores:')
        for k, mfo in cluster_mfo:
            helper._print(
                f'\tCluster {k}: {mfo}, highest label: {mfo_labels[k]}, '
                f'size: {len(self.labels[self.cluster_predictions == k])}/{len(data)}')

        removed_percent = 0
        data_to_use = []
        for cluster, acc in cluster_mfo:
            # Track the cumulative fraction of the data considered so far
            removed_percent += len(data[self.cluster_predictions == cluster]) / len(data)
            # Keep only clusters whose MFO score falls below the cut-off
            if acc < cut_off:
                data_to_use.extend(data[self.cluster_predictions == cluster])

        helper._print(
            f'Done selecting data for training. Overall time used for selection is {int((time() - t)/60)} minutes and {int((time() - t) % 60)} seconds'
        )
        return data_to_use, self.cluster_predictions
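self.mfo(i) is not included in this excerpt; judging by its use it returns the most-frequent-outcome score of cluster i, i.e. the share of the cluster held by its majority label, plus that label. A hypothetical standalone version, assuming labels are non-negative integers:

import numpy as np

def mfo(labels, cluster_predictions, cluster):
    # Share of points in `cluster` carrying the cluster's majority label.
    cluster_labels = np.asarray(labels)[np.asarray(cluster_predictions) == cluster]
    if len(cluster_labels) == 0:
        return 0.0, None
    counts = np.bincount(cluster_labels)
    majority = int(counts.argmax())
    return counts[majority] / len(cluster_labels), majority

print(mfo([0, 0, 1, 1, 1], [0, 0, 0, 1, 1], 0))  # (0.666..., 0)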
def parse_trees(dataset="small",
                type='train',
                remove=False):  # todo maybe change input param
    """
    Tree format based on https://github.com/erickrf/treernn/blob/master/tree.py
    :param dataset: which dataset to load trees from
    :param type: which split to load (e.g. 'train')
    :param remove: if True, drop sentences longer than 90 words
    :return: a list of trees
    """
    file = directories.TREES_DIRS[dataset] + '%s.txt' % type
    if not os.path.isdir(directories.TREES_DIRS[dataset]):
        os.makedirs(directories.TREES_DIRS[dataset])
    if not os.path.isfile(file):
        if dataset == 'all':
            helper._print(f'Creating new {file}...')
            with open(file, 'w+') as f:
                for l in directories.TREES_ZIP_PATHS:
                    smaller_tree_file = directories.TREES_DIRS[
                        l] + '%s.txt' % type
                    helper._print(f'Merging from {smaller_tree_file}...')
                    if not os.path.isfile(smaller_tree_file):
                        helper._print(
                            f'Extracting {directories.TREES_ZIP_PATHS[l]}...')
                        with zipfile.ZipFile(directories.TREES_ZIP_PATHS[l],
                                             'r') as zip_file:
                            zip_file.extractall(path=directories.TREES_DIRS[l])
                        correct_labels(constants.TREE_LABELS[l], l)
                    with open(smaller_tree_file, 'r+') as sf:
                        for tree in sf:
                            f.write(tree)
        elif dataset == 'small':
            helper._print(
                'No small dataset. Try pulling from Git... Or make your own you lazy bastard!'
            )
        else:
            helper._print(
                f'Extracting {directories.TREES_ZIP_PATHS[dataset]}...')
            with zipfile.ZipFile(directories.TREES_ZIP_PATHS[dataset],
                                 'r') as zip_file:
                zip_file.extractall(path=directories.TREES_DIRS[dataset])
            correct_labels(constants.TREE_LABELS[dataset], dataset)

    helper._print("Loading %s trees.." % type)
    with open(file, 'r') as fid:
        trees = []
        lines = fid.readlines()
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(lines))
        for i, l in enumerate(lines):
            if (i + 1) % 1000 == 0:
                pbar.update(1000)
            trees.append(parse_tree(l))
        pbar.update(len(lines) % 1000)
        pbar.close()
        print()
    sentence_length = np.array([count_leaf(tree) for tree in trees])
    helper._print("Avg length:", np.average(sentence_length))
    trees = np.array(trees)
    if remove:
        keep = sentence_length <= 90
        helper._print("Shorter than 90 words:",
                      int(np.sum(keep) / len(sentence_length) * 100), "%")
        helper._print("Ratio of removed labels:",
                      ratio_of_labels(trees[~keep]))
        trees = np.array(helper.sort_by(trees[keep], sentence_length[keep]))
    return trees
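parse_tree itself is not part of this excerpt. Files in this layout typically store one PTB-style s-expression per line, e.g. "(1 (0 nice) (1 movie))"; a minimal recursive parser for that shape (the TreeNode class and the integer labels are assumptions; the project above appears to store labels as one-hot pairs):

class TreeNode:
    def __init__(self, label, value=None, left=None, right=None):
        self.label, self.value = label, value
        self.left_child, self.right_child = left, right
        self.is_leaf = left is None and right is None

def parse_tree(line):
    # Parse a binary tree of the form "(label child child)" or "(label token)".
    tokens = line.replace('(', ' ( ').replace(')', ' ) ').split()
    pos = 0

    def parse():
        nonlocal pos
        assert tokens[pos] == '('
        pos += 1
        label = int(tokens[pos])
        pos += 1
        if tokens[pos] != '(':   # leaf: "(label token)"
            node = TreeNode(label, value=tokens[pos])
            pos += 1
        else:                    # internal node: "(label child child)"
            node = TreeNode(label, left=parse(), right=parse())
        assert tokens[pos] == ')'
        pos += 1
        return node

    return parse()

tree = parse_tree("(1 (0 nice) (1 movie))")
print(tree.label, tree.left_child.value, tree.right_child.value)  # 1 nice movie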