def build_feed_dict(self, roots, sort=True, train=False):
    if sort:
        roots_size = [tree_util.size_of_tree(root) for root in roots]
        roots = helper.sort_by(roots, roots_size)
    roots_size = [tree_util.size_of_tree(root) for root in roots]
    # Pack several trees into each batch row; permutation maps back to input order
    roots_list, permutation = helper.greedy_bin_packing(
        roots, roots_size, np.max(roots_size))

    node_list_list = []
    root_indices = []
    internal_nodes_array = []
    lstm_prev_list = []
    for i, bin_roots in enumerate(roots_list):
        node_list = []
        root_index = 0
        leaf_index = 0
        lstm_prev = [0]  # index 0 is the zero-padding state
        lstm_prev_count = 0
        for root in bin_roots:
            tree_util.depth_first_traverse(
                root, node_list,
                lambda node, node_list: node_list.append(node))
            leaf_count = tree_util.leafs_in_tree(root)
            root_index += leaf_count
            root_indices.append([i, root_index])
            for x in range(leaf_count):
                leaf_index += 1
                internal_nodes_array.append([i, leaf_index])
                # The first leaf of a sentence has no predecessor (points to the
                # padding state); every later leaf points to the previous leaf
                lstm_prev.append(0 if x == 0 else lstm_prev_count)
                lstm_prev_count += 1
        node_list_list.append(node_list)
        lstm_prev_list.append(lstm_prev)

    feed_dict = {
        self.dropout_rate: FLAGS.dropout_prob if train else 0,
        self.lstm_prev_array: helper.lists_pad(lstm_prev_list, 0),
        self.leaf_word_array: helper.lists_pad(
            [[0] + [self.word_embed.get_idx(node.value)
                    for node in node_list if node.is_leaf]
             for node_list in node_list_list], 0),
        self.loss_array: root_indices if self.use_root_loss else internal_nodes_array,
        self.root_array: root_indices,
        self.label_array: helper.lists_pad(
            [[[0, 0]] + [node.label for node in node_list if node.is_leaf]
             for node_list in node_list_list], [0, 0])
    }
    return feed_dict, permutation
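# The feed dicts in this file rely on helper.lists_pad to right-pad a batch of
# variable-length lists to a common length. A minimal sketch of the assumed
# behavior (the project's actual helper may differ):
def lists_pad(lists, pad_value):
    """Right-pad every list to the length of the longest one."""
    max_len = max(len(l) for l in lists)
    return [l + [pad_value] * (max_len - len(l)) for l in lists]

# e.g. lists_pad([[1, 2], [3]], 0) == [[1, 2], [3, 0]]
# and the pad value may itself be a list, as in lists_pad(..., [0, 0]).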
def build_feed_dict_batch_test(self, root):
    node_list = []
    tree_util.depth_first_traverse(
        root[0], node_list,
        lambda node, node_list: node_list.append(node))
    node_to_index = helper.reverse_dict(node_list)
    feed_dict = {
        # The root is the last node in traversal order (size - 1),
        # shifted +1 past the padding node at position 0
        self.root_array: [tree_util.size_of_tree(root[0])],
        self.is_leaf_array: [False] + [node.is_leaf for node in node_list],
        self.word_index_array: [0] + [
            self.data.word_embed_util.get_idx(node.value) for node in node_list
        ],
        self.left_child_array: [0] + helper.add_one([
            node_to_index[node.left_child] if node.left_child is not None else -1
            for node in node_list
        ]),
        self.right_child_array: [0] + helper.add_one([
            node_to_index[node.right_child] if node.right_child is not None else -1
            for node in node_list
        ]),
        self.label_array: [[0, 0]] + [node.label for node in node_list]
    }
    return feed_dict
def build_feed_dict_batch(self, batch):
    node_list_list = []
    node_to_index_list = []
    root_array = []
    last_root = -1
    for root in batch:
        node_list = []
        tree_util.depth_first_traverse(
            root, node_list,
            lambda node, node_list: node_list.append(node))
        node_list_list.append(node_list)
        node_to_index = helper.reverse_dict(node_list)
        node_to_index_list.append(node_to_index)
        last_root += tree_util.size_of_tree(root)
        root_array.append(last_root)

    # TODO: there is probably something wrong here: the child indices below are
    # tree-local, but the arrays are flattened across the whole batch
    feed_dict = {
        self.root_array: root_array,
        self.is_leaf_array: helper.flatten(
            [[node.is_leaf for node in node_list]
             for node_list in node_list_list]),
        self.word_index_array: helper.flatten(
            [[self.data.word_embed_util.get_idx(node.value) for node in node_list]
             for node_list in node_list_list]),
        # TODO: maybe wrap this in a different way
        self.left_child_array: helper.flatten(
            [[node_to_index[node.left_child] if node.left_child is not None else -1
              for node in node_list]
             for node_list, node_to_index in zip(node_list_list, node_to_index_list)]),
        self.right_child_array: helper.flatten(
            [[node_to_index[node.right_child] if node.right_child is not None else -1
              for node in node_list]
             for node_list, node_to_index in zip(node_list_list, node_to_index_list)]),
        self.label_array: helper.flatten(
            [[node.label for node in node_list] for node_list in node_list_list])
    }
    return feed_dict
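# The builders above index nodes by traversal order. Hedged sketches of the two
# helpers they assume (the project's tree_util/helper may differ). The root being
# addressed as size_of_tree - 1 suggests a post-order traversal, where children
# are appended before their parent:
def depth_first_traverse(node, acc, visit):
    if node.left_child is not None:
        depth_first_traverse(node.left_child, acc, visit)
    if node.right_child is not None:
        depth_first_traverse(node.right_child, acc, visit)
    visit(node, acc)  # post-order: the root ends up last
    return acc

def reverse_dict(node_list):
    """Map each node back to its position in the traversal."""
    return {node: i for i, node in enumerate(node_list)}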
def __init__(self, data, model, sess):
    """
    :param data: list of labeled trees to evaluate on
    :param model: expects an initialized and trained model
    """
    # Sort by tree size so batching groups similarly sized trees
    roots_size = [tree_util.size_of_tree(root) for root in data]
    data = helper.sort_by(data, roots_size)
    probs, labels = model.predict_and_label(data, sess)
    labels = get_prediction(labels)
    predictions = get_prediction(probs)
    self.acc = get_accuracy(labels, predictions)
    if len(data) < 1500:  # sanity check, only cheap on small datasets
        print(model.accuracy(data, sess))
    self.TP, self.FP, self.TN, self.FN = get_confusion_matrix(labels, predictions)
    # Note: these divisions fail when TP + FP or TP + FN is zero
    self.precision = self.TP / (self.TP + self.FP)
    self.recall = self.TP / (self.TP + self.FN)
    self.F1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
    self.TPR_list, self.FPR_list = get_roc_values(labels, probs)
    self.auc = metrics.auc(self.FPR_list, self.TPR_list)
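# The metrics above assume binary labels. A hedged sketch of what
# get_confusion_matrix is assumed to compute over 0/1 class indices
# (the project's real helper may differ):
import numpy as np

def get_confusion_matrix(labels, predictions):
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    tp = int(np.sum((labels == 1) & (predictions == 1)))
    fp = int(np.sum((labels == 0) & (predictions == 1)))
    tn = int(np.sum((labels == 0) & (predictions == 0)))
    fn = int(np.sum((labels == 1) & (predictions == 0)))
    return tp, fp, tn, fn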
def build_feed_dict_batch(self, roots):
    print("Batch size:", len(roots))  # debug output
    node_list_list = []
    node_to_index_list = []
    for root in roots:
        node_list = []
        tree_util.depth_first_traverse(
            root, node_list,
            lambda node, node_list: node_list.append(node))
        node_list_list.append(node_list)
        node_to_index = helper.reverse_dict(node_list)
        node_to_index_list.append(node_to_index)

    feed_dict = {
        self.root_array: [tree_util.size_of_tree(root) for root in roots],
        self.is_leaf_array: helper.lists_pad(
            [[False] + [node.is_leaf for node in node_list]
             for node_list in node_list_list], False),
        self.word_index_array: helper.lists_pad(
            [[0] + [self.data.word_embed_util.get_idx(node.value)
                    for node in node_list]
             for node_list in node_list_list], 0),
        self.left_child_array: helper.lists_pad(
            [[0] + helper.add_one([
                node_to_index[node.left_child] if node.left_child is not None else -1
                for node in node_list
            ]) for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
        self.right_child_array: helper.lists_pad(
            [[0] + helper.add_one([
                node_to_index[node.right_child] if node.right_child is not None else -1
                for node in node_list
            ]) for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
        self.label_array: helper.lists_pad(
            [[[0, 0]] + [node.label for node in node_list]
             for node_list in node_list_list], [0, 0])
    }
    print(feed_dict[self.right_child_array])  # debug output
    return feed_dict
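# The "[0] + helper.add_one(...)" pattern above shifts all node indices by one
# so index 0 can act as the zero-padding node; a missing child encoded as -1
# then maps to 0 and gathers the padding state. A minimal sketch of the
# assumed helper (the real one may differ):
def add_one(indices):
    return [i + 1 for i in indices]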
def build_feed_dict(self, root):
    node_list = []
    tree_util.depth_first_traverse(
        root, node_list,
        lambda node, node_list: node_list.append(node))
    node_to_index = helper.reverse_dict(node_list)

    # TODO: there is probably something wrong here
    feed_dict = {
        self.root_array: [tree_util.size_of_tree(root) - 1],  # 0-based index of the root
        self.is_leaf_array: [node.is_leaf for node in node_list],
        self.word_index_array: [self.data.word_embed_util.get_idx(node.value)
                                for node in node_list],
        # TODO: maybe wrap this in a different way
        self.left_child_array: [
            node_to_index[node.left_child] if node.left_child is not None else -1
            for node in node_list
        ],
        self.right_child_array: [
            node_to_index[node.right_child] if node.right_child is not None else -1
            for node in node_list
        ],
        self.label_array: [node.label for node in node_list]
    }
    return feed_dict
from utils import data_util, tree_util, helper

_data_util = data_util.DataUtil()
data = _data_util.get_data()

# Sort the training trees by size and print the five largest
roots_size = [tree_util.size_of_tree(root) for root in data.train_trees]
roots = helper.sort_by(data.train_trees, roots_size)
for root in roots[-5:]:
    print(root.label)
    print(root.to_sentence())
    print()
def build_feed_dict(self, roots, sort=True, train=False):
    if sort:
        roots_size = [tree_util.size_of_tree(root) for root in roots]
        roots = helper.sort_by(roots, roots_size)
    roots_size = [tree_util.size_of_tree(root) for root in roots]
    # Pack several trees into each batch row; permutation maps back to input order
    roots_list, permutation = helper.greedy_bin_packing(
        roots, roots_size, np.max(roots_size))

    node_list_list = []
    node_to_index_list = []
    root_indices = []
    lstm_idx_list = []
    internal_nodes_array = []
    for i, bin_roots in enumerate(roots_list):
        node_list = []
        lstm_idx = [0]
        root_index = 0
        start = 0
        for root in bin_roots:
            tree_util.depth_first_traverse(
                root, node_list,
                lambda node, node_list: node_list.append(node))
            _, start = tree_util.get_preceding_lstm_index(
                root, start, start, lstm_idx)
            root_index += tree_util.size_of_tree(root)
            root_indices.append([i, root_index])
        node_list_list.append(node_list)
        node_to_index = helper.reverse_dict(node_list)
        node_to_index_list.append(node_to_index)
        lstm_idx_list.append(lstm_idx)
        # Collect (row, index) pairs for all internal nodes; +1 skips the padding node
        for node in node_list:
            if not node.is_leaf:
                internal_nodes_array.append([i, node_to_index[node] + 1])
    # Fall back to a dummy entry if the batch contains no internal nodes
    if len(internal_nodes_array) == 0:
        internal_nodes_array = [[0, 0]]

    feed_dict = {
        self.dropout_rate: FLAGS.dropout_prob if train else 0,
        self.leaf_word_array: helper.lists_pad(
            [[0] + [self.word_embed.get_idx(node.value)
                    for node in node_list if node.is_leaf]
             for node_list in node_list_list], 0),
        self.lstm_index_array: helper.lists_pad(lstm_idx_list, 0),
        self.loss_array: root_indices if self.use_root_loss else internal_nodes_array,
        self.root_array: root_indices,
        self.is_leaf_array: helper.lists_pad(
            [[0] + helper.to_int([node.is_leaf for node in node_list])
             for node_list in node_list_list], 0),
        self.word_index_array: helper.lists_pad(
            [[0] + [self.word_embed.get_idx(node.value) for node in node_list]
             for node_list in node_list_list],
            self.word_embed.get_idx("ZERO")),
        self.left_child_array: helper.lists_pad(
            [[0] + helper.add_one([
                node_to_index[node.left_child] if node.left_child is not None else -1
                for node in node_list
            ]) for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
        self.right_child_array: helper.lists_pad(
            [[0] + helper.add_one([
                node_to_index[node.right_child] if node.right_child is not None else -1
                for node in node_list
            ]) for node_list, node_to_index in zip(node_list_list, node_to_index_list)], 0),
        self.label_array: helper.lists_pad(
            [[[0, 0]] + [node.label for node in node_list]
             for node_list in node_list_list], [0, 0])
    }
    return feed_dict, permutation
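# Both build_feed_dict variants batch trees via helper.greedy_bin_packing.
# A minimal first-fit sketch of the assumed contract (the project's helper may
# differ, in particular in how the permutation is encoded): pack items into
# bins of the given capacity and return the bins plus a permutation recording
# each packed item's original position.
def greedy_bin_packing(items, sizes, capacity):
    bins, bin_sizes, bin_orig_idx = [], [], []
    for idx, (item, size) in enumerate(zip(items, sizes)):
        for b, used in enumerate(bin_sizes):
            if used + size <= capacity:  # first existing bin with enough room
                bins[b].append(item)
                bin_orig_idx[b].append(idx)
                bin_sizes[b] += size
                break
        else:  # no bin fits, open a new one
            bins.append([item])
            bin_orig_idx.append([idx])
            bin_sizes.append(size)
    # permutation[k] = original position of the k-th item in packed order
    permutation = [i for idxs in bin_orig_idx for i in idxs]
    return bins, permutation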
def get_data():
    if not os.path.exists(directories.CLASSIFIER_DATA_DIR):
        os.mkdir(directories.CLASSIFIER_DATA_DIR)
    data_dir = directories.CLASSIFIER_DATA(FLAGS.model_name)
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    if os.path.exists(data_dir + 'x_train.npy'):
        # Cached representations exist; load them
        x_train = np.load(data_dir + 'x_train.npy')
        y_train = np.load(data_dir + 'y_train.npy')
        x_val = np.load(data_dir + 'x_val.npy')
        y_val = np.load(data_dir + 'y_val.npy')
        x_test = np.load(data_dir + 'x_test.npy')
        y_test = np.load(data_dir + 'y_test.npy')
    else:
        _data_util = data_util.DataUtil()
        data = _data_util.get_data()

        # Sort each split by tree size
        roots_size = [tree_util.size_of_tree(root) for root in data.train_trees]
        data.train_trees = helper.sort_by(data.train_trees, roots_size)
        roots_size = [tree_util.size_of_tree(root) for root in data.val_trees]
        data.val_trees = helper.sort_by(data.val_trees, roots_size)
        roots_size = [tree_util.size_of_tree(root) for root in data.test_trees]
        data.test_trees = helper.sort_by(data.test_trees, roots_size)

        if FLAGS.use_gpu:
            config = None
        else:
            config = tf.ConfigProto(device_count={'GPU': 0})

        if FLAGS.word_embed_model == constants.WORD2VEC:
            word_embeddings = Word2Vec(mode=FLAGS.word_embed_mode,
                                       dimensions=FLAGS.word_embedding_size)
        elif FLAGS.word_embed_model == constants.FASTTEXT:
            word_embeddings = FastText(mode=FLAGS.word_embed_mode,
                                       dimensions=FLAGS.word_embedding_size)
        else:  # FLAGS.word_embed_model == constants.GLOVE
            word_embeddings = GloVe(mode=FLAGS.word_embed_mode,
                                    dimensions=FLAGS.word_embedding_size)

        g_tree = tf.Graph()
        with g_tree.as_default():
            model = None
            if FLAGS.model == constants.DEEP_RNN:
                model = deepRNN(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.BATCH_TREE_RNN:
                model = treeRNN_batch(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.NEERBEK_TREE_RNN:
                model = treeRNN_neerbek(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.TREE_LSTM:
                model = treeLSTM(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.TRACKER_TREE_RNN:
                model = treeRNN_tracker(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.TRACKER_TREE_LSTM:
                model = treeLSTM_tracker(data, word_embeddings, FLAGS.model_name)
            elif FLAGS.model == constants.LSTM:
                model = LSTM(data, word_embeddings, FLAGS.model_name)

            # Use the session config computed from FLAGS.use_gpu above
            with tf.Session(config=config) as sess:
                saver = tf.train.Saver()
                model.load_best(sess, saver, "validation")
                x_train = np.array(model.get_representation(data.train_trees, sess))
                y_train = np.array(get_labels(data.train_trees))
                x_val = np.array(model.get_representation(data.val_trees, sess))
                y_val = np.array(get_labels(data.val_trees))
                x_test = np.array(model.get_representation(data.test_trees, sess))
                y_test = np.array(get_labels(data.test_trees))

        # Cache the representations for the next run
        np.save(data_dir + 'x_train', x_train)
        np.save(data_dir + 'y_train', y_train)
        np.save(data_dir + 'x_val', x_val)
        np.save(data_dir + 'y_val', y_val)
        np.save(data_dir + 'x_test', x_test)
        np.save(data_dir + 'y_test', y_test)

    return {
        'train': (x_train, y_train),
        'val': (x_val, y_val),
        'test': (x_test, y_test)
    }
def select_data(self, data, cut_off, cluster_predictions=None):
    roots_size = [tree_util.size_of_tree(root) for root in data]
    data = np.array(helper.sort_by(data, roots_size))
    t = time()

    if cluster_predictions is None:
        # Get sentence representations batch by batch
        representations, labels, permutations = [], [], []
        batch_size = 500
        batches = helper.batches(data, batch_size, perm=False)
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} (batches: {n_fmt}/{total_fmt}) ',
            total=len(batches))
        for i, batch in enumerate(batches):
            feed_dict, permuts = self.model.build_feed_dict(batch, sort=True)
            reps, labs = self.session.run(
                [self.model.sentence_representations, self.model.labels],
                feed_dict=feed_dict)
            representations.extend(reps)
            labels.extend(labs)
            permutations.extend(list(i * batch_size + np.array(permuts)))
            pbar.update(1)
        pbar.close()
        print()
        self.representations = np.array(representations)[permutations]
        self.labels = np.array(performance.get_prediction(np.array(labels)))[permutations]

        # Cluster the representations; retry while one cluster holds more than
        # 80% of the data, giving up after ten attempts
        try_cluster = True
        tries = 10
        while try_cluster:
            tries -= 1
            self.cluster_predictions = self.cluster_model.cluster(self.representations)
            if (np.bincount(self.cluster_predictions).max()
                    <= 0.8 * len(self.representations) or tries <= 0):
                try_cluster = False
    else:
        self.cluster_predictions = cluster_predictions
        self.labels = tree_util.get_labels(data)

    # Compute MFO (most frequent outcome) per cluster, keyed by cluster id so
    # the labels stay paired with the right cluster after sorting
    cluster_mfo = []
    cluster_mfo_labels = {}
    for i in range(self.num_clusters):
        mfo, l = self.mfo(i)
        cluster_mfo.append((i, mfo))
        cluster_mfo_labels[i] = l

    cluster_mfo.sort(key=lambda el: el[1], reverse=True)
    helper._print('Cluster MFO scores:')
    for k, mfo in cluster_mfo:
        helper._print(
            f'\tCluster {k}: {mfo}, highest label: {cluster_mfo_labels[k]}, '
            f'size: {len(self.labels[self.cluster_predictions == k])}/{len(data)}')

    # Keep only the clusters whose MFO score is below the cut-off
    removed_percent = 0
    data_to_use = []
    for cluster, acc in cluster_mfo:
        removed_percent += len(data[self.cluster_predictions == cluster]) / len(data)
        if acc < cut_off:
            data_to_use.extend(data[self.cluster_predictions == cluster])

    helper._print(
        f'Done selecting data for training. Overall time used for selection is '
        f'{int((time() - t) / 60)} minutes and {int((time() - t) % 60)} seconds')
    return data_to_use, self.cluster_predictions
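# select_data ranks clusters by "MFO" (most frequent outcome): the share of the
# dominant label inside a cluster. A hedged sketch of what self.mfo is assumed
# to compute, reusing self.labels / self.cluster_predictions from above (the
# real method may differ):
import numpy as np

def mfo(self, cluster):
    cluster_labels = self.labels[self.cluster_predictions == cluster]
    if len(cluster_labels) == 0:
        return 0.0, None
    values, counts = np.unique(cluster_labels, return_counts=True)
    best = int(np.argmax(counts))
    return counts[best] / len(cluster_labels), values[best]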