Example 1
 def compute(self, data_set, data, model, _print=False):
     if FLAGS.use_gpu:
         # On GPU the whole split can be evaluated in a single pass.
         feed_dict, _ = model.build_feed_dict(data)
         acc, loss = self.sess.run([model.acc, model.loss], feed_dict=feed_dict)
         self.add(data_set, acc, loss)
     else:
         # On CPU, fall back to micro-batches of 2 to bound memory use.
         for batch in helper.batches(data, 2):
             feed_dict, _ = model.build_feed_dict(batch)
             acc, loss = self.sess.run([model.acc, model.loss], feed_dict=feed_dict)
             self.add(data_set, acc, loss)
     self.write_and_reset(data_set, _print=_print)
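
Every example on this page leans on helper.batches, whose implementation is not shown here. A minimal sketch consistent with the call sites below (positional batch_size, optional perm shuffling, optional use_tail for the final short batch; the defaults are assumptions, not the project's code):

    import numpy as np

    def batches(data, batch_size, use_tail=True, perm=False):
        """Hypothetical sketch of helper.batches, inferred from its call sites.

        perm: shuffle the data before splitting it into batches.
        use_tail: keep the final batch even if it holds fewer than
        batch_size elements.
        """
        data = np.random.permutation(data) if perm else np.asarray(data)
        out = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
        if not use_tail and out and len(out[-1]) < batch_size:
            out = out[:-1]
        return out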
Example 2
 def predict_and_label(self, data, sess):
     helper._print_subheader("Predicting")
     prob, labels = [], []
     # Large batches are safe on GPU; on CPU stay tiny to bound memory.
     batches = helper.batches(data,
                              batch_size=500 if FLAGS.use_gpu else 2,
                              use_tail=True,
                              perm=False)
     for batch in batches:
         feed_dict, _ = self.build_feed_dict(batch)
         p, l = sess.run([self.p, self.labels], feed_dict=feed_dict)
         prob.extend(p)
         labels.extend(l)
     return prob, labels
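
predict_and_label returns raw per-class outputs rather than hard decisions. A hedged usage sketch, assuming p and labels are per-class score / one-hot vectors (the source does not confirm their shapes):

    import numpy as np

    # prob, labels come from model.predict_and_label(data, sess)
    def accuracy(prob, labels):
        pred = np.argmax(np.asarray(prob), axis=1)
        true = np.argmax(np.asarray(labels), axis=1)
        return float(np.mean(pred == true))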
Example 3
    def train(self, train_data):
        helper._print("Learning rate:", self.sess.run(self.model.lr))
        done = False
        run_time = 0
        while not done:
            batches = helper.batches(train_data, self.batch_size, perm=True)
            pbar = tqdm(
                bar_format=
                "(Training) {percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt})",
                total=len(batches))
            for step, batch in enumerate(batches):
                self.summary.batch_inc()
                feed_dict, _ = self.model.build_feed_dict(batch, train=True)
                start_run_time = time.time()

                _, acc, loss = self.sess.run(
                    [self.model.train_op, self.model.acc, self.model.loss],
                    feed_dict=feed_dict)
                self.summary.add(self.summary.TRAIN, acc, loss)

                end_run_time = time.time()
                run_time += end_run_time - start_run_time

                pbar.update(1)
            pbar.close()
            print()

            # pbar = tqdm(
            #     bar_format="(Accuracy) {percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt})",
            #     total=len(batches))
            # for step, batch in enumerate(batches):
            #     acc_feed_dict, _ = self.model.build_feed_dict(batch)
            #     acc, loss = self.sess.run([self.model.acc, self.model.loss],
            #                               feed_dict=acc_feed_dict)
            #     self.summary.add(self.summary.TRAIN, acc, loss)
            #     pbar.update(1)
            # pbar.close()
            # print()
            # Reload the last good checkpoint if the epoch produced a NaN
            # loss; otherwise save a temporary checkpoint and stop looping.
            if not self.summary.write_and_reset(self.summary.TRAIN,
                                                _print=True):
                helper._print("NaN loss encountered, trying again...")
                self.model.load_tmp(self.sess, self.saver)
            else:
                done = True
                self.model.save_tmp(self.sess, self.saver)

            helper._print(
                "Training time:",
                str(int(run_time / 60)) + "m " + str(int(run_time % 60)) + "s")
        return run_time
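
The save_tmp/load_tmp pair that backs the NaN rollback is not shown in these examples. A minimal sketch of what such methods could look like with tf.train.Saver (the checkpoint path is a made-up placeholder):

    import tensorflow as tf

    class ModelCheckpointMixin:
        TMP_CKPT = "trained_models/tmp/model.ckpt"  # hypothetical path

        def save_tmp(self, sess, saver):
            # Persist the current weights so a NaN epoch can be rolled back.
            saver.save(sess, self.TMP_CKPT)

        def load_tmp(self, sess, saver):
            # Restore the last known-good weights.
            saver.restore(sess, self.TMP_CKPT)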
Example 4
 def get_representation(self, data, sess):
     # Collect sentence representations batch by batch, preserving order
     # (no permutation, tail batch kept).
     rep = []
     batches = helper.batches(data, batch_size=2, use_tail=True, perm=False)
     pbar = tqdm(
         bar_format=
         "(Representation) {percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt})",
         total=len(batches))
     for batch in batches:
         feed_dict, _ = self.build_feed_dict(batch)
         r = sess.run(self.sentence_representations, feed_dict=feed_dict)
         rep.extend(r)
         pbar.update(1)
     pbar.close()
     return rep
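
Because the batching neither permutes nor drops data, the returned list lines up with the input, so it can be stacked directly. A hedged usage sketch (rep_dim is whatever width the model emits):

    import numpy as np

    reps = np.asarray(model.get_representation(data, sess))
    # reps.shape == (len(data), rep_dim); row i belongs to data[i]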
Example 5
    def select_data(self, data, cut_off, cluster_predictions=None):
        roots_size = [tree_util.size_of_tree(root) for root in data]
        data = np.array(helper.sort_by(data, roots_size))

        t = time()
        if cluster_predictions is None:

            # Get representations
            representations, predictions, labels, permutations = [], [], [], []
            batch_size = 500
            batches = helper.batches(data, batch_size, perm=False)
            pbar = tqdm(
                bar_format=
                '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} (batches: {n_fmt}/{total_fmt}) ',
                total=len(batches))

            for i, batch in enumerate(batches):
                feed_dict, permuts = self.model.build_feed_dict(batch,
                                                                sort=True)
                reps, labs = self.session.run(
                    [self.model.sentence_representations, self.model.labels],
                    feed_dict=feed_dict)
                representations.extend(reps)
                labels.extend(labs)
                permutations.extend(list(i * batch_size + np.array(permuts)))
                pbar.update(1)
            pbar.close()
            print()

            self.representations = np.array(representations)[permutations]
            self.labels = np.array(performance.get_prediction(
                np.array(labels)))[permutations]

            # Get clusters

            # Re-cluster until no single cluster swallows more than 80% of
            # the data, giving up after 10 attempts.
            try_cluster = True
            tries = 10
            while try_cluster:
                tries -= 1
                self.cluster_predictions = self.cluster_model.cluster(
                    self.representations)
                if np.bincount(self.cluster_predictions).max() <= 0.8 * len(
                        self.representations) or tries <= 0:
                    try_cluster = False

        else:
            self.cluster_predictions = cluster_predictions
            self.labels = tree_util.get_labels(data)

        # Get acc of clusters
        cluster_mfo = []
        cluster_mfo_labels = []
        for i in range(self.num_clusters):
            mfo, l = self.mfo(i)
            cluster_mfo.append((i, mfo))
            cluster_mfo_labels.append((i, l))

        # Return data
        cluster_mfo.sort(key=lambda el: el[1], reverse=True)
        helper._print('Cluster MFO scores:')
        for (k, mfo), (_, l) in zip(cluster_mfo, cluster_mfo_labels):
            helper._print(
                f'\tCluster {k}: {mfo}, highest label: {l}, size: {len(self.labels[self.cluster_predictions == k])}/{len(data)}'
            )

        removed_percent = 0
        data_to_use = []
        for cluster, acc in cluster_mfo:
            cluster_data = data[self.cluster_predictions == cluster]
            removed_percent += len(cluster_data) / len(data)
            if acc < cut_off:
                data_to_use.extend(cluster_data)

        helper._print(
            f'Done selecting data for training. Overall time used for selection is {int((time() - t)/60)} minutes and {int((time() - t) % 60)} seconds'
        )
        return data_to_use, self.cluster_predictions
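
self.mfo(i) (most-frequent-outcome per cluster) is not shown on this page. A sketch consistent with how its two return values are used above, score first and then the majority label; this is an assumed reconstruction, not the project's code:

    import numpy as np

    def mfo(self, cluster):
        # Fraction of the cluster carrying its most frequent label,
        # plus that label (hypothetical reconstruction).
        cluster_labels = self.labels[self.cluster_predictions == cluster]
        if len(cluster_labels) == 0:
            return 0.0, None
        counts = np.bincount(cluster_labels)
        return counts.max() / len(cluster_labels), int(np.argmax(counts))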
Example 6
    def train_old(self):
        helper._print_header("Training tRNN")
        helper._print("Test ratio:",
                      tree_util.ratio_of_labels(self.data.test_trees))
        helper._print("Validation ratio:",
                      tree_util.ratio_of_labels(self.data.val_trees))
        helper._print("Train ratio:",
                      tree_util.ratio_of_labels(self.data.train_trees))

        # todo make a flag for this
        config = tf.ConfigProto(device_count={'GPU': 0})  # forces CPU-only execution

        with tf.Session(config=config) as sess:
            model_placement = FLAGS.models_dir + FLAGS.model_name + "model.ckpt"

            # Summary writers for train/validation/test acc and loss (TensorBoard)
            self.make_needed_dir()
            directory = FLAGS.logs_dir + FLAGS.model_name
            train_writer = tf.summary.FileWriter(directory + 'train',
                                                 sess.graph)
            validation_writer = tf.summary.FileWriter(directory + 'validation')
            test_writer = tf.summary.FileWriter(directory + 'test')

            history = self.get_history()
            starting_steps = 0
            best_acc = 0

            # Run the init
            saver = tf.train.Saver()
            self.run_tensorboard()
            if FLAGS.load_model:
                history, starting_steps, best_acc = self.load_history()
                helper._print("Previously", starting_steps,
                              "steps has been ran, best acc was:", best_acc)

                self.load_model(sess, model_placement, saver)
                self.write_history_to_summary(history, train_writer,
                                              validation_writer, test_writer)
                sess.run(tf.assign(self.global_step, starting_steps))
            else:
                sess.run(self.init)
                self.handle_val_test(history, sess, test_writer, 0,
                                     validation_writer)

            start_time = time.time()
            loss_total = 0
            acc_total = 0
            for epoch in range(FLAGS.epochs):
                helper._print_header("Epoch " + str(epoch + 1))

                batch_size = (FLAGS.batch_size if epoch >= 10 else 1)

                print_interval = FLAGS.print_step_interval / batch_size
                for step, tree in enumerate(
                        helper.batches(
                            np.random.permutation(self.data.train_trees),
                            batch_size)):  # todo build train get_trees
                    if step % int(print_interval) == 0:
                        total_step = starting_steps + epoch * int(
                            len(self.data.train_trees)) + step * batch_size
                        helper._print("Step:", total_step)
                        helper._print("Learning rate:",
                                      sess.run(self.learning_rate))

                        avg_acc = acc_total / print_interval
                        avg_loss = loss_total / print_interval
                        if epoch != 0 or step != 0:
                            self.write_to_summary(avg_acc, avg_loss,
                                                  total_step, train_writer)
                            helper._print("Train -  acc:", avg_acc, "loss:",
                                          avg_loss)
                            history["train"].append(
                                (total_step, avg_acc, avg_loss))

                            val_acc = self.handle_val_test(
                                history, sess, test_writer, total_step,
                                validation_writer)

                            loss_total = 0
                            acc_total = 0

                            if val_acc > best_acc:
                                best_acc = val_acc
                                helper._print("A better model was found!")

                                saver.save(sess, model_placement)

                                np.savez(FLAGS.histories_dir +
                                         FLAGS.model_name + 'history.npz',
                                         train=history["train"],
                                         test=history["test"],
                                         val=history["val"],
                                         total_steps=total_step,
                                         best_acc=best_acc)

                                helper._print("Model saved!")

                    feed_dict = self.build_feed_dict_batch(
                        tree)  # todo maybe change to batches
                    _, acc, loss = sess.run(
                        [self.train_op, self.acc, self.loss],
                        feed_dict=feed_dict)

                    acc_total += acc
                    loss_total += loss

                helper._print("Avg Epoch Time:",
                              (time.time() - start_time) / (epoch + 1) / 60,
                              "m")