    def trainIters(self, n_iters, model_file_path=None):
        print("trainIters__Started___model_file_path is : ", model_file_path)
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        print("Max iteration : n_iters = ", n_iters)
        print("going to start running iter NO : ", iter)
        print("\n******************************\n")
        while iter < n_iters:
            print("\n###################################\n")
            print("iter : ", iter)
            batch = self.batcher.next_batch()
            print("batch data loading : ", batch)
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            print("running_avg_loss : ", running_avg_loss)
            iter += 1
            if iter % 100 == 0:  ##100
                self.summary_writer.flush()
            print_interval = 100  #1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 500 == 0:  ##5000
                self.save_model(running_avg_loss, iter)
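Every snippet in this listing calls a calc_running_avg_loss helper that is never shown. In the pointer-generator trainers these examples are drawn from, it maintains an exponential moving average of the per-batch loss. The following is a minimal sketch under that assumption; the decay of 0.99, the clip at 12, and the SummaryWriter-style add_scalar call are all assumptions, and the three-argument call sites below pass the step in the third slot and skip logging entirely.

def calc_running_avg_loss(loss, running_avg_loss, summary_writer=None, step=0,
                          decay=0.99):
    # Sketch: exponential moving average of the batch loss.
    if running_avg_loss == 0:
        # First batch: seed the average with the raw loss.
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    # Clip so an early loss spike does not dominate the logged curve.
    running_avg_loss = min(running_avg_loss, 12)
    if summary_writer is not None:
        # Assumes a writer with add_scalar(tag, value, step), e.g.
        # torch.utils.tensorboard.SummaryWriter.
        summary_writer.add_scalar('running_avg_loss', running_avg_loss, step)
    return running_avg_loss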
Example #2
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if math.isnan(running_avg_loss):
                print('Loss became NaN; restarting training from {}'
                      .format(self.last_good_model_save_path))
                iter, running_avg_loss = self.setup_train(self.last_good_model_save_path)
                start = time.time()

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (iter, print_interval,
                                                                           time.time() - start, loss))
                start = time.time()
            if iter % 1000 == 0:
                self.save_model(running_avg_loss, iter)
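The NaN-recovery branch above only works if save_model records where the last good checkpoint went. A minimal sketch of such a checkpoint writer, assuming a PyTorch model split into encoder and decoder and a self.model_dir attribute (the exact attributes and signatures vary across these examples):

import os
import torch

def save_model(self, running_avg_loss, iter):
    # Sketch: persist model, optimizer, and bookkeeping state at `iter`.
    state = {
        'iter': iter,
        'current_loss': running_avg_loss,
        'encoder_state_dict': self.model.encoder.state_dict(),
        'decoder_state_dict': self.model.decoder.state_dict(),
        'optimizer': self.optimizer.state_dict(),
    }
    save_path = os.path.join(self.model_dir, 'model_%d' % iter)
    torch.save(state, save_path)
    # Remember the last checkpoint written while the loss was finite, so
    # the NaN handler in trainIters can restart from it.
    self.last_good_model_save_path = save_path
    return save_path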
Example #3
    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        print(
            "-----------------------------------------STARTING EVALUATION---------------------------------------"
        )
        with open(config.eval_log, 'a+', encoding='utf-8') as f:
            f.write(
                "-----------------------------------------STARTING EVALUATION---------------------------------------"
                + "\n")
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            if iter % 20 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start,
                       running_avg_loss))
                start = time.time()
                with open(config.eval_log, 'a+', encoding='utf-8') as f:
                    f.write("Steps: " + str(iter) + "   loss: " +
                            str(running_avg_loss) + "\n")
            if (iter + 1) % config.max_iterations_eval == 0:
                break
            batch = self.batcher.next_batch()
        return running_avg_loss
Example #4
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            # print("iteration", iter)
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 10000 == 0:
                self.summary_writer.flush()

            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                logging.info(
                    'steps %d, seconds for %d batch: %.2f , loss: %f' %
                    (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 10000 == 0:
                self.save_model(running_avg_loss, iter)
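Every trainIters variant starts from setup_train, which the listing also omits. A sketch of what it plausibly does, assuming a hypothetical Model class and a config.lr field, with checkpoint keys mirroring the save_model sketch earlier:

import torch

def setup_train(self, model_file_path=None):
    # Sketch: build model and optimizer, optionally restore a checkpoint,
    # and return the starting iteration and running average loss.
    self.model = Model()  # hypothetical model class
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config.lr)
    start_iter, start_loss = 0, 0
    if model_file_path is not None:
        state = torch.load(model_file_path, map_location='cpu')
        start_iter = state['iter']
        start_loss = state['current_loss']
        self.model.encoder.load_state_dict(state['encoder_state_dict'])
        self.model.decoder.load_state_dict(state['decoder_state_dict'])
        self.optimizer.load_state_dict(state['optimizer'])
    return start_iter, start_loss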
Example #5
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        best_loss = 20
        best_iter = 0
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, iter)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

#            is_new_best = (running_avg_loss < best_loss) and (iter - best_iter >= 100)
#            best_loss = min(running_avg_loss, best_loss)

            if iter % 20 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (iter, print_interval,
                                                                           time.time() - start, loss))
                start = time.time()
            if iter % 2500 == 0:
                self.save_model(running_avg_loss, iter)
            if loss < 2.0:
                self.save_best_so_far(running_avg_loss, iter)
Example #6
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()

        # data_path = "lib/data/batches_train.vocab50000.batch16.pk.bin"
        # with open(data_path, 'rb') as f:
        #     stored_batches = pickle.load(f, encoding="bytes")
        # print("loaded data: {}".format(data_path))
        # num_batches = len(stored_batches)

        while iter < n_iters:
            batch = self.batcher.next_batch()
            # batch_id = iter%num_batches
            # batch = stored_batches[batch_id]

            loss = self.train_one_batch(batch)

            # running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)

            iter += 1

            # if iter % 100 == 0:
            #     self.summary_writer.flush()

            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
                sys.stdout.flush()

            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)

        print("Finished training!")
Example #7
    def run_eval(self):
        running_avg_loss, iter = 0, 0
        batch_losses = []
        # while batch is not None:
        for _ in range(835):
            batch = self.batcher.next_batch()

            loss = self.eval_one_batch(batch)
            batch_losses.append(loss)
            # running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            iter += 1

            # if iter % 100 == 0:
            #     self.summary_writer.flush()

            print_interval = 10
            if iter % print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(
                    str(datetime.now()), iter, loss))

        avg_loss = sum(batch_losses) / len(batch_losses)
        print("Finished Eval for Model {}: Avg Loss = {:.5f}".format(
            self.model_file_path, avg_loss))
Example #8
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            #print ("Iteration:",iter) #CM - debugging

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                print("Iteration:", iter)  #CM - debugging

                self.save_model(running_avg_loss, iter)
            #CM - debugging - if reach the end before hitting 5000, write the model out
            elif iter == n_iters:
                self.save_model(running_avg_loss, iter)
Example #9
    def trainIters(self, n_iters, model_file_path=None):
        start_iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        best_val_loss = None
        for iter in tqdm(range(start_iter, n_iters)):
            self.model.train()
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            #iter += 1

            print_interval = 1000
            if iter != 0 and iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter != 0 and iter % 5000 == 0:
                loss = self.run_eval()
                if best_val_loss is None or loss < best_val_loss:
                    best_val_loss = loss
                    self.save_model(running_avg_loss, iter)
                    print("Saving best model")
Example #10
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()

        start_iter = iter
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss, tau = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if config.DEBUG:
                debug('iter', iter)
                if iter - start_iter > config.BREAK_POINT:
                    break

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                if config.adaptive_sparsemax:
                    print('tau + eps', [
                        round(e[0], 4)
                        for e in (tau +
                                  config.eps).detach().cpu().numpy().tolist()
                    ])
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
Example #11
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, iter)
            val_loss = None
            if iter % 100 == 0:
                val_batch = self.val_batcher.next_batch()
                val_loss = self.eval_one_batch(val_batch)
                # print("val_loss",val_loss)
                self.scheduler.step()
                print("lr", self.scheduler.get_lr())

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1
            if iter % print_interval == 0:
                if val_loss is not None:
                    print(
                        'steps %d, seconds for %d batch: %.2f , loss: %f , eval_loss: %f'
                        % (iter, print_interval, time.time() - start, loss,
                           val_loss))
                else:
                    print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                          (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 1000 == 0:
                self.save_model(running_avg_loss, iter)
Example #12
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()

        ami_data = load_ami_data('train')
        valid_data = load_ami_data('valid')
        # make the training data 100
        random.shuffle(valid_data)
        ami_data.extend(valid_data[:6])
        valid_data = valid_data[6:]

        num_batches = len(ami_data)
        idx = 0

        # validation & stopping
        best_valid_loss = 1000000000
        stop_counter = 0

        while iter < n_iters:
            if idx == 0:
                print("shuffle training data")
                random.shuffle(ami_data)

            loss = self.train_one_batch(ami_data, idx)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)

            iter += 1
            idx += config.batch_size
            if idx >= num_batches: idx = 0

            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(
                    str(datetime.now()), iter, loss))
                sys.stdout.flush()

            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)

            if iter % config.eval_every == 0:
                valid_loss = self.run_eval(valid_data)
                print("valid_loss = {:.5f}".format(valid_loss))
                if valid_loss < best_valid_loss:
                    stop_counter = 0
                    best_valid_loss = valid_loss
                    print("VALID better")
                else:
                    stop_counter += 1
                    print(
                        "VALID NOT better, counter = {}".format(stop_counter))
                    if stop_counter == config.stop_after:
                        print("Stop training")
                        return

        print("Finished training!")
Example #13
    def run_eval(self):
        running_avg_loss, iter = 0, 0
        self.model.eval()
        self.eval_batcher._finished_reading = False
        self.eval_batcher.setup_queues()
        batch = self.eval_batcher.next_batch()
        while batch is not None:
            loss = self.get_loss(batch)
            if loss is not None:
                running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss, iter)
                iter += 1
            batch = self.eval_batcher.next_batch()
        msg = 'Eval: loss: %f' % running_avg_loss
        print(msg)
        return running_avg_loss
Example #14
    def run_eval(self, eval_data):
        running_avg_loss, iter = 0, 0
        batch_losses = []
        num_batches = len(eval_data)
        print("valid data size = {}".format(num_batches))
        for idx in range(num_batches):
            loss = self.eval_one_batch(eval_data, idx)
            batch_losses.append(loss)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("#", end="")
            sys.stdout.flush()
        print()

        avg_loss = sum(batch_losses) / len(batch_losses)
        return avg_loss
Example #15
    def trainIters(self, n_src_vocab, n_tgt_vocab, n_iters, model_file_path=None):

        print("Setting up the model...")

        iter, running_avg_loss = self.setup_train(n_src_vocab, n_tgt_vocab, model_file_path)

        print("Starting training...")
        print("Data for this model will be stored in", self.model_dir)
        
        start = time.time()

        #only_batch = None
        losses = []
        iters = []
        save_name = os.path.join(self.model_dir, "loss_lists")

        while iter < n_iters:
            batch = self.batcher.next_batch()
            
            # if iter == 0:
            #     only_batch = batch
            # else:
            #     batch = only_batch

            loss = self.train_one_batch(batch, iter)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1
            
            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 50
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (iter, print_interval,
                                                                           time.time() - start, loss))
                start = time.time()
                
                iters.append(iter)
                losses.append(loss)

                with open(save_name, 'wb') as f:
                    pickle.dump((losses, iters), f)
            
            if iter % 5000 == 0:
                path = self.save_model(running_avg_loss, iter)

                print("Saving Checkpoint at {}".format(path))
Example #16
    def run_eval(self, model_dir, train_iter_id):
        dataloader = DataLoader(self.dataset,
                                batch_size=config.batch_size,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=create_batch_collate(
                                    self.vocab, config.batch_size))
        running_avg_loss, iter = 0, 0
        start = time.time()
        # batch = self.batcher.next_batch()
        pg_losses = []
        run_avg_losses = []
        for batch in dataloader:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("Iteration:", iter, "  loss:", loss, "  Running avg loss:",
                  running_avg_loss)
            iter += 1

            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start,
                       running_avg_loss))
                start = time.time()

            pg_losses.append(loss)
            run_avg_losses.append(running_avg_loss)

        # Dump val losses
        with open(os.path.join(model_dir,
                               'val_pg_losses_{}.p'.format(train_iter_id)),
                  'wb') as f:
            pickle.dump(pg_losses, f)
        with open(os.path.join(model_dir,
                               'val_run_avg_losses_{}.p'.format(train_iter_id)),
                  'wb') as f:
            pickle.dump(run_avg_losses, f)

        return run_avg_losses
Example #17
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)

        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(
                    str(datetime.now()), iter, loss))
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
Example #18
    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
Example #19
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
            print("Iteration:", iter, "  loss:", loss, "  Running avg loss:", running_avg_loss)
            iter += 1

            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (iter, print_interval,
                                                                           time.time() - start, loss))
                start = time.time()
            if iter % 1000 == 0:
                self.save_model(running_avg_loss, iter)
Example #20
    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
Example #21
    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 10 == 0:
                self.summary_writer.flush()
            print_interval = 1
            if iter % print_interval == 0:
                print('iters = %d, time = %s , loss: %f' %
                      (iter, time_since(start), running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
Example #22
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 10 == 0:
                self.summary_writer.flush()
            print_interval = 10
            if iter % print_interval == 0:
                print('iters = %d, time = %s, loss: %f' %
                      (iter, time_since(start), loss))
            if iter % 10 == 0:
                self.save_model(running_avg_loss, iter)
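Examples #21 and #22 format elapsed time with a time_since helper rather than raw seconds. A minimal sketch, assuming the common "Xm Ys" rendering:

import time

def time_since(since):
    # Sketch: render the seconds elapsed since `since` as 'Xm Ys'.
    s = time.time() - since
    m = int(s // 60)
    return '%dm %ds' % (m, int(s - m * 60))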
Example #23
    def on_backward_begin(self, loss):
        """
        :param loss: scalar loss tensor for the current training step
        :return: None
        """
        print("|epoch: %d  step: %d  loss: %.4f|" %
              (self.epoch, self.step, loss.item()))
        if not np.isfinite(loss.item()):
            logger.error("train Loss is not finite. Stopping.")
            logger.info(loss.item())
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    logger.info(name)
                    logger.info(param.grad.data.sum())
            raise Exception("train Loss is not finite. Stopping.")

        self.running_avg_loss = calc_running_avg_loss(loss.item(),
                                                      self.running_avg_loss,
                                                      self.summary_writer,
                                                      self.step)
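on_backward_begin is a trainer callback: it fires after the forward pass, just before loss.backward(). A self-contained toy showing where such a hook sits in a training step; NanGuard and the linear model are hypothetical stand-ins, not the callback class used above:

import numpy as np
import torch

class NanGuard:
    # Hypothetical stand-in for the callback above: abort on a non-finite loss.
    def __init__(self):
        self.epoch, self.step = 0, 0

    def on_backward_begin(self, loss):
        if not np.isfinite(loss.item()):
            raise RuntimeError("train Loss is not finite. Stopping.")

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
guard = NanGuard()
for step in range(3):
    loss = model(torch.randn(8, 4)).pow(2).mean()
    guard.step = step
    guard.on_backward_begin(loss)  # hook fires just before backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()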
Example #24
    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("Iteration:", iter, "  loss:", loss, "  Running avg loss:",
                  running_avg_loss)
            iter += 1

            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start,
                       running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
Example #25
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()

        for iter in tqdm(range(n_iters), total=n_iters, desc='Training'):

            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (iter, print_interval,
                                                                           time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
Example #26
    def run_eval(self):

        self.model.eval()
        batch = self.batcher_eval.next_batch()
        iter = 0
        start = time.time()
        running_avg_loss = 0
        with torch.no_grad():
            while batch is not None:
                loss, _ = self.model_batch_step(batch, False)
                loss = loss.item()
                running_avg_loss = calc_running_avg_loss(loss, running_avg_loss)
                batch = self.batcher_eval.next_batch()

                iter += 1
                if iter % config.print_interval == 0:
                    print('Eval steps %d, seconds for %d batch: %.2f , loss: %f' % (
                        iter, config.print_interval, time.time() - start, running_avg_loss))
                    start = time.time()

        return running_avg_loss
Example #27
    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            batch_ds = self.ds_batcher.next_batch()
            loss = self.train_one_batch(batch, iter, batch_ds)
            loss = loss.cpu()

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 5
            if iter % print_interval == 0:
                print('steps %d , loss: %f' % (iter, loss))
                start = time.time()
            if iter % 50000 == 0:
                self.save_model(running_avg_loss, iter)
Example #28
    def on_backward_begin(self, loss):
        self.loss_update_every.append(loss.item())
        if not np.isfinite(loss.item()):
            logger.error("train Loss is not finite. Stopping.")
            logger.info(loss.item())
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    logger.info(name)
                    logger.info(param.grad.data.sum())
            raise Exception("train Loss is not finite. Stopping.")

        if self.step % self.update_every == 0:
            assert len(self.loss_update_every) == self.update_every
            loss_batch = sum(self.loss_update_every)
            self.loss_update_every = []
            # report the loss
            if self.step < 10 or self.step % 1000 == 0:
                logger.info(
                    "|epoch: %d  step: %d  log_loss: %.4f |" %
                    (self.epoch, self.step / self.update_every, loss_batch))
            self.running_avg_loss = calc_running_avg_loss(
                loss_batch, self.running_avg_loss,
                self.step / self.update_every)
Example #29
    def trainIters(self, n_iters, model_file_path=None):
        min_loss = float(100000)
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 20 == 0:
                self.summary_writer.flush()

            print_interval = 1000
            if config.fix_bug:
                print_interval = 100  #100

            if iter % print_interval == 0:
                with open(config.train_log, 'a+', encoding='utf-8') as f:
                    f.write("steps: " + str(iter) + "   loss: " + str(loss) +
                            "\n")
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()

            if iter % 5000 == 0:  #5000
                model_save_path = self.save_model(running_avg_loss, iter)
                eval_process = eval.Evaluate(model_save_path, self.vocab)
                curr_loss = eval_process.run_eval()
                if curr_loss < min_loss:
                    min_loss = curr_loss
                    print("Loss update: ", min_loss)
                    with open(config.best_model_log, 'a+',
                              encoding='utf-8') as f:
                        f.write(model_save_path + "\nLoss in eval data: " +
                                str(min_loss) + "\n\n")
Example #30
    def trainIters(self, n_iters, model_file_path=None, evaluate=False):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        LOGGER.info('Starting training from iteration {} of {}'.format(iter, n_iters))

        while iter < n_iters:
            iter_start = get_time()
            LOGGER.debug('Starting iteration {} at time {}'.format(
                iter + 1, iter_start))
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                LOGGER.info('steps %d, seconds for %d batch: %.2f , loss: %f' %
                            (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                LOGGER.info('Saving model at iteration = {}'.format(iter + 1))
                model_path = self.save_model(running_avg_loss, iter)

                if evaluate:
                    beam_search_processor = BeamSearch(model_path)
                    beam_search_processor.decode()

            iter_end = get_time()
            LOGGER.debug('Iteration {} ended at time {}'.format(
                iter + 1, iter_end))
            LOGGER.debug('Time taken for iteration {} = {}'.format(
                iter + 1, time_diff_as_minutes(iter_start, iter_end)))
Example #31
    def train(self, model_path=None):

        train_iter = self.get_train_dataloader()
        iter, running_avg_loss = 0, 0
        start = time.time()
        for epoch in range(self.args.epoches):
            print(f"Epoch: {epoch+1}")
            self.model.train()
            for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
                # print(batch.source[0].size())
                # exit()
                batch_size = batch.batch_size
                # encoder part
                enc_padding_mask = self.get_mask(batch.source)
                enc_batch = batch.source[0]
                enc_lens = batch.source[1]
                encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
                    enc_batch, enc_lens)
                s_t_1 = self.model.reduce_state(encoder_hidden)
                coverage = Variable(torch.zeros(batch.source[0].size())).to(
                    self.args.device)
                c_t_1 = Variable(
                    torch.zeros(
                        (batch_size,
                         2 * self.args.hidden_dim))).to(self.args.device)
                extra_zeros, enc_batch_extend_vocab, max_art_oovs = self.get_extra_features(
                    batch.source[0])
                extra_zeros = extra_zeros.to(self.args.device)
                enc_batch_extend_vocab = enc_batch_extend_vocab.to(
                    self.args.device)
                # decoder part
                dec_batch = batch.target[0][:, :-1]
                # print(dec_batch.size())
                target_batch = batch.target[0][:, 0:]
                dec_lens_var = batch.target[1]
                dec_padding_mask = self.get_mask(batch.target)
                max_dec_len = max(dec_lens_var)

                step_losses = []
                for di in range(min(max_dec_len, self.args.max_dec_steps) - 1):
                    y_t_1 = dec_batch[:, di]  # Teacher forcing
                    final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                        y_t_1, s_t_1, encoder_outputs, encoder_feature,
                        enc_padding_mask, c_t_1, extra_zeros,
                        enc_batch_extend_vocab, coverage, di)
                    target = target_batch[:, di]
                    gold_probs = torch.gather(final_dist, 1,
                                              target.unsqueeze(1)).squeeze()
                    step_loss = -torch.log(gold_probs + self.args.eps)
                    if self.args.is_coverage:
                        step_coverage_loss = torch.sum(
                            torch.min(attn_dist, coverage), 1)
                        step_loss = step_loss + self.args.cov_loss_wt * step_coverage_loss
                        coverage = next_coverage

                    step_mask = dec_padding_mask[:, di]
                    step_loss = step_loss * step_mask
                    step_losses.append(step_loss)
                sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
                batch_avg_loss = sum_losses / dec_lens_var
                loss = torch.mean(batch_avg_loss)

                self.optimizer.zero_grad()  # clear gradients left over from the previous batch
                loss.backward()

                norm = clip_grad_norm_(self.model.encoder.parameters(),
                                       self.args.max_grad_norm)
                clip_grad_norm_(self.model.decoder.parameters(),
                                self.args.max_grad_norm)
                clip_grad_norm_(self.model.reduce_state.parameters(),
                                self.args.max_grad_norm)

                self.optimizer.step()

                running_avg_loss = calc_running_avg_loss(
                    loss.item(), running_avg_loss, summary_writer, iter,
                    'Train')
                iter += 1
                if iter % self.args.flush == 0:
                    # print('flush')
                    summary_writer.flush()
                # print_interval = 10
                # if iter % print_interval == 0:
                #     print(f'steps {iter}, batch number: {i} with {time.time() - start} seconds, loss: {loss}')
                #     start = time.time()
                # if iter % 300 == 0:
            self.save_model(running_avg_loss, iter, model_dir)
            self.evaluate(self.eval_dataset, epoch)
            self.evaluate(self.test_dataset, epoch, True)