Example #1
    def train(self):
        self.net_mode(train=True)
        self.C_max = Variable(
            cuda(torch.FloatTensor([self.C_max]), self.use_cuda))
        out = False

        pbar = tqdm(total=self.max_iter)
        pbar.update(self.global_iter)
        ## write training log to a file
        outfile = os.path.join(self.ckpt_dir, "train.log")
        fw_log = open(outfile, "w")

        ## init PID control
        PID = PIDControl()
        Kp = 0.01
        Ki = -0.0001
        Kd = 0.0
        fw_log.write("Kp:{0:.5f} Ki: {1:.6f}\n".format(Kp, Ki))
        fw_log.flush()

        while not out:
            for x in self.data_loader:
                # print('shape>>', x.size())
                self.global_iter += 1
                pbar.update(1)

                x = Variable(cuda(x, self.use_cuda))
                x_recon, mu, logvar = self.net(x)
                recon_loss = reconstruction_loss(x, x_recon, self.decoder_dist)
                total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar)

                if self.is_PID:
                    # print(self.beta)
                    self.beta, _ = PID.pid(self.KL_loss, total_kld.item(), Kp,
                                           Ki, Kd)
                    beta_vae_loss = recon_loss + self.beta * total_kld
                else:
                    beta_vae_loss = recon_loss + 1.0 * total_kld
                    ### capacity-control trick for C, kept for reference
                    # C = torch.clamp(self.C_max/self.C_stop_iter*self.global_iter, 0, self.C_max.data[0])
                    # beta_vae_loss = recon_loss + self.gamma*(total_kld-C).abs()

                self.optim.zero_grad()
                beta_vae_loss.backward()
                self.optim.step()

                if self.viz_on and self.global_iter % self.gather_step == 0:
                    self.gather.insert(iter=self.global_iter,
                                       mu=mu.mean(0).data,
                                       var=logvar.exp().mean(0).data,
                                       recon_loss=recon_loss.data,
                                       total_kld=total_kld.data,
                                       mean_kld=mean_kld.data,
                                       beta=self.beta)
                    self.gather2.insert(iter=self.global_iter,
                                        mu=mu.mean(0).data,
                                        var=logvar.exp().mean(0).data,
                                        recon_loss=recon_loss.data,
                                        total_kld=total_kld.data,
                                        mean_kld=mean_kld.data,
                                        beta=self.beta)

                if self.global_iter % 20 == 0:
                    ## write log to file
                    fw_log.write(
                        '[{}] recon_loss:{:.3f} total_kld:{:.3f} mean_kld:{:.3f} beta:{:.4f}\n'
                        .format(self.global_iter, recon_loss.item(),
                                total_kld.item(), mean_kld.item(), self.beta))
                    fw_log.flush()

                # if self.global_iter%self.display_step == 0:
                # pbar.write('[{}] recon_loss:{:.3f} total_kld:{:.3f} mean_kld:{:.3f} beta:{:.4f}'.format(
                # self.global_iter, recon_loss.item(), total_kld.item(), mean_kld.item(), self.beta))

                if self.viz_on and self.global_iter % self.save_step == 0:
                    self.gather.insert(images=x.data)
                    self.gather.insert(images=torch.sigmoid(x_recon).data)
                    self.viz_reconstruction()
                    self.viz_lines()
                    self.gather.flush()

                if (self.viz_on
                        or self.save_output) and self.global_iter % 50000 == 0:
                    self.viz_traverse()

                if self.global_iter % self.save_step == 0:
                    self.save_checkpoint('last')
                    pbar.write('Saved checkpoint(iter:{})'.format(
                        self.global_iter))

                if self.global_iter % 20000 == 0:
                    self.save_checkpoint(str(self.global_iter))

                if self.global_iter >= self.max_iter:
                    out = True
                    break

        pbar.write("[Training Finished]")
        pbar.close()
        fw_log.close()
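
Example #1 calls a PIDControl class whose pid(exp_KL, kl_loss, Kp, Ki, Kd) method returns the new beta together with the control error, but the class itself is not shown. Below is a minimal PI-controller sketch with that call signature, loosely following the ControlVAE-style update (a sigmoid-bounded proportional term plus an accumulated integral term); the exact update rule inside the original PIDControl may differ, and Example #2 below calls a variant that returns only the weight.

import math

class PIDControl:
    """Minimal PI(D) sketch matching `beta, err = PID.pid(target_kl, observed_kl, Kp, Ki, Kd)`."""

    def __init__(self):
        self.I_k1 = 0.0  # integral term carried over from the previous step

    @staticmethod
    def _sigmoid(x):
        x = max(min(x, 30.0), -30.0)  # clamp to avoid overflow in exp
        return 1.0 / (1.0 + math.exp(-x))

    def pid(self, exp_kl, kl_loss, Kp=0.01, Ki=-0.0001, Kd=0.0):
        error = exp_kl - kl_loss       # setpoint minus measured KL
        P = Kp * self._sigmoid(error)  # bounded proportional term
        I = self.I_k1 + Ki * error     # accumulated integral term
        beta = P + I                   # Kd is accepted for interface compatibility but unused here
        if beta < 0.0 or beta > 1.0:   # simple anti-windup: stop integrating once the output saturates
            I = self.I_k1
            beta = min(max(P + I, 0.0), 1.0)
        self.I_k1 = I
        return beta, error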
Example #2
def main():
    """Entrypoint.
    """
    config: Any = importlib.import_module(args.config)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # train_data = tx.data.MonoTextData(config.train_data_hparams, device=device)
    # val_data = tx.data.MonoTextData(config.val_data_hparams, device=device)
    # test_data = tx.data.MonoTextData(config.test_data_hparams, device=device)

    train_data = tx.data.MonoTextData(config.train_data_hparams,
                                      device=torch.device("cpu"))
    val_data = tx.data.MonoTextData(config.val_data_hparams,
                                    device=torch.device("cpu"))
    test_data = tx.data.MonoTextData(config.test_data_hparams,
                                     device=torch.device("cpu"))

    iterator = tx.data.DataIterator({
        "train": train_data,
        "valid": val_data,
        "test": test_data
    })

    opt_vars = {
        'learning_rate': config.lr_decay_hparams["init_lr"],
        'best_valid_nll': 1e100,
        'steps_not_improved': 0,
        'kl_weight': config.kl_anneal_hparams["start"]
    }

    decay_cnt = 0
    max_decay = config.lr_decay_hparams["max_decay"]
    decay_factor = config.lr_decay_hparams["decay_factor"]
    decay_ts = config.lr_decay_hparams["threshold"]

    if 'pid' in args.model_name:
        save_dir = args.model_name + '_' + str(config.dataset) + '_KL' + str(
            args.exp_kl)
    elif 'cost' in args.model_name:
        save_dir = args.model_name + '_' + str(config.dataset) + '_step' + str(
            args.anneal_steps)
    elif 'cyclical' in args.model_name:
        save_dir = args.model_name + '_' + str(config.dataset) + '_cyc_' + str(
            args.cycle)
    else:
        raise ValueError(f"unrecognized model name: {args.model_name}")

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    suffix = f"{config.dataset}_{config.decoder_type}Decoder.ckpt"

    save_path = os.path.join(save_dir, suffix)

    # KL-term annealing rate: linear warm-up over `warm_up` epochs of the training set
    ## could be replaced with a sigmoid schedule
    anneal_r = 1.0 / (config.kl_anneal_hparams["warm_up"] *
                      (len(train_data) / config.batch_size))

    vocab = train_data.vocab
    model = VAE(train_data.vocab.size, config)
    model.to(device)

    start_tokens = torch.full((config.batch_size, ),
                              vocab.bos_token_id,
                              dtype=torch.long).to(device)
    end_token = vocab.eos_token_id
    optimizer = tx.core.get_optimizer(params=model.parameters(),
                                      hparams=config.opt_hparams)
    scheduler = ExponentialLR(optimizer, decay_factor)

    ## max iteration
    max_iter = config.num_epochs * len(train_data) / config.batch_size
    max_iter = min(max_iter, args.max_steps)
    print('max steps:', max_iter)
    pbar = tqdm(total=int(max_iter))

    if args.mode == "train":
        outFile = os.path.join(save_dir, 'train.log')
        fw_log = open(outFile, "w")

    global_steps = {'step': 0}
    pid = PIDControl()
    opt_vars["kl_weight"] = 0.0
    Kp = args.Kp
    Ki = args.Ki
    exp_kl = args.exp_kl

    ## train model
    def _run_epoch(epoch: int, mode: str, display: int = 10) \
            -> Tuple[Tensor, float]:
        iterator.switch_to_dataset(mode)

        if mode == 'train':
            model.train()
            kl_weight = opt_vars["kl_weight"]
        else:
            model.eval()
            kl_weight = 1.0
            # kl_weight = opt_vars["kl_weight"]

        start_time = time.time()
        num_words = 0
        nll_total = 0.

        avg_rec = tx.utils.AverageRecorder()
        for batch in iterator:
            ## run model to get loss function
            if global_steps['step'] >= args.max_steps:
                break
            ret = model(batch, kl_weight, start_tokens, end_token)
            if mode == "train":
                pbar.update(1)
                global_steps['step'] += 1
                kl_loss = ret['kl_loss'].item()
                rec_loss = ret['rc_loss'].item()
                total_loss = ret["nll"].item()
                if 'cost' in args.model_name:
                    kl_weight = _cost_annealing(global_steps['step'], 1.0,
                                                args.anneal_steps)
                elif 'pid' in args.model_name:
                    kl_weight = pid.pid(exp_kl, kl_loss, Kp, Ki)
                elif 'cyclical' in args.model_name:
                    kl_weight = _cyclical_annealing(global_steps['step'],
                                                    max_iter / args.cycle)

                opt_vars["kl_weight"] = kl_weight

                ## total loss
                ret["nll"].backward()
                optimizer.step()
                optimizer.zero_grad()
                fw_log.write('epoch:{0} global_step:{1} total_loss:{2:.3f} kl_loss:{3:.3f} rec_loss:{4:.3f} kl_weight:{5:.4f}\n'\
                            .format(epoch, global_steps['step'], total_loss, kl_loss, rec_loss, kl_weight))
                fw_log.flush()

            batch_size = len(ret["lengths"])
            num_words += torch.sum(ret["lengths"]).item()
            nll_total += ret["nll"].item() * batch_size
            avg_rec.add([
                ret["nll"].item(), ret["kl_loss"].item(),
                ret["rc_loss"].item()
            ], batch_size)

            if global_steps['step'] % display == 1 and mode == 'train':
                nll = avg_rec.avg(0)
                klw = opt_vars["kl_weight"]
                KL = avg_rec.avg(1)
                rc = avg_rec.avg(2)
                writer.add_scalar(f'Loss/Rec_loss_{args.model_name}', rc,
                                  global_steps['step'])
                writer.add_scalar(f'Loss/KL_diverg_{args.model_name}', KL,
                                  global_steps['step'])
                writer.add_scalar(f'Loss/KL_weight_{args.model_name}', klw,
                                  global_steps['step'])

        nll = avg_rec.avg(0)
        KL = avg_rec.avg(1)
        rc = avg_rec.avg(2)
        if num_words > 0:
            log_ppl = nll_total / num_words
            ppl = math.exp(log_ppl)
        else:
            log_ppl = 100
            ppl = math.exp(log_ppl)
            nll = 1000
            KL = args.exp_kl

        print(f"\n{mode}: epoch {epoch}, nll {nll:.4f}, KL {KL:.4f}, "
              f"rc {rc:.4f}, log_ppl {log_ppl:.4f}, ppl {ppl:.4f}")
        return nll, ppl  # type: ignore

    args.model = save_path

    @torch.no_grad()
    def _generate(start_tokens: torch.LongTensor,
                  end_token: int,
                  filename: Optional[str] = None):
        ckpt = torch.load(args.model)
        model.load_state_dict(ckpt['model'])
        model.eval()

        batch_size = train_data.batch_size

        dst = MultivariateNormalDiag(loc=torch.zeros(batch_size,
                                                     config.latent_dims),
                                     scale_diag=torch.ones(
                                         batch_size, config.latent_dims))

        # latent_z = dst.rsample().to(device)
        latent_z = torch.FloatTensor(batch_size,
                                     config.latent_dims).uniform_(-1,
                                                                  1).to(device)
        # latent_z = torch.randn(batch_size, config.latent_dims).to(device)

        helper = model.decoder.create_helper(decoding_strategy='infer_sample',
                                             start_tokens=start_tokens,
                                             end_token=end_token)
        outputs = model.decode(helper=helper,
                               latent_z=latent_z,
                               max_decoding_length=100)

        if config.decoder_type == "transformer":
            outputs = outputs[0]

        sample_tokens = vocab.map_ids_to_tokens_py(outputs.sample_id.cpu())

        if filename is None:
            fh = sys.stdout
        else:
            fh = open(filename, 'a', encoding='utf-8')

        for sent in sample_tokens:
            sent = tx.utils.compat_as_text(list(sent))
            end_id = len(sent)
            if vocab.eos_token in sent:
                end_id = sent.index(vocab.eos_token)
            fh.write(' '.join(sent[:end_id + 1]) + '\n')

        print('Output done')
        if filename is not None:
            fh.close()

    if args.mode == "predict":
        out_path = os.path.join(save_dir, 'results.txt')
        for _ in range(10):
            _generate(start_tokens, end_token, out_path)
        return

    # Count total model parameters
    total_parameters = sum(param.numel() for param in model.parameters())
    print(f"{total_parameters} total parameters")

    best_nll = best_ppl = 0.

    ## start running model
    for epoch in range(config.num_epochs):
        _, _ = _run_epoch(epoch, 'train', display=200)
        val_nll, _ = _run_epoch(epoch, 'valid')
        test_nll, test_ppl = _run_epoch(epoch, 'test')

        if val_nll < opt_vars['best_valid_nll']:
            opt_vars['best_valid_nll'] = val_nll
            opt_vars['steps_not_improved'] = 0
            best_nll = test_nll
            best_ppl = test_ppl

            states = {
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict()
            }
            torch.save(states, save_path)
        else:
            opt_vars['steps_not_improved'] += 1
            if opt_vars['steps_not_improved'] == decay_ts:
                old_lr = opt_vars['learning_rate']
                opt_vars['learning_rate'] *= decay_factor
                opt_vars['steps_not_improved'] = 0
                new_lr = opt_vars['learning_rate']
                ckpt = torch.load(save_path)
                model.load_state_dict(ckpt['model'])
                optimizer.load_state_dict(ckpt['optimizer'])
                scheduler.load_state_dict(ckpt['scheduler'])
                scheduler.step()
                print(f"-----\nchange lr, old lr: {old_lr}, "
                      f"new lr: {new_lr}\n-----")

                decay_cnt += 1
                if decay_cnt == max_decay:
                    break
        if global_steps['step'] >= args.max_steps:
            break

    print(f"\nbest testing nll: {best_nll:.4f}, "
          f"best testing ppl {best_ppl:.4f}\n")

    if args.mode == "train":
        fw_log.write(f"\nbest testing nll: {best_nll:.4f}, "
                     f"best testing ppl {best_ppl:.4f}\n")
        fw_log.close()
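
Example #2 also references two annealing helpers, _cost_annealing and _cyclical_annealing, that are not defined in the snippet. The sketches below match the call sites (_cost_annealing(step, 1.0, args.anneal_steps) and _cyclical_annealing(step, max_iter / args.cycle)) and implement the usual monotonic and cyclical KL-weight schedules; the original helper bodies may differ.

def _cost_annealing(global_step, max_weight, anneal_steps):
    """Monotonic (linear) KL annealing: ramp from 0 to max_weight over anneal_steps, then hold."""
    if anneal_steps <= 0:
        return max_weight
    return min(max_weight, max_weight * global_step / anneal_steps)


def _cyclical_annealing(global_step, period, ratio=0.5):
    """Cyclical KL annealing: within each cycle of length `period`, ramp linearly from
    0 to 1 during the first `ratio` fraction of the cycle, then hold at 1."""
    phase = (global_step % period) / period
    return min(1.0, phase / ratio)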
Example #3
def main():
    ## random seeds
    seed = FLAGS.seed
    # tf.random.set_seed(seed)
    np.random.seed(seed)

    ## config for training
    config = Config()
    pid = PIDControl(FLAGS.exp_KL)
    
    # config for validation
    valid_config = Config()
    valid_config.keep_prob = 1.0
    valid_config.dec_keep_prob = 1.0
    valid_config.batch_size = 60

    # configuration for testing
    test_config = Config()
    test_config.keep_prob = 1.0
    test_config.dec_keep_prob = 1.0
    test_config.batch_size = 1

    pp(config)

    # get data set
    api = SWDADialogCorpus(FLAGS.data_dir, word2vec=FLAGS.word2vec_path, word2vec_dim=config.embed_size)
    dial_corpus = api.get_dialog_corpus()
    meta_corpus = api.get_meta_corpus()

    train_meta, valid_meta, test_meta = meta_corpus.get("train"), meta_corpus.get("valid"), meta_corpus.get("test")
    train_dial, valid_dial, test_dial = dial_corpus.get("train"), dial_corpus.get("valid"), dial_corpus.get("test")
    
    # convert to numeric input outputs that fits into TF models
    train_feed = SWDADataLoader("Train", train_dial, train_meta, config)
    valid_feed = SWDADataLoader("Valid", valid_dial, valid_meta, config)
    test_feed = SWDADataLoader("Test", test_dial, test_meta, config)

    # log_dir = os.path.join(FLAGS.work_dir, FLAGS.test_path)
    log_dir = os.path.join(FLAGS.work_dir, FLAGS.model_name)

    
    ## begin training
    with tf.Session() as sess:
        initializer = tf.random_uniform_initializer(-1.0 * config.init_w, config.init_w)
        scope = "model"
        with tf.variable_scope(scope, reuse=None, initializer=initializer):
            model = KgRnnCVAE(sess, config, api, log_dir=None if FLAGS.forward_only else log_dir, forward=False, pid_control=pid, scope=scope)
        with tf.variable_scope(scope, reuse=True, initializer=initializer):
            valid_model = KgRnnCVAE(sess, valid_config, api, log_dir=None, forward=False, pid_control=pid, scope=scope)
        with tf.variable_scope(scope, reuse=True, initializer=initializer):
            test_model = KgRnnCVAE(sess, test_config, api, log_dir=None, forward=True, pid_control=pid, scope=scope)

        print("Created computation graphs")
        if api.word2vec is not None and not FLAGS.forward_only:
            print("Loaded word2vec")
            sess.run(model.embedding.assign(np.array(api.word2vec)))

        # write config to a file for logging
        if not FLAGS.forward_only:
            with open(os.path.join(log_dir, "configure.log"), "wb") as f:
                f.write(pp(config, output=False))
        
        # create a folder by force
        ckp_dir = os.path.join(log_dir, "checkpoints")
        print("*******checkpoint path: ", ckp_dir)
        if not os.path.exists(ckp_dir):
            os.mkdir(ckp_dir)

        ckpt = tf.train.get_checkpoint_state(ckp_dir)
        print("Created models with fresh parameters.")
        sess.run(tf.global_variables_initializer())

        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        ### set up the run log file
        if not FLAGS.forward_only:
            logfileName = "train.log"
        else:
            logfileName = "test.log"
        fw_log = open(os.path.join(log_dir, logfileName), "w")
        print("log file >>> : ", os.path.join(log_dir, logfileName))
        if not FLAGS.forward_only:
            print('--start training now---')
            dm_checkpoint_path = os.path.join(ckp_dir, model.__class__.__name__+ ".ckpt")
            global_t = 1
            patience = 20  # minimum number of epochs before early stopping is considered
            dev_loss_threshold = np.inf
            best_dev_loss = np.inf
            pbar = tqdm(total = config.max_epoch)
            ## epoch start training
            for epoch in range(config.max_epoch):
                pbar.update(1)
                print(">> Epoch %d with lr %f" % (epoch, model.learning_rate.eval()))

                ## begin training
                FLAGS.mode = 'train'
                if train_feed.num_batch is None or train_feed.ptr >= train_feed.num_batch:
                    train_feed.epoch_init(config.batch_size, config.backward_size,
                                          config.step_size, shuffle=True)
                global_t, train_loss = model.train(global_t, sess, train_feed, update_limit=config.update_limit)
                
                FLAGS.mode = 'valid'
                valid_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                  valid_config.step_size, shuffle=False, intra_shuffle=False)
                test_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                  valid_config.step_size, shuffle=False, intra_shuffle=False)
                elbo, nll, ppl, au_count, kl_loss = valid_model.valid("ELBO_TEST", sess, valid_feed, test_feed)
                print('intermediate test nll: {} ppl: {} ActiveUnit: {} kl_loss: {}\n'.format(nll, ppl, au_count, kl_loss))
                fw_log.write('epoch:{} testing nll:{} ppl:{} ActiveUnit:{} kl_loss:{} elbo:{}\n'.\
                            format(epoch, nll, ppl, au_count, kl_loss, elbo))
                fw_log.flush()
                
                '''
                ## begin validation
                FLAGS.mode = 'valid'
                valid_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                      valid_config.step_size, shuffle=False, intra_shuffle=False)
                valid_loss = valid_model.valid("ELBO_VALID", sess, valid_feed)

                ## test model
                FLAGS.mode = 'test'
                test_feed.epoch_init(test_config.batch_size, test_config.backward_size,
                                     test_config.step_size, shuffle=True, intra_shuffle=False)
                test_model.test(sess, test_feed, num_batch=5)

                done_epoch = epoch + 1
                # only save a models if the dev loss is smaller
                # Decrease learning rate if no improvement was seen over last 3 times.
                if config.op == "sgd" and done_epoch > config.lr_hold:
                    sess.run(model.learning_rate_decay_op)

                if valid_loss < best_dev_loss:
                    if valid_loss <= dev_loss_threshold * config.improve_threshold:
                        patience = max(patience, done_epoch * config.patient_increase)
                        dev_loss_threshold = valid_loss

                    # still save the best train model
                    if FLAGS.save_model:
                        print("Save model!!")
                        model.saver.save(sess, dm_checkpoint_path, global_step=epoch)
                    best_dev_loss = valid_loss

                if config.early_stop and patience <= done_epoch:
                    print("!!Early stop due to run out of patience!!")
                    break
                    ## print("Best validation loss %f" % best_dev_loss)
                 '''
            print("Done training; saving final checkpoint")

            if FLAGS.save_model:
                print("Save model!!")
                model.saver.save(sess, dm_checkpoint_path, global_step=epoch)
            # run the final test after training
            print('-------- after training, testing now -----')
            FLAGS.mode = 'test'
            # valid_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                #   valid_config.step_size, shuffle=False, intra_shuffle=False)
            # valid_model.valid("ELBO_VALID", sess, valid_feed)
            valid_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                  valid_config.step_size, shuffle=False, intra_shuffle=False)
            
            test_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                  valid_config.step_size, shuffle=False, intra_shuffle=False)
            elbo, nll, ppl, au_count, kl_loss = valid_model.valid("ELBO_TEST", sess, valid_feed, test_feed)

            print('final test nll: {} ppl: {} ActiveUnit: {} kl_loss: {}\n'.format(nll, ppl, au_count, kl_loss))
            fw_log.write('Final testing nll:{} ppl:{} ActiveUnit:{} kl_loss:{} elbo:{}\n'.\
                            format(nll, ppl, au_count, kl_loss, elbo))
            
            dest_f = open(os.path.join(log_dir, FLAGS.test_res), "wb")
            test_feed.epoch_init(test_config.batch_size, test_config.backward_size,
                                 test_config.step_size, shuffle=False, intra_shuffle=False)
            test_model.test(sess, test_feed, num_batch=None, repeat=10, dest=dest_f)
            dest_f.close()
            print("****testing done****")
        else:
            # begin testing (forward-only mode)
            print('*'*89)
            print('--------testing now-----')
            print('*'*89)
            FLAGS.mode = 'test'
            valid_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                  valid_config.step_size, shuffle=False, intra_shuffle=False)
            # valid_model.valid("ELBO_VALID", sess, valid_feed)

            test_feed.epoch_init(valid_config.batch_size, valid_config.backward_size,
                                  valid_config.step_size, shuffle=False, intra_shuffle=False)
            elbo, nll, ppl, au_count, kl_loss = valid_model.valid("ELBO_TEST", sess, valid_feed, test_feed)

            print('final test nll: {} ppl: {} ActiveUnit: {} kl_loss: {}\n'.format(nll, ppl, au_count, kl_loss))
            fw_log.write('Final testing nll:{} ppl:{} ActiveUnit:{} kl_loss:{} elbo:{}\n'.\
                            format(nll, ppl, au_count, kl_loss, elbo))
            # dest_f = open(os.path.join(log_dir, FLAGS.test_res), "wb")
            # test_feed.epoch_init(test_config.batch_size, test_config.backward_size,
            #                      test_config.step_size, shuffle=False, intra_shuffle=False)
            # test_model.test(sess, test_feed, num_batch=None, repeat=10, dest=dest_f)
            # dest_f.close()
            print("****testing done****")
        fw_log.close()
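
Example #3 hands the PIDControl object to KgRnnCVAE and lets the model update the KL weight internally, so the update itself is not visible in this snippet. Purely as an illustration of the usual TF 1.x pattern (the placeholder name and the constant stand-in losses below are assumptions, not part of KgRnnCVAE), a per-batch weight computed outside the graph is typically fed in like this:

import tensorflow as tf

rec_loss = tf.constant(80.0)  # stand-in for the model's reconstruction loss tensor
kl_loss = tf.constant(15.0)   # stand-in for the model's KL term
kl_weight_ph = tf.placeholder(tf.float32, shape=[], name="kl_weight")
weighted_elbo = rec_loss + kl_weight_ph * kl_loss  # loss actually minimized by the train op

with tf.Session() as sess:
    beta = 0.2  # e.g. the PIDControl output for the current batch
    print(sess.run(weighted_elbo, feed_dict={kl_weight_ph: beta}))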
Example #4
# Evan Racah
# Tests the PID controller on the whole system

from PID import PIDControl
from BBB.roboclaw import Roboclaw
from cv2 import KalmanFilter
#import vijay/oliver's ultrasonic function


pidCoefficients = [1, 0, 0]  # still need to tune these
maxSpeed = 100  # we still need to measure the max rotational velocity of the robot's wheels
pid = PIDControl(0, pidCoefficients, maxSpeed)
#encProt = EncoderProtractor(0, wheelCircum, robotDiam, encoderResolution)

"""INITIALIZE KALMAN FILTER"""
# we might need to use numpy-based matrices for the Kalman implementation
# NOTE: nstateValues, nMeasurementValues, nControlInputs still need to be defined for our state model
kalman = KalmanFilter(nstateValues, nMeasurementValues, nControlInputs)
#kalman.transition_matrix = ....
#kalman.measurement_matrix = ....
#kalman.control_matrix = ....
"""#################"""


#########initiate ultrasonic object here#######

claw = Roboclaw(0x80, "/dev/ttyO1")

curM1 = 50
curM2 = 50
claw.m1_forward(curM1)  # not sure yet what a good starting speed is
claw.m2_forward(curM2)
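
The script above only starts both motors at a fixed speed; the closed control loop it is building toward is not shown. The sketch below is hypothetical: read_ultrasonic() stands in for vijay/oliver's ultrasonic function, and pid.update() is a placeholder for whatever method this PIDControl variant actually exposes, so treat it as an outline of the intended loop rather than working code.

import time

def read_ultrasonic():
    # placeholder for the ultrasonic distance function to be imported above
    raise NotImplementedError

try:
    while True:
        distance = read_ultrasonic()       # raw distance reading (placeholder)
        correction = pid.update(distance)  # hypothetical PIDControl method name
        # bias the two wheel speeds around the current base speed to steer
        left = max(0, min(maxSpeed, curM1 + correction))
        right = max(0, min(maxSpeed, curM2 - correction))
        claw.m1_forward(int(left))
        claw.m2_forward(int(right))
        time.sleep(0.05)                   # ~20 Hz control loop
except KeyboardInterrupt:
    claw.m1_forward(0)                     # stop the motors on exit
    claw.m2_forward(0)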