Example #1
    def init_training_algo(self, num_envs=None):
        """
        Initialize the training algorithm.

        This primarily constructs the A2C or PPO2 algorithm object and its optimizer, but it also
        spawns a number of parallel environments, based on self.num_cpu or the num_envs argument
        (if provided).

        Note: spawning the parallel environments is VERY slow because each deepcopy also copies the
        termination sets. I tried some workarounds, but none worked properly, so we are stuck with
        it for now.

        :param num_envs: an override for the default number of environments to spawn (in self.num_cpu)
        """
        if not num_envs:
            num_envs = self.num_cpu

        if self.model_type == "A2C":
            # check to make sure that the A2C parameters are set
            assert self.optim_alpha
            self.training_envs = [deepcopy(self.env) for _ in range(num_envs)]  # spawn parallel environments

            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)

            self.algo = torch_ac.A2CAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
                self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
                self.recurrence, self.optim_alpha, self.optim_eps,
                self.preprocess_obss)
        elif self.model_type == "PPO2":
            # check to see if the PPO2 parameters are set
            assert self.clip_eps and self.epochs and self.batch_size
            self.training_envs = [deepcopy(self.env) for _ in range(num_envs)]  # spawn parallel environments

            if self.acmodel.recurrent:
                self.memories = torch.zeros(num_envs,
                                            self.acmodel.memory_size,
                                            device=self.device)

            self.algo = torch_ac.PPOAlgo(
                self.training_envs, self.acmodel, self.device,
                self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
                self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
                self.recurrence, self.optim_eps, self.clip_eps, self.epochs,
                self.batch_size, self.preprocess_obss)
        else:
            raise ValueError("Incorrect algorithm name: {}".format(self.model_type))

        # load the optimizer state, if it exists
        if "optimizer_state" in self.status:
            self.algo.optimizer.load_state_dict(self.status["optimizer_state"])
        self.txt_logger.info("Optimizer loaded\n")
Example #2
elif args.pretrained_gnn:
    acmodel.load_pretrained_gnn(pretrained_status["model_state"])
    txt_logger.info("Pretrained model loaded.\n")

acmodel.to(device)
txt_logger.info("Model loaded.\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence,
                            args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))

if "optimizer_state" in status:
    algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Loading optimizer from existing run.\n")
txt_logger.info("Optimizer loaded.\n")

# init the evaluator
if args.eval:
    eval_samplers = args.ltl_samplers_eval if args.ltl_samplers_eval else [args.ltl_sampler]
    eval_env = args.eval_env if args.eval_env else args.env
    eval_procs = args.eval_procs if args.eval_procs else args.procs
Example #3
def main():
    # Parse arguments

    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument(
        "--algo",
        required=True,
        help="algorithm to use: a2c | ppo | ppo_intrinsic (REQUIRED)")
    parser.add_argument("--env",
                        required=True,
                        help="name of the environment to train on (REQUIRED)")
    parser.add_argument(
        "--model",
        default=None,
        help="name of the model (default: {ENV}_{ALGO}_{TIME})")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help=
        "number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help=
        "number of frames per process before update (default: 5 for A2C and 128 for PPO)"
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)"
    )
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help=
        "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory."
    )
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")
    parser.add_argument("--visualize",
                        default=False,
                        help="show real time CNN layer weight changes")

    args = parser.parse_args()

    args.mem = args.recurrence > 1

    # Set run dir

    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments

    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model

    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)

    elif args.algo == "ppo_intrinsic":
        algo = torch_ac.PPOAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)
    elif args.algo == "a2c_intrinsic":
        algo = torch_ac.A2CAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_alpha,
            args.optim_eps, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    print_visual = args.visualize
    if print_visual:
        fig, axs = plt.subplots(1, 3)
        fig.suptitle('Convolution Layer Weights Normalized Difference')

    while num_frames < args.frames:

        # Store copies of s_t model params
        old_parameters = {}
        for name, param in acmodel.named_parameters():
            old_parameters[name] = param.detach().cpu().numpy().copy()

        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        # Store copies of s_t+1 model params
        new_parameters = {}
        for name, param in acmodel.named_parameters():
            new_parameters[name] = param.detach().cpu().numpy().copy()

        # Compute L2 Norm of model state differences
        # Print model weight change visualization
        for index in range(len(old_parameters)):
            if index in (0, 2, 4):  # convolutional layer weight tensors
                key = list(old_parameters.keys())[index]
                old_weights = old_parameters[key]
                new_weights = new_parameters[key]
                norm_diff = numpy.linalg.norm(new_weights - old_weights)
                diff_matrix = abs(new_weights - old_weights)
                diff_matrix[:, :, 0, 0] = normalize(diff_matrix[:, :, 0, 0],
                                                    norm='max',
                                                    axis=0)
                if print_visual:
                    axs[int(index / 2)].imshow(diff_matrix[:, :, 0, 0],
                                               cmap='Greens',
                                               interpolation='nearest')

        # This allows the plots to update as the model trains
        if print_visual:
            plt.ion()
            plt.show()
            plt.pause(0.001)

        num_frames += logs["num_frames"]
        update += 1

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict()
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
Example #4
def main(raw_args=None):

    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo",
                        required=True,
                        help="algorithm to use: a2c | ppo | ipo (REQUIRED)")
    parser.add_argument("--domain1",
                        required=True,
                        help="name of the first domain to train on (REQUIRED)")
    parser.add_argument(
        "--domain2",
        required=True,
        help="name of the second domain to train on (REQUIRED)")
    parser.add_argument(
        "--p1",
        required=True,
        type=float,
        help="Proportion of training environments from first domain (REQUIRED)"
    )
    parser.add_argument("--model", required=True, help="name of the model")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help=
        "number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help=
        "number of frames per process before update (default: 5 for A2C and 128 for PPO)"
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)"
    )
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help=
        "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory."
    )
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")

    args = parser.parse_args(raw_args)

    args.mem = args.recurrence > 1

    # Check PyTorch version
    if (torch.__version__ != '1.2.0'):
        raise ValueError(
            "PyTorch version must be 1.2.0 (see README). Your version is {}.".
            format(torch.__version__))

    if args.mem:
        raise ValueError("Policies with memory not supported.")

    # Set run dir

    model_name = args.model  # --model is required, so use it directly
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer

    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments

    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources

    torch.backends.cudnn.deterministic = True
    utils.seed(args.seed)

    # Set device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments from different domains
    domain1 = args.domain1  # e.g., 'MiniGrid-ColoredKeysRed-v0'
    domain2 = args.domain2  # e.g., 'MiniGrid-ColoredKeysYellow-v0'

    p1 = args.p1  # Proportion of environments from domain1

    num_envs_total = args.procs  # Total number of environments
    num_domain1 = math.ceil(
        p1 * num_envs_total)  # Number of environments in domain1
    num_domain2 = num_envs_total - num_domain1  # Number of environments in domain2

    # Environments from domain1
    envs1 = []
    for i in range(num_domain1):
        envs1.append(utils.make_env(domain1, args.seed + 10000 * i))

    # Environments from domain2
    envs2 = []
    for i in range(num_domain2):
        envs2.append(utils.make_env(domain2, args.seed + 10000 * i))

    # All environments
    envs = envs1 + envs2

    txt_logger.info("Environments loaded\n")

    # Load training status

    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor

    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    if args.algo == "ipo":
        # Load model for IPO game
        acmodel = ACModel_average(obs_space, envs[0].action_space, args.mem,
                                  args.text)
        if "model_state" in status:
            acmodel.load_state_dict(status["model_state"])
        acmodel.to(device)
        txt_logger.info("Model loaded\n")
        txt_logger.info("{}\n".format(acmodel))

    else:
        # Load model (for standard PPO or A2C)
        acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
        if "model_state" in status:
            acmodel.load_state_dict(status["model_state"])
        acmodel.to(device)
        txt_logger.info("Model loaded\n")
        txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")

    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)

        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")

    elif args.algo == "ipo":
        # One algo per domain. These have different environments but share acmodel.
        algo1 = torch_ac.IPOAlgo(
            envs1, acmodel, 1, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)

        algo2 = torch_ac.IPOAlgo(
            envs2, acmodel, 2, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)

        if "optimizer_state1" in status:
            algo1.optimizer.load_state_dict(status["optimizer_state1"])
            txt_logger.info("Optimizer 1 loaded\n")
        if "optimizer_state2" in status:
            algo2.optimizer.load_state_dict(status["optimizer_state2"])
            txt_logger.info("Optimizer 2 loaded\n")

    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters

        update_start_time = time.time()

        if args.algo == "ipo":

            # Standard method

            # Collect experiences on first domain
            exps1, logs_exps1 = algo1.collect_experiences()

            # Update params of model corresponding to first domain
            logs_algo1 = algo1.update_parameters(exps1)

            # Collect experiences on second domain
            exps2, logs_exps2 = algo2.collect_experiences()

            # Update params of model corresponding to second domain
            logs_algo2 = algo2.update_parameters(exps2)

            # Update end time
            update_end_time = time.time()

            # Combine logs
            logs_exps = {
                'return_per_episode':
                logs_exps1["return_per_episode"] +
                logs_exps2["return_per_episode"],
                'reshaped_return_per_episode':
                logs_exps1["reshaped_return_per_episode"] +
                logs_exps2["reshaped_return_per_episode"],
                'num_frames_per_episode':
                logs_exps1["num_frames_per_episode"] +
                logs_exps2["num_frames_per_episode"],
                'num_frames':
                logs_exps1["num_frames"] + logs_exps2["num_frames"]
            }

            logs_algo = {
                'entropy':
                (num_domain1 * logs_algo1["entropy"] +
                 num_domain2 * logs_algo2["entropy"]) / num_envs_total,
                'value': (num_domain1 * logs_algo1["value"] +
                          num_domain2 * logs_algo2["value"]) / num_envs_total,
                'policy_loss':
                (num_domain1 * logs_algo1["policy_loss"] +
                 num_domain2 * logs_algo2["policy_loss"]) / num_envs_total,
                'value_loss':
                (num_domain1 * logs_algo1["value_loss"] +
                 num_domain2 * logs_algo2["value_loss"]) / num_envs_total,
                'grad_norm':
                (num_domain1 * logs_algo1["grad_norm"] +
                 num_domain2 * logs_algo2["grad_norm"]) / num_envs_total
            }

            logs = {**logs_exps, **logs_algo}
            num_frames += logs["num_frames"]

        else:
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()
            num_frames += logs["num_frames"]

        update += 1

        # Print logs

        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # header += ["debug_last_env_reward"]
            # data += [logs["debug_last_env_reward"]]

            header += ["total_loss"]
            data += [
                logs["policy_loss"] - args.entropy_coef * logs["entropy"] +
                args.value_loss_coef * logs["value_loss"]
            ]

            if status["num_frames"] == 0:
                csv_logger.writerow(header)

            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status

        if args.save_interval > 0 and update % args.save_interval == 0:

            if args.algo == "ipo":
                status = {
                    "num_frames": num_frames,
                    "update": update,
                    "model_state": acmodel.state_dict(),
                    "optimizer_state1": algo1.optimizer.state_dict(),
                    "optimizer_state2": algo2.optimizer.state_dict()
                }
            else:
                status = {
                    "num_frames": num_frames,
                    "update": update,
                    "model_state": acmodel.state_dict(),
                    "optimizer_state": algo.optimizer.state_dict()
                }

            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
Example #5
    def step(self, action):

        self.step_count += 1

        reward = 0
        done = False

        # Get the position in front of the agent
        fwd_pos = self.front_pos

        # Get the contents of the cell in front of the agent
        fwd_cell = self.grid.get(*fwd_pos)
        #print(action)
        # Rotate left
        if action == self.actions.left:
            self.agent_dir -= 1
            if self.agent_dir < 0:
                self.agent_dir += 4

        # Rotate right
        elif action == self.actions.right:
            self.agent_dir = (self.agent_dir + 1) % 4

        # Move forward
        elif action == self.actions.forward:
            if fwd_cell is None or fwd_cell.can_overlap():
                self.agent_pos = fwd_pos
            if fwd_cell is not None and fwd_cell.type == 'goal' and not self.is_teaching:
                done = True
                reward = self._reward()
            if fwd_cell is not None and fwd_cell.type == 'lava':
                done = True

        # Pick up an object
        elif action == self.actions.pickup:
            if fwd_cell and fwd_cell.can_pickup():
                if self.carrying is None:
                    self.carrying = fwd_cell
                    self.carrying.cur_pos = np.array([-1, -1])
                    self.grid.set(*fwd_pos, None)

        # Drop an object
        elif action == self.actions.drop:
            if not fwd_cell and self.carrying:
                self.grid.set(*fwd_pos, self.carrying)
                self.carrying.cur_pos = fwd_pos
                self.carrying = None

        # Toggle/activate an object
        elif action == self.actions.toggle:
            if fwd_cell:
                fwd_cell.toggle(self, fwd_pos)

        # Done action (not used by default)
        elif action == self.actions.done:
            if self.step_count >= self.min_steps and self.is_teaching:
                done = True
        else:
            assert False, "unknown action"

        if self.step_count >= self.max_steps:
            done = True

        obs = self.gen_obs()
        if done and self.is_teaching:
            student_return_avg = []
            envs = []
            for i in range(self.args.procs):
                env = gym.make(self.args.env)
                env.seed(self.args.seed)
                env.is_teaching = False
                env.end_pos = self.agent_pos
                envs.append(env)
            update = 0
            num_frames = 0

            # md_index = np.random.choice(range(len(self.student_hist_models)),1,p=self.sampling_dist(len(self.student_hist_models),strategy=self.args.sampling_strategy))[0]
            # if np.random.random() < self.args.historical_averaging and not self.args.intra:
            #     md = copy.deepcopy(self.student_hist_models[md_index])
            # else:
            #     md = self.student_hist_models[md_index]
            md = self.student_hist_models[-1]
            # while num_frames < self.args.frames and update<5:
            while update < self.args.s_iters_per_teaching:
                device = torch.device(
                    "cuda" if torch.cuda.is_available() else "cpu")
                algo = torch_ac.PPOAlgo(
                    envs, md, device, self.args.frames_per_proc,
                    self.args.discount, self.args.lr, self.args.gae_lambda,
                    self.args.entropy_coef, self.args.value_loss_coef,
                    self.args.max_grad_norm, self.args.recurrence,
                    self.args.optim_eps, self.args.clip_eps, self.args.epochs,
                    self.args.batch_size, self.preprocess_obss)
                algo.args = self.args
                if self.args.intra:
                    algo.historical_models = self.student_hist_models
                update_start_time = time.time()
                exps, logs1 = algo.collect_experiences()
                logs2 = algo.update_parameters(exps)
                logs = {**logs1, **logs2}
                update_end_time = time.time()
                num_frames += logs["num_frames"]
                student_return_avg.append(
                    self.synthesize(
                        logs["reshaped_return_per_episode"])["mean"])
                update += 1
                print(update, end=",")

                # status = {"num_frames": num_frames, "update": update,
                #           "model_state": md.state_dict(), "optimizer_state": algo.optimizer.state_dict()}
                # if hasattr(self.preprocess_obss, "vocab"):
                #     status["vocab"] = self.preprocess_obss.vocab.vocab
                # utils.save_status(status, self.model_dir)
                if not self.args.intra:
                    self.student_hist_models.append(copy.deepcopy(md))
                if (np.random.random() < self.args.historical_averaging
                        and not self.args.intra):
                    # self.student_hist_models.append(md)
                    md_index = np.random.choice(
                        range(len(self.student_hist_models)),
                        1,
                        p=self.sampling_dist(
                            len(self.student_hist_models),
                            strategy=self.args.sampling_strategy))[0]
                    md = copy.deepcopy(self.student_hist_models[md_index])

                # NOTE: the trailing "and False" disables this logging block; it references
                # names (txt_logger, csv_logger, tb_writer, status, start_time) that are not
                # defined in this scope, so it would fail if simply re-enabled.
                if update % self.args.log_interval == 0 and False:
                    fps = logs["num_frames"] / (update_end_time -
                                                update_start_time)
                    duration = int(time.time() - start_time)
                    return_per_episode = utils.synthesize(
                        logs["return_per_episode"])
                    rreturn_per_episode = utils.synthesize(
                        logs["reshaped_return_per_episode"])
                    num_frames_per_episode = utils.synthesize(
                        logs["num_frames_per_episode"])
                    header = ["update", "frames", "FPS", "duration"]
                    data = [update, num_frames, fps, duration]
                    header += [
                        "rreturn_" + key for key in rreturn_per_episode.keys()
                    ]
                    data += rreturn_per_episode.values()
                    header += [
                        "num_frames_" + key
                        for key in num_frames_per_episode.keys()
                    ]
                    data += num_frames_per_episode.values()
                    header += [
                        "entropy", "value", "policy_loss", "value_loss",
                        "grad_norm"
                    ]
                    data += [
                        logs["entropy"], logs["value"], logs["policy_loss"],
                        logs["value_loss"], logs["grad_norm"]
                    ]

                    txt_logger.info(
                        "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                        .format(*data))

                    header += [
                        "return_" + key for key in return_per_episode.keys()
                    ]
                    data += return_per_episode.values()

                    if status["num_frames"] == 0:
                        csv_logger.writerow(header)
                    csv_logger.writerow(data)
                    csv_file.flush()

                    for field, value in zip(header, data):
                        tb_writer.add_scalar(field, value, num_frames)
            reward = max(0, self._reward() - np.average(student_return_avg))
            #logs = {**logs1, **logs2}
            #rreturn_per_episode = self.synthesize(logs["reshaped_return_per_episode"])
            #print(rreturn_per_episode)
        return obs, reward, done, {"agent_pos": self.agent_pos}
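The teacher's episode reward above is the portion of its own reward that the freshly trained students fail to recover. A minimal sketch of that rule as a standalone function (the np alias is assumed, as in the surrounding code):

def teacher_reward(teacher_base_reward, student_mean_returns):
    # pay the teacher only for the gap between its own reward and the students' mean return
    return max(0.0, teacher_base_reward - np.average(student_mean_returns))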
Example #6
    def train(
        self, model: torch.nn.Module, env_factory: EnvironmentFactory
    ) -> Tuple[torch.nn.Module, TrainingStatistics]:
        """
        Train the network.
        :param model: the network to train
        :param env_factory: environment factory
        :return: the trained network and a TrainingStatistics object containing the statistics for training
        """
        model = model.to(self.config.device)
        model.train()  # put network into training mode

        if self.config.inst_in_worker:
            # Note: this is prototype code for something like pygame...
            envs = [(copy.deepcopy(env_factory), env_cfg)
                    for env_cfg in self.config.train_cfgs]
        else:
            envs = [
                env_factory.new_environment(env_cfg)
                for env_cfg in self.config.train_cfgs
            ]

        num_frames_done = 0
        # report progress via a tqdm progress bar (print statements would work just as well)
        prog_bar = tqdm(total=self.config.num_frames,
                        desc="{} frames out of at least {} completed".format(
                            0, self.config.num_frames))
        if self.config.algorithm == 'ppo':
            algo = torch_ac.PPOAlgo(
                envs,
                model,
                device=self.config.device,
                num_frames_per_proc=self.config.max_num_frames_rollout,
                # Note: the parameter name is misleading; this is exactly the number
                #    of frames collected per rollout
                discount=self.config.discount,
                lr=self.config.learning_rate,
                gae_lambda=self.config.gae_lambda,
                entropy_coef=self.config.entropy_coef,
                value_loss_coef=self.config.value_loss_coef,
                max_grad_norm=self.config.max_grad_norm,
                recurrence=self.config.recurrence,  # must be set to 1 for
                # non-recurrent models -- a torch_ac implementation detail
                adam_eps=self.config.adam_eps,
                clip_eps=self.config.clip_eps,
                epochs=self.config.num_epochs,
                batch_size=self.config.batch_size,
                preprocess_obss=self.config.preprocess_obss,
                reshape_reward=self.config.reshape_reward)
        else:
            raise NotImplementedError("Currently only PPO is supported")

        # intermediate testing setup
        frames_per_test = self.config.num_frames_per_test
        test_frames = frames_per_test
        checkpoint_frames = self.config.num_frames_per_checkpoint

        ts = TrainingStatistics(train_info=self.config.train_info_dict())
        batch_num = 1
        start_time = time.time()
        early_stop = False
        while num_frames_done < self.config.num_frames and not early_stop:
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            num_frames_done += logs1['num_frames']
            prog_bar.update(logs1['num_frames'])
            prog_bar.set_description(
                desc="{} frames out of at least {} completed".format(
                    num_frames_done, self.config.num_frames))
            ts.add_batch_stats(
                BatchTrainingStatistics(batch_num, logs2['entropy'],
                                        logs2['value'], logs2['policy_loss'],
                                        logs2['value_loss'],
                                        logs2['grad_norm']))

            if num_frames_done > test_frames and self.config.intermediate_test_cfgs:
                agg_results = self._test(model, env_factory, intermediate=True)
                model.train()
                ts.add_agent_run_stats(agg_results)
                test_frames += frames_per_test
                # Note: we should try to ensure this is not large
                #   might also be worth making a function/object for this

                if self.config.early_stop is not None:
                    early_stop = self.config.early_stop(
                        aggregated_test_results=agg_results,
                        logs1=logs1,
                        logs2=logs2,
                        optimizer_cfg=self.config)
                else:
                    early_stop = self._default_early_stop(
                        aggregated_test_results=agg_results,
                        logs1=logs1,
                        logs2=logs2,
                        optimizer_cfg=self.config)
                if early_stop:
                    ts.train_info['early_stop_frames'] = num_frames_done
            if checkpoint_frames is not None and self.config.checkpoint_dir is not None and num_frames_done > checkpoint_frames:
                fname = 'model.pt'
                # NOTE: we don't move the model off the device to save, because it should be more
                #  efficient to load the checkpoint directly onto CPU later via torch's map_location
                #  argument than to shuttle the model between device and CPU on every checkpoint
                model_state_dict = model.state_dict()
                # TODO: save the optimizer state. This requires an update to the `torch_ac.PPOAlgo`.  Here, a
                #   state_dict() method and load_state_dict() method should be implemented. This task is reflected
                #   in the ticket:
                output_dict = dict(model_state_dict=model_state_dict,
                                   num_frames=num_frames_done)
                torch.save(output_dict,
                           os.path.join(self.config.checkpoint_dir, fname))

                checkpoint_frames += self.config.num_frames_per_checkpoint

            batch_num += 1
        train_time = time.time() - start_time
        ts.add_train_time(train_time)
        return model, ts
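A sketch of an early-stop callable compatible with the self.config.early_stop(...) call above: it receives the aggregated intermediate-test results plus the latest rollout and update logs and returns a bool. The dict-style access, the "mean_return" key, and the threshold are illustrative assumptions, not taken from the source:

def stop_when_solved(aggregated_test_results, logs1, logs2, optimizer_cfg):
    # stop once intermediate evaluation reports a high enough mean return (hypothetical key)
    return aggregated_test_results.get("mean_return", 0.0) >= 0.95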
Example #7
    add_agents(env)
    env.on_reset = add_agents
    env.before_reset = configure_board

    envs.append(env)

try:
    model = torch.load(MODEL_PATH)
    model.eval()
    print('Loaded Model (%s)' % MODEL_PATH)
except Exception:  # no saved checkpoint (or it failed to load): start from scratch
    model = FourKeysAgentModel(envs[0].action_space)
    print('New Model Created')

observation_preprocessor = model.get_observation_preprocessor()
algo = torch_ac.PPOAlgo(envs, model, preprocess_obss=observation_preprocessor)
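For reference, a preprocess_obss callable for torch_ac is generally invoked with the list of current observations and a device keyword. A minimal sketch; the flat float-tensor conversion is only an illustration (real preprocessors, like the one returned by get_observation_preprocessor() above, usually build richer batched structures):

import numpy
import torch


def flat_observation_preprocessor(obss, device=None):
    # stack the raw per-env observations and hand the model a single float tensor
    return torch.tensor(numpy.array(obss), dtype=torch.float, device=device)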


def full_evaluation(model, visualize=False, evaluation_opponents=3, rounds=1):
    env = gym.make(ENVIRONMENT)
    env.seed(42)

    def eval_add_agents(coordinator):
        for i in range(evaluation_opponents):
            coordinator.add_agent(RandomAgent('Random Agent [%i]' % i))

    def eval_configure_board(coordinator):
        coordinator.shared_state_initialization = dict(side_length=15,
                                                       num_seed_walls=6,
                                                       wall_growth_factor=4)
Example #8
    def step(self, action):

        self.step_count += 1

        reward = 0
        done = False

        # Get the position in front of the agent
        fwd_pos = self.front_pos

        # Get the contents of the cell in front of the agent
        fwd_cell = self.grid.get(*fwd_pos)
        #print(action)
        # Rotate left
        if action == self.actions.left:
            self.agent_dir -= 1
            if self.agent_dir < 0:
                self.agent_dir += 4

        # Rotate right
        elif action == self.actions.right:
            self.agent_dir = (self.agent_dir + 1) % 4

        # Move forward
        elif action == self.actions.forward:
            if fwd_cell is None or fwd_cell.can_overlap():
                self.agent_pos = fwd_pos
            if fwd_cell is not None and fwd_cell.type == 'goal' and not self.is_teaching:
                done = True
                reward = self._reward()
            if fwd_cell is not None and fwd_cell.type == 'lava':
                done = True

        # Pick up an object
        elif action == self.actions.pickup:
            if fwd_cell and fwd_cell.can_pickup():
                if self.carrying is None:
                    self.carrying = fwd_cell
                    self.carrying.cur_pos = np.array([-1, -1])
                    self.grid.set(*fwd_pos, None)

        # Drop an object
        elif action == self.actions.drop:
            if not fwd_cell and self.carrying:
                self.grid.set(*fwd_pos, self.carrying)
                self.carrying.cur_pos = fwd_pos
                self.carrying = None

        # Toggle/activate an object
        elif action == self.actions.toggle:
            if fwd_cell:
                fwd_cell.toggle(self, fwd_pos)

        # Done action (not used by default)
        elif action == self.actions.done:
            if self.step_count >= self.min_steps and self.is_teaching:
                done = True
        else:
            assert False, "unknown action"

        if self.step_count >= self.max_steps:
            done = True

        obs = self.gen_obs()
        if done and self.is_teaching:
            student_return_avg = []
            for _ in range(1):
                envs = []
                for i in range(self.args.procs):
                    env = gym.make(self.args.env)
                    env.seed(self.args.seed)
                    env.is_teaching = False
                    env.end_pos = self.agent_pos
                    envs.append(env)
                device = torch.device(
                    "cuda" if torch.cuda.is_available() else "cpu")
                algo = torch_ac.PPOAlgo(
                    envs, self.acmodel, device, self.args.frames_per_proc,
                    self.args.discount, self.args.lr, self.args.gae_lambda,
                    self.args.entropy_coef, self.args.value_loss_coef,
                    self.args.max_grad_norm, self.args.recurrence,
                    self.args.optim_eps, self.args.clip_eps, self.args.epochs,
                    self.args.batch_size, self.preprocess_obss)
                update_start_time = time.time()
                exps, logs1 = algo.collect_experiences()
                logs2 = algo.update_parameters(exps)
                logs = {**logs1, **logs2}
                update_end_time = time.time()
                student_return_avg.append(
                    self.synthesize(
                        logs["reshaped_return_per_episode"])["mean"])
            reward = max(0, self._reward() - np.average(student_return_avg))
            #logs = {**logs1, **logs2}
            #rreturn_per_episode = self.synthesize(logs["reshaped_return_per_episode"])
            #print(rreturn_per_episode)
        return obs, reward, done, {"agent_pos": self.agent_pos}
Example #9
txt_logger.info("Observations preprocessor loaded")

# Load model

acmodel = ACModel(obs_space, envs[0].action_space)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo

algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                        args.discount, args.lr, args.gae_lambda,
                        args.entropy_coef, args.value_loss_coef,
                        args.max_grad_norm, 1, args.adam_eps, args.clip_eps,
                        args.epochs, args.batch_size, preprocess_obss,
                        utils.reshape_reward)
if "optimizer_state" in status:
    algo.optimizer.load_state_dict(status["optimizer_state"])
txt_logger.info("Optimizer loaded\n")

# Train model

num_frames = status["num_frames"]
update = status["update"]
start_time = time.time()

while num_frames < args.frames:
    # Update model parameters
Example #10
    acmodel = ACModel(envs[0].observation_space, envs[0].action_space, memory, False)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo

    if algorithm == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, 5, discount, lr, gae_lambda,
                                entropy_coef, value_loss_coef, max_grad_norm, recurrence,
                                optim_alpha, optim_eps, preprocess_obss)
    elif algorithm == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, 128, discount, lr, gae_lambda,
                                entropy_coef, value_loss_coef, max_grad_norm, recurrence,
                                optim_eps, clip_eps, epochs, batch_size, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(algorithm))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model

    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < frames:
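        # Hedged sketch of the loop body that typically follows here (mirroring the full
        # training loops in the earlier examples): collect a rollout, run the update, and
        # merge the two log dicts.
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        num_frames += logs["num_frames"]
        update += 1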