Example #1
    def __init__(self, action_size, transition_model, encoder, reward_model,
                 observation_model):

        self.encoder = encoder
        self.reward_model = reward_model
        self.transition_model = transition_model
        self.observation_model = observation_model

        self.merge_value_model = ValueModel(
            args.belief_size, args.state_size, args.hidden_size,
            args.dense_activation_function).to(device=args.device)
        self.merge_actor_model = MergeModel(
            args.belief_size, args.state_size, args.hidden_size, action_size,
            args.pool_len,
            args.dense_activation_function).to(device=args.device)
        self.merge_actor_model.share_memory()
        self.merge_value_model.share_memory()

        # Set up the actor and value pools
        self.actor_pool = [
            ActorModel(args.belief_size, args.state_size, args.hidden_size,
                       action_size,
                       args.dense_activation_function).to(device=args.device)
            for _ in range(args.pool_len)
        ]
        self.value_pool = [
            ValueModel(args.belief_size, args.state_size, args.hidden_size,
                       args.dense_activation_function).to(device=args.device)
            for _ in range(args.pool_len)
        ]
        for actor in self.actor_pool:
            actor.share_memory()
        for value in self.value_pool:
            value.share_memory()

        self.env_model_modules = get_modules([
            self.transition_model, self.encoder, self.observation_model,
            self.reward_model
        ])
        self.actor_pool_modules = get_modules(self.actor_pool)
        self.model_modules = self.env_model_modules + self.actor_pool_modules

        self.merge_value_model_modules = get_modules([self.merge_value_model])

        self.merge_actor_optimizer = optim.Adam(
            self.merge_actor_model.parameters(),
            lr=0
            if args.learning_rate_schedule != 0 else args.actor_learning_rate,
            eps=args.adam_epsilon)
        self.merge_value_optimizer = optim.Adam(
            self.merge_value_model.parameters(),
            lr=0
            if args.learning_rate_schedule != 0 else args.value_learning_rate,
            eps=args.adam_epsilon)

        self.actor_pipes = [Pipe() for _ in range(len(self.actor_pool))]  # one pipe per pooled actor
        self.workers_actor = [
            Worker_actor(actor_l=self.actor_pool[i],
                         value_l=self.value_pool[i],
                         transition_model=self.transition_model,
                         encoder=self.encoder,
                         observation_model=self.observation_model,
                         reward_model=self.reward_model,
                         child_conn=child,
                         results_dir=args.results_dir,
                         id=i + 1)
            for i, [parent, child] in enumerate(self.actor_pipes)
        ]  # one Worker_actor per pooled actor/value pair, wired to the child end of its pipe

        for w in self.workers_actor:
            w.start()  # launch each worker process

        self.metrics = {
            'episodes': [],
            'merge_actor_loss': [],
            'merge_value_loss': []
        }
        self.merge_losses = []
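Both merge optimizers above start at lr=0 whenever args.learning_rate_schedule is non-zero, which implies the learning rate is ramped up elsewhere during training. A minimal sketch of such a ramp, assuming a linear warm-up over learning_rate_schedule updates (the helper name and the linear shape are assumptions, not taken from this code):

import torch.optim as optim

def ramp_learning_rate(optimizer: optim.Optimizer, final_lr: float,
                       step: int, schedule_steps: int) -> None:
    # Linearly ramp the learning rate from 0 to final_lr over schedule_steps updates
    lr = final_lr * min(step / max(schedule_steps, 1), 1.0)
    for group in optimizer.param_groups:
        group['lr'] = lr

# e.g. once per gradient update:
# ramp_learning_rate(self.merge_actor_optimizer, args.actor_learning_rate,
#                    step, args.learning_rate_schedule)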
Example #2
# Initialise model parameters randomly
transition_model = TransitionModel(
    args.belief_size, args.state_size, env.action_size, args.hidden_size,
    args.embedding_size, args.dense_activation_function).to(device=args.device)
observation_model = ObservationModel(
    args.symbolic_env, env.observation_size, args.belief_size, args.state_size,
    args.embedding_size, args.cnn_activation_function).to(device=args.device)
reward_model = RewardModel(
    args.belief_size, args.state_size, args.hidden_size,
    args.dense_activation_function).to(device=args.device)
encoder = Encoder(args.symbolic_env, env.observation_size, args.embedding_size,
                  args.cnn_activation_function).to(device=args.device)
actor_model = ActorModel(args.belief_size, args.state_size, args.hidden_size,
                         env.action_size,
                         args.dense_activation_function).to(device=args.device)
value_model = ValueModel(args.belief_size, args.state_size, args.hidden_size,
                         args.dense_activation_function).to(device=args.device)
param_list = list(transition_model.parameters()) + list(
    observation_model.parameters()) + list(reward_model.parameters()) + list(
        encoder.parameters())
value_actor_param_list = list(value_model.parameters()) + list(
    actor_model.parameters())
params_list = param_list + value_actor_param_list
model_optimizer = optim.Adam(
    param_list,
    lr=0 if args.learning_rate_schedule != 0 else args.model_learning_rate,
    eps=args.adam_epsilon)
actor_optimizer = optim.Adam(
    actor_model.parameters(),
    lr=0 if args.learning_rate_schedule != 0 else args.actor_learning_rate,
    eps=args.adam_epsilon)
value_optimizer = optim.Adam(
    value_model.parameters(),
    lr=0 if args.learning_rate_schedule != 0 else args.value_learning_rate,
    eps=args.adam_epsilon)
Example #3
    def __init__(self, args):
        """
        All parameters are passed via args.

        :param args: a dict that holds the parameters
        """
        super().__init__()
        self.args = args
        # Initialise model parameters randomly
        self.transition_model = TransitionModel(
            args.belief_size, args.state_size, args.action_size,
            args.hidden_size, args.embedding_size,
            args.dense_act).to(device=args.device)

        self.observation_model = ObservationModel(
            args.symbolic,
            args.observation_size,
            args.belief_size,
            args.state_size,
            args.embedding_size,
            activation_function=(args.dense_act if args.symbolic else
                                 args.cnn_act)).to(device=args.device)

        self.reward_model = RewardModel(args.belief_size, args.state_size,
                                        args.hidden_size,
                                        args.dense_act).to(device=args.device)

        self.encoder = Encoder(args.symbolic, args.observation_size,
                               args.embedding_size,
                               args.cnn_act).to(device=args.device)

        self.actor_model = ActorModel(
            args.action_size,
            args.belief_size,
            args.state_size,
            args.hidden_size,
            activation_function=args.dense_act).to(device=args.device)

        self.value_model = ValueModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.pcont_model = PCONTModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.target_value_model = deepcopy(self.value_model)

        for p in self.target_value_model.parameters():
            p.requires_grad = False

        # Set up the parameters to update
        self.world_param = list(self.transition_model.parameters())\
                          + list(self.observation_model.parameters())\
                          + list(self.reward_model.parameters())\
                          + list(self.encoder.parameters())
        if args.pcont:
            self.world_param += list(self.pcont_model.parameters())

        # setup optimizer
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        self.value_optimizer = optim.Adam(list(self.value_model.parameters()),
                                          lr=args.value_lr)

        # Set up free_nats
        self.free_nats = torch.full(
            (1, ), args.free_nats, dtype=torch.float32,
            device=args.device)  # Allowed deviation in KL divergence
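The free_nats tensor above is typically used to clamp the KL term of the world-model loss from below, so the posterior is not penalised for divergences smaller than the allowance. A minimal sketch of that clamping, assuming diagonal-Gaussian prior/posterior parameters shaped (time, batch, state) as in PlaNet/Dreamer-style code (the function and argument names are illustrative):

import torch
from torch.distributions import Normal
from torch.distributions.kl import kl_divergence

def clamped_kl_loss(posterior_means, posterior_std_devs,
                    prior_means, prior_std_devs, free_nats):
    # KL between per-step diagonal Gaussians, summed over the state dimension
    kl = kl_divergence(Normal(posterior_means, posterior_std_devs),
                       Normal(prior_means, prior_std_devs)).sum(dim=2)
    # divergences below free_nats are flattened to that constant, so they add no gradient
    return torch.max(kl, free_nats).mean()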
Example #4
# Model
transition_model = TransitionModel(
    args.belief_size, args.state_size, env.action_size, args.hidden_size, args.embedding_size, args.dense_activation_function).to(device)
observation_model = ObservationModel(
    env.observation_size, args.belief_size, args.state_size, args.embedding_size, args.cnn_activation_function).to(device)
reward_model = RewardModel(
    args.belief_size, args.state_size, args.hidden_size, args.dense_activation_function).to(device)
pcont_model = PcontModel(
    args.belief_size, args.state_size, args.hidden_size, args.dense_activation_function).to(device)
encoder = Encoder(env.observation_size, args.embedding_size,
                  args.cnn_activation_function).to(device)
actor_model = ActorModel(args.belief_size, args.state_size, args.hidden_size, env.action_size,
                         args.action_dist, args.dense_activation_function).to(device)
# double Q-learning can be enabled via the doubleQ flag (left disabled here)
value_model = ValueModel(args.belief_size, args.state_size, args.hidden_size,
                         args.dense_activation_function, doubleQ=False).to(device)

# Param List
param_list = list(transition_model.parameters()) + list(
    observation_model.parameters()) + list(reward_model.parameters()) + list(
        encoder.parameters())
if args.pcont:
    param_list += list(pcont_model.parameters())

# Optimizer
model_optimizer = optim.Adam(
    param_list, lr=0 if args.learning_rate_schedule != 0 else args.model_learning_rate, eps=args.adam_epsilon)
actor_optimizer = optim.Adam(
    actor_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.actor_learning_rate, eps=args.adam_epsilon)
value_optimizer = optim.Adam(
    value_model.parameters(), lr=0 if args.learning_rate_schedule != 0 else args.value_learning_rate, eps=args.adam_epsilon)
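When args.pcont is set, the pcont_model parameters join the world-model optimizer; a common use is to predict the per-step continuation (discount) probability and regress it against the non-terminal mask. A minimal sketch of such a loss, assuming pcont_model(beliefs, states) returns probabilities in [0, 1] (the soft target scaled by the discount factor is borrowed from Dreamer-style agents and is not shown in this snippet):

import torch.nn.functional as F

def pcont_loss(pcont_model, beliefs, posterior_states, nonterminals, discount=0.99):
    # predicted probability that the episode continues after each step
    pred = pcont_model(beliefs, posterior_states)
    # soft Bernoulli target: discount where the episode continues, 0 at terminals
    target = nonterminals.float() * discount
    return F.binary_cross_entropy(pred, target)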
Example #5
    def __init__(self, args):
        """
        All parameters are passed via args.

        :param args: a dict that holds the parameters
        """
        super().__init__()
        self.args = args
        # Initialise model parameters randomly
        self.transition_model = TransitionModel(
            args.belief_size, args.state_size, args.action_size,
            args.hidden_size, args.embedding_size,
            args.dense_act).to(device=args.device)

        self.observation_model = ObservationModel(
            args.symbolic,
            args.observation_size,
            args.belief_size,
            args.state_size,
            args.embedding_size,
            activation_function=(args.dense_act if args.symbolic else
                                 args.cnn_act)).to(device=args.device)

        self.reward_model = RewardModel(args.belief_size, args.state_size,
                                        args.hidden_size,
                                        args.dense_act).to(device=args.device)

        self.encoder = Encoder(args.symbolic, args.observation_size,
                               args.embedding_size,
                               args.cnn_act).to(device=args.device)

        self.actor_model = ActorModel(
            args.action_size,
            args.belief_size,
            args.state_size,
            args.hidden_size,
            activation_function=args.dense_act,
            fix_speed=args.fix_speed,
            throttle_base=args.throttle_base).to(device=args.device)

        self.value_model = ValueModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.value_model2 = ValueModel(args.belief_size, args.state_size,
                                       args.hidden_size,
                                       args.dense_act).to(device=args.device)

        self.pcont_model = PCONTModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.target_value_model = deepcopy(self.value_model)
        self.target_value_model2 = deepcopy(self.value_model2)

        for p in self.target_value_model.parameters():
            p.requires_grad = False
        for p in self.target_value_model2.parameters():
            p.requires_grad = False

        # Set up the parameters to update
        self.world_param = list(self.transition_model.parameters())\
                          + list(self.observation_model.parameters())\
                          + list(self.reward_model.parameters())\
                          + list(self.encoder.parameters())
        if args.pcont:
            self.world_param += list(self.pcont_model.parameters())

        # setup optimizer
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        self.value_optimizer = optim.Adam(list(self.value_model.parameters()) +
                                          list(self.value_model2.parameters()),
                                          lr=args.value_lr)

        # Set up free_nats
        self.free_nats = torch.full(
            (1, ), args.free_nats, dtype=torch.float32,
            device=args.device)  # Allowed deviation in KL divergence

        # TODO: change it to the new replay buffer, in buffer.py
        self.D = ExperienceReplay(args.experience_size, args.symbolic,
                                  args.observation_size, args.action_size,
                                  args.bit_depth, args.device)

        if self.args.auto_temp:
            # setup for learning of alpha term (temp of the entropy term)
            self.log_temp = torch.zeros(1,
                                        requires_grad=True,
                                        device=args.device)
            self.target_entropy = -np.prod(
                args.action_size if not args.fix_speed
                else args.action_size - 1).item()  # heuristic value from the SAC paper
            self.temp_optimizer = optim.Adam(
                [self.log_temp], lr=args.value_lr)  # use the same value_lr
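With args.auto_temp enabled, log_temp, target_entropy and temp_optimizer mirror the learned entropy temperature of SAC. A minimal sketch of the corresponding update step, assuming the actor's action log-probabilities are available (the method name and call site are assumptions):

    def update_temperature(self, action_log_prob):
        # SAC-style temperature loss: alpha grows when policy entropy drops below target_entropy
        temp_loss = -(self.log_temp *
                      (action_log_prob + self.target_entropy).detach()).mean()
        self.temp_optimizer.zero_grad()
        temp_loss.backward()
        self.temp_optimizer.step()
        return self.log_temp.exp()  # current temperature alpha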
Example #6
from circular import CircularFun, CircularSpline
from models import ValueModel
from viz import value_plot

import matplotlib.pyplot as plt
import numpy as np

# value function
v = CircularSpline(3, 5, w=[0, 0, 0, 1, 0])

# movement penalty function
p = CircularFun(lambda x, a: a*x**2, p_circ=0.5, args=[10])

# model
m = ValueModel(v, p, 0, 10, 20)

# simulation
x = m.simulate(2000)
value_plot(x, m)
Example #7
observation_model = ObservationModel(
    args.symbolic,
    env.observation_size,
    args.belief_size,
    args.state_size,
    args.embedding_size,
    activation_function=(args.dense_act if args.symbolic else
                         args.cnn_act)).to(device=args.device)

reward_model = RewardModel(args.belief_size, args.state_size, args.hidden_size,
                           args.dense_act).to(device=args.device)

encoder = Encoder(args.symbolic, env.observation_size, args.embedding_size,
                  args.cnn_act).to(device=args.device)

actor_model = ActorModel(
    env.action_size,
    args.belief_size,
    args.state_size,
    args.hidden_size,
    activation_function=args.dense_act).to(device=args.device)

value_model1 = ValueModel(args.belief_size, args.state_size, args.hidden_size,
                          args.dense_act).to(device=args.device)

value_model2 = ValueModel(args.belief_size, args.state_size, args.hidden_size,
                          args.dense_act).to(device=args.device)

pcont_model = PCONTModel(args.belief_size, args.state_size, args.hidden_size,
                         args.dense_act).to(device=args.device)

target_value_model1 = deepcopy(value_model1)
for p in target_value_model1.parameters():
    p.requires_grad = False

target_value_model2 = deepcopy(value_model2)
for p in target_value_model2.parameters():
    p.requires_grad = False
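target_value_model1/2 are frozen copies (requires_grad disabled), so they have to be refreshed from the online value networks during training. A minimal sketch of such a refresh, covering both hard copies and Polyak averaging (the helper name and the choice of tau are assumptions):

def update_target(online, target, tau=1.0):
    # tau=1.0 is a hard copy; tau<1.0 gives an exponential-moving-average (Polyak) update
    for p, tp in zip(online.parameters(), target.parameters()):
        tp.data.copy_(tau * p.data + (1.0 - tau) * tp.data)

# e.g. a hard update every fixed number of gradient steps:
# update_target(value_model1, target_value_model1)
# update_target(value_model2, target_value_model2)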