Example #1
File: DDPG.py  Project: yulkang/pytorch-rl
    def __init__(self,
                 num_hidden_units,
                 input_dim,
                 num_actions,
                 num_q_val,
                 observation_dim,
                 goal_dim,
                 batch_size,
                 use_cuda,
                 gamma,
                 random_seed,
                 actor_optimizer,
                 critic_optimizer,
                 actor_learning_rate,
                 critic_learning_rate,
                 loss_function,
                 polyak_constant,
                 buffer_capacity,
                 non_conv=True,
                 num_conv_layers=None,
                 num_pool_layers=None,
                 conv_kernel_size=None,
                 img_height=None,
                 img_width=None,
                 input_channels=None):

        self.num_hidden_units = num_hidden_units
        self.non_conv = non_conv
        self.num_actions = num_actions
        self.num_q = num_q_val
        self.obs_dim = observation_dim
        self.goal_dim = goal_dim
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.cuda = use_cuda
        self.gamma = gamma
        self.seed(random_seed)
        self.actor_optim = actor_optimizer
        self.critic_optim = critic_optimizer
        self.actor_lr = actor_learning_rate
        self.critic_lr = critic_learning_rate
        self.criterion = loss_function
        self.tau = polyak_constant
        self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                          seed=random_seed)

        # Convolution Parameters
        self.num_conv = num_conv_layers
        self.pool = num_pool_layers
        self.im_height = img_height
        self.im_width = img_width
        self.conv_kernel_size = conv_kernel_size
        self.input_channels = input_channels

        if non_conv:
            self.target_actor = ActorDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_action=num_actions,
                input=input_dim)

            self.actor = ActorDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_action=num_actions,
                input=input_dim)

            self.target_critic = CriticDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_q_value=num_q_val,
                input=input_dim,
                action_dim=num_actions,
                goal_dim=self.goal_dim)
            self.critic = CriticDDPGNonConvNetwork(
                num_hidden_layers=num_hidden_units,
                output_q_value=num_q_val,
                input=input_dim,
                action_dim=num_actions,
                goal_dim=self.goal_dim)

        else:
            self.target_actor = ActorDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_action=self.num_actions,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)

            self.actor = ActorDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_action=self.num_actions,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)

            self.target_critic = CriticDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_q_value=self.num_q,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)
            self.critic = CriticDDPGNetwork(
                num_conv_layers=self.num_conv,
                conv_kernel_size=self.conv_kernel_size,
                input_channels=self.input_channels,
                output_q_value=self.num_q,
                dense_layer=self.num_hidden_units,
                pool_kernel_size=self.pool,
                IMG_HEIGHT=self.im_height,
                IMG_WIDTH=self.im_width)
        if self.cuda:
            self.target_actor = self.target_actor.cuda()
            self.actor = self.actor.cuda()
            self.target_critic = self.target_critic.cuda()
            self.critic = self.critic.cuda()

        # Initializing the target networks with the standard network weights
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Create the optimizers for the actor and critic using the corresponding learning rates
        actor_parameters = self.actor.parameters()
        critic_parameters = self.critic.parameters()

        self.actor_optim = opt.Adam(actor_parameters, lr=self.actor_lr)
        self.critic_optim = opt.Adam(critic_parameters, lr=self.critic_lr)

        # Initialize a random exploration noise
        self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
            self.num_actions)
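
The constructor only wires the networks together; the polyak_constant stored in self.tau is what drives DDPG's soft target updates after each training step. A minimal sketch of such an update, assuming the four networks are exposed exactly as above (the method name soft_update is illustrative and not taken from the project):

    def soft_update(self):
        # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
        for target_net, source_net in ((self.target_actor, self.actor),
                                       (self.target_critic, self.critic)):
            for t_param, s_param in zip(target_net.parameters(),
                                        source_net.parameters()):
                t_param.data.copy_(self.tau * s_param.data +
                                   (1.0 - self.tau) * t_param.data)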
Example #2
    def __init__(self,
                 env,
                 encoder,
                 forward_dynamics,
                 statistics_network,
                 target_policy_network,
                 policy_network,
                 forward_dynamics_lr,
                 stats_lr,
                 policy_lr,
                 num_train_epochs,
                 num_frames,
                 num_fwd_train_steps,
                 num_stats_train_steps,
                 fwd_dynamics_limit,
                 stats_network_limit,
                 policy_limit,
                 size_replay_buffer,
                 random_seed,
                 polyak_constant,
                 discount_factor,
                 batch_size,
                 action_space,
                 model_output_folder,
                 save_epoch,
                 target_stats_network=None,
                 target_fwd_dynamics_network=None,
                 clip_rewards=True,
                 clip_augmented_rewards=False,
                 print_every=2000,
                 update_network_every=2000,
                 plot_every=5000,
                 intrinsic_param=0.01,
                 non_episodic_intrinsic=True,
                 use_mine_formulation=True,
                 use_cuda=False,
                 save_models=True,
                 plot_stats=False,
                 verbose=True):

        self.encoder = encoder
        self.fwd = forward_dynamics
        self.stats = statistics_network
        self.use_cuda = use_cuda
        self.policy_network = policy_network
        self.target_policy_network = target_policy_network
        self.output_folder = model_output_folder
        self.use_mine_formulation = use_mine_formulation
        self.env = env
        self.train_epochs = num_train_epochs
        self.num_frames = num_frames
        self.num_fwd_train_steps = num_fwd_train_steps
        self.num_stats_train_steps = num_stats_train_steps
        self.fwd_lr = forward_dynamics_lr
        self.stats_lr = stats_lr
        self.policy_lr = policy_lr
        self.random_seed = random_seed
        self.save_models = save_models
        self.plot_stats = plot_stats
        self.verbose = verbose
        self.intrinsic_param = intrinsic_param
        self.save_epoch = save_epoch
        self.clip_rewards = clip_rewards
        self.clip_augmented_rewards = clip_augmented_rewards
        self.max = torch.zeros(1)
        self.min = torch.zeros(1)

        self.fwd_limit = fwd_dynamics_limit
        self.stats_limit = stats_network_limit
        self.policy_limit = policy_limit

        self.print_every = print_every
        self.update_every = update_network_every
        self.plot_every = plot_every
        self.non_episodic = non_episodic_intrinsic

        self.statistics = defaultdict(float)
        self.combined_statistics = defaultdict(list)

        self.target_stats_network = target_stats_network
        self.target_fwd_dynamics_network = target_fwd_dynamics_network

        # Fix the encoder weights
        for param in self.encoder.parameters():
            param.requires_grad = False

        self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                                 seed=self.random_seed)

        self.tau = polyak_constant
        self.gamma = discount_factor
        self.batch_size = batch_size
        self.action_space = action_space

        torch.manual_seed(self.random_seed)
        if self.use_cuda:
            torch.cuda.manual_seed(self.random_seed)

        if self.use_cuda:
            # Move the networks this agent owns onto the GPU
            self.encoder = self.encoder.cuda()
            self.fwd = self.fwd.cuda()
            self.stats = self.stats.cuda()
            self.policy_network = self.policy_network.cuda()
            self.target_policy_network = self.target_policy_network.cuda()

        self.fwd_optim = optim.Adam(params=self.fwd.parameters(),
                                    lr=self.fwd_lr)
        self.policy_optim = optim.Adam(params=self.policy_network.parameters(),
                                       lr=self.policy_lr)
        self.stats_optim = optim.Adam(params=self.stats.parameters(),
                                      lr=self.stats_lr)
        # Update the policy and target policy networks
        self.update_networks()
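
Since the encoder's parameters are frozen with requires_grad = False, observations can be embedded under torch.no_grad() before they are stored in the replay buffer or passed to the forward-dynamics and statistics networks. A minimal sketch of that pattern, assuming raw observations arrive as NumPy arrays (the helper name encode_observation is illustrative):

    def encode_observation(self, observation):
        # observation: raw NumPy array returned by self.env
        state = torch.from_numpy(observation).float().unsqueeze(0)
        if self.use_cuda:
            state = state.cuda()
        with torch.no_grad():
            # The encoder is frozen, so no gradient tracking is needed here.
            return self.encoder(state)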
Example #3
    def __init__(
        self,
        state_dim,
        action_dim,
        hidden_dim,
        actor,
        critic,
        value_network,
        target_value_network,
        polyak_constant,
        actor_learning_rate,
        critic_learning_rate,
        value_learning_rate,
        num_q_value,
        num_v_value,
        batch_size,
        gamma,
        random_seed,
        num_epochs,
        num_rollouts,
        num_eval_rollouts,
        env,
        eval_env,
        nb_train_steps,
        max_episodes_per_epoch,
        output_folder,
        use_cuda,
        buffer_capacity,
        policy_reg_mean_weight=1e-3,
        policy_reg_std_weight=1e-3,
        policy_preactivation_weight=0,
        verbose=True,
        plot_stats=False,
    ):

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hidden = hidden_dim
        self.q_dim = num_q_value
        self.v_dim = num_v_value
        self.actor = actor
        self.critic = critic
        self.value = value_network
        self.tau = polyak_constant
        self.bs = batch_size
        self.gamma = gamma
        self.seed = random_seed
        self.use_cuda = use_cuda
        self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                          seed=self.seed)
        self.policy_reg_mean_weight = policy_reg_mean_weight
        self.policy_reg_std_weight = policy_reg_std_weight
        self.policy_pre_activation_weight = policy_preactivation_weight

        # Training specific parameters
        self.num_epochs = num_epochs
        self.num_rollouts = num_rollouts
        self.num_eval_rollouts = num_eval_rollouts
        self.env = env
        self.eval_env = eval_env
        self.nb_train_steps = nb_train_steps
        self.max_episodes_per_epoch = max_episodes_per_epoch
        self.statistics = defaultdict(float)
        self.combined_statistics = defaultdict(list)
        self.verbose = verbose
        self.output_folder = output_folder
        self.plot_stats = plot_stats

        self.actor_optim = optim.Adam(lr=actor_learning_rate,
                                      params=self.actor.parameters())
        self.critic_optim = optim.Adam(lr=critic_learning_rate,
                                       params=self.critic.parameters())
        self.value_optim = optim.Adam(lr=value_learning_rate,
                                      params=self.value.parameters())

        self.target_value = target_value_network

        if self.use_cuda:
            self.actor = self.actor.cuda()
            self.critic = self.critic.cuda()
            self.value = self.value.cuda()
            self.target_value = self.target_value.cuda()

        # Initializing the target networks with the standard network weights
        self.target_value.load_state_dict(self.value.state_dict())

        # Initialize a random exploration noise
        self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
            self.action_dim)
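
The three policy_reg_* weights suggest the usual regularizers for a squashed-Gaussian policy: quadratic penalties on the policy mean, the log standard deviation, and the pre-tanh activations. A minimal sketch of how such a term could be assembled, assuming the actor returns mean, log_std, and pre_tanh_value tensors for a batch of states (that return signature is an assumption, not confirmed by the excerpt):

    def policy_regularization_loss(self, mean, log_std, pre_tanh_value):
        # Quadratic penalties keep the policy outputs in a well-behaved range.
        mean_loss = self.policy_reg_mean_weight * (mean ** 2).mean()
        std_loss = self.policy_reg_std_weight * (log_std ** 2).mean()
        pre_activation_loss = (self.policy_pre_activation_weight *
                               (pre_tanh_value ** 2).sum(dim=1).mean())
        return mean_loss + std_loss + pre_activation_loss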
Example #4
    def __init__(self,
                 env,
                 encoder,
                 inverse_dynamics,
                 forward_dynamics,
                 source_distribution,
                 statistics_network,
                 target_policy_network,
                 policy_network,
                 encoder_lr,
                 inverse_dynamics_lr,
                 forward_dynamics_lr,
                 source_d_lr,
                 stats_lr,
                 policy_lr,
                 num_train_epochs,
                 num_epochs,
                 num_rollouts,
                 size_replay_buffer,
                 size_dqn_replay_buffer,
                 random_seed,
                 polyak_constant,
                 discount_factor,
                 batch_size,
                 action_space,
                 observation_space,
                 model_output_folder,
                 train_encoder=False,
                 use_mine_formulation=True,
                 use_cuda=False):

        self.encoder = encoder
        self.invd = inverse_dynamics
        self.fwd = forward_dynamics
        self.source = source_distribution
        self.stats = statistics_network
        self.use_cuda = use_cuda
        self.policy_network = policy_network
        self.target_policy_network = target_policy_network
        self.model_output_folder = model_output_folder
        self.use_mine_formulation = use_mine_formulation
        self.env = env
        self.num_epochs = num_epochs
        self.train_epochs = num_train_epochs
        self.num_rollouts = num_rollouts
        self.e_lr = encoder_lr
        self.invd_lr = inverse_dynamics_lr
        self.fwd_lr = forward_dynamics_lr
        self.source_lr = source_d_lr
        self.stats_lr = stats_lr
        self.policy_lr = policy_lr
        self.random_seed = random_seed
        self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                                 seed=self.random_seed)
        self.dqn_replay_buffer = Buffer.ReplayBuffer(capacity=size_dqn_replay_buffer,
                                                     seed=self.random_seed)
        self.tau = polyak_constant
        self.gamma = discount_factor
        self.batch_size = batch_size
        self.action_space = action_space
        self.obs_space = observation_space

        torch.manual_seed(self.random_seed)
        if self.use_cuda:
            torch.cuda.manual_seed(self.random_seed)

        if self.use_cuda:
            self.encoder = self.encoder.cuda()
            self.invd = self.invd.cuda()
            self.fwd = self.fwd.cuda()
            self.policy_network = self.policy_network.cuda()
            self.source = self.source.cuda()

        # Define the optimizers (the encoder is only optimized when train_encoder is set)
        if train_encoder:
            self.e_optim = optim.Adam(params=self.encoder.parameters(),
                                      lr=self.e_lr)
        self.invd_optim = optim.Adam(params=self.invd.parameters(),
                                     lr=self.invd_lr)
        self.fwd_optim = optim.Adam(params=self.fwd.parameters(),
                                    lr=self.fwd_lr)
        self.policy_optim = optim.Adam(params=self.policy_network.parameters(),
                                       lr=self.policy_lr)
        self.source_optim = optim.Adam(params=self.source.parameters(),
                                       lr=self.source_lr)
        self.stats_optim = optim.Adam(params=self.stats.parameters(),
                                      lr=self.stats_lr)
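
With the optimizers in place, a single forward-dynamics update fits the usual pattern: encode the observations, predict the next embedding, and step self.fwd_optim on the prediction error. A minimal sketch, assuming self.fwd(state_embedding, action) predicts the next state embedding and that the encoder is not being trained in this step (the call signature and the method name train_forward_dynamics_step are assumptions for illustration):

    def train_forward_dynamics_step(self, states, actions, next_states):
        # states, actions, next_states: tensors sampled from the replay buffer
        with torch.no_grad():
            phi = self.encoder(states)
            phi_next = self.encoder(next_states)
        predicted_next = self.fwd(phi, actions)  # assumed call signature
        loss = torch.nn.functional.mse_loss(predicted_next, phi_next)
        self.fwd_optim.zero_grad()
        loss.backward()
        self.fwd_optim.step()
        return loss.item()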