def __init__(self, num_hidden_units, input_dim, num_actions, num_q_val,
             observation_dim, goal_dim, batch_size, use_cuda, gamma,
             random_seed, actor_optimizer, critic_optimizer,
             actor_learning_rate, critic_learning_rate, loss_function,
             polyak_constant, buffer_capacity, non_conv=True,
             num_conv_layers=None, num_pool_layers=None,
             conv_kernel_size=None, img_height=None, img_width=None,
             input_channels=None):
    self.num_hidden_units = num_hidden_units
    self.non_conv = non_conv
    self.num_actions = num_actions
    self.num_q = num_q_val
    self.obs_dim = observation_dim
    self.goal_dim = goal_dim
    self.input_dim = input_dim
    self.batch_size = batch_size
    self.cuda = use_cuda
    self.gamma = gamma
    self.seed(random_seed)
    self.actor_optim = actor_optimizer
    self.critic_optim = critic_optimizer
    self.actor_lr = actor_learning_rate
    self.critic_lr = critic_learning_rate
    self.criterion = loss_function
    self.tau = polyak_constant
    self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                      seed=random_seed)

    # Convolution parameters
    self.num_conv = num_conv_layers
    self.pool = num_pool_layers
    self.im_height = img_height
    self.im_width = img_width
    self.conv_kernel_size = conv_kernel_size
    self.input_channels = input_channels

    if non_conv:
        self.target_actor = ActorDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_action=num_actions,
            input=input_dim)
        self.actor = ActorDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_action=num_actions,
            input=input_dim)
        self.target_critic = CriticDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_q_value=num_q_val,
            input=input_dim,
            action_dim=num_actions,
            goal_dim=self.goal_dim)
        self.critic = CriticDDPGNonConvNetwork(
            num_hidden_layers=num_hidden_units,
            output_q_value=num_q_val,
            input=input_dim,
            action_dim=num_actions,
            goal_dim=self.goal_dim)
    else:
        self.target_actor = ActorDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_action=self.num_actions,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)
        self.actor = ActorDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_action=self.num_actions,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)
        self.target_critic = CriticDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_q_value=self.num_q,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)
        self.critic = CriticDDPGNetwork(
            num_conv_layers=self.num_conv,
            conv_kernel_size=self.conv_kernel_size,
            input_channels=self.input_channels,
            output_q_value=self.num_q,
            dense_layer=self.num_hidden_units,
            pool_kernel_size=self.pool,
            IMG_HEIGHT=self.im_height,
            IMG_WIDTH=self.im_width)

    if self.cuda:
        self.target_actor = self.target_actor.cuda()
        self.actor = self.actor.cuda()
        self.target_critic = self.target_critic.cuda()
        self.critic = self.critic.cuda()

    # Initialize the target networks with the standard network weights
    self.target_actor.load_state_dict(self.actor.state_dict())
    self.target_critic.load_state_dict(self.critic.state_dict())

    # Create the optimizers for the actor and critic using the corresponding learning rates
    actor_parameters = self.actor.parameters()
    critic_parameters = self.critic.parameters()
    self.actor_optim = opt.Adam(actor_parameters, lr=self.actor_lr)
    self.critic_optim = opt.Adam(critic_parameters, lr=self.critic_lr)

    # Initialize a random exploration noise
    self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
        self.num_actions)
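
# A minimal standalone sketch (not part of the class above) of the soft
# (Polyak) target-network update that the stored `polyak_constant` (self.tau)
# is typically used for in DDPG; the function name and call site are
# assumptions for illustration, not code from this repository.
def soft_update_sketch(target_net, source_net, tau):
    # Blend source parameters into the target:
    # target <- tau * source + (1 - tau) * target, parameter by parameter.
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(tau * param.data
                                + (1.0 - tau) * target_param.data)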
def __init__(self, env, encoder, forward_dynamics, statistics_network,
             target_policy_network, policy_network, forward_dynamics_lr,
             stats_lr, policy_lr, num_train_epochs, num_frames,
             num_fwd_train_steps, num_stats_train_steps, fwd_dynamics_limit,
             stats_network_limit, policy_limit, size_replay_buffer,
             random_seed, polyak_constant, discount_factor, batch_size,
             action_space, model_output_folder, save_epoch,
             target_stats_network=None, target_fwd_dynamics_network=None,
             clip_rewards=True, clip_augmented_rewards=False,
             print_every=2000, update_network_every=2000, plot_every=5000,
             intrinsic_param=0.01, non_episodic_intrinsic=True,
             use_mine_formulation=True, use_cuda=False, save_models=True,
             plot_stats=False, verbose=True):
    self.encoder = encoder
    self.fwd = forward_dynamics
    self.stats = statistics_network
    self.use_cuda = use_cuda
    self.policy_network = policy_network
    self.target_policy_network = target_policy_network
    self.output_folder = model_output_folder
    self.use_mine_formulation = use_mine_formulation
    self.env = env
    self.train_epochs = num_train_epochs
    self.num_frames = num_frames
    self.num_fwd_train_steps = num_fwd_train_steps
    self.num_stats_train_steps = num_stats_train_steps
    self.fwd_lr = forward_dynamics_lr
    self.stats_lr = stats_lr
    self.policy_lr = policy_lr
    self.random_seed = random_seed
    self.save_models = save_models
    self.plot_stats = plot_stats
    self.verbose = verbose
    self.intrinsic_param = intrinsic_param
    self.save_epoch = save_epoch
    self.clip_rewards = clip_rewards
    self.clip_augmented_rewards = clip_augmented_rewards
    self.max = torch.zeros(1)
    self.min = torch.zeros(1)
    self.fwd_limit = fwd_dynamics_limit
    self.stats_limit = stats_network_limit
    self.policy_limit = policy_limit
    self.print_every = print_every
    self.update_every = update_network_every
    self.plot_every = plot_every
    self.non_episodic = non_episodic_intrinsic
    self.statistics = defaultdict(float)
    self.combined_statistics = defaultdict(list)
    self.target_stats_network = target_stats_network
    self.target_fwd_dynamics_network = target_fwd_dynamics_network

    # Fix the encoder weights
    for param in self.encoder.parameters():
        param.requires_grad = False

    self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                             seed=self.random_seed)
    self.tau = polyak_constant
    self.gamma = discount_factor
    self.batch_size = batch_size
    self.action_space = action_space

    torch.manual_seed(self.random_seed)
    if self.use_cuda:
        torch.cuda.manual_seed(self.random_seed)

    # Move the networks owned by this trainer to the GPU
    if self.use_cuda:
        self.encoder = self.encoder.cuda()
        self.fwd = self.fwd.cuda()
        self.stats = self.stats.cuda()
        self.policy_network = self.policy_network.cuda()
        self.target_policy_network = self.target_policy_network.cuda()

    self.fwd_optim = optim.Adam(params=self.fwd.parameters(), lr=self.fwd_lr)
    self.policy_optim = optim.Adam(params=self.policy_network.parameters(),
                                   lr=self.policy_lr)
    self.stats_optim = optim.Adam(params=self.stats.parameters(),
                                  lr=self.stats_lr)

    # Update the policy and target policy networks
    self.update_networks()
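
# A minimal standalone sketch of the MINE-style (Donsker-Varadhan) mutual
# information lower bound that a statistics network can be trained to
# maximize, as suggested by `use_mine_formulation` above. The
# `stats_net(states, actions)` interface, the batch shapes, and the shuffling
# scheme are assumptions made for illustration only.
import torch

def mine_lower_bound_sketch(stats_net, states, actions):
    # Joint samples: state-action pairs as they actually occurred.
    joint = stats_net(states, actions)
    # Marginal samples: shuffle actions across the batch to break the pairing.
    shuffled = actions[torch.randperm(actions.size(0))]
    marginal = stats_net(states, shuffled)
    # Donsker-Varadhan estimate: E_joint[T] - log E_marginal[exp(T)].
    return joint.mean() - torch.log(torch.exp(marginal).mean() + 1e-8)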
def __init__(self, state_dim, action_dim, hidden_dim, actor, critic,
             value_network, target_value_network, polyak_constant,
             actor_learning_rate, critic_learning_rate, value_learning_rate,
             num_q_value, num_v_value, batch_size, gamma, random_seed,
             num_epochs, num_rollouts, num_eval_rollouts, env, eval_env,
             nb_train_steps, max_episodes_per_epoch, output_folder, use_cuda,
             buffer_capacity, policy_reg_mean_weight=1e-3,
             policy_reg_std_weight=1e-3, policy_preactivation_weight=0,
             verbose=True, plot_stats=False):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.hidden = hidden_dim
    self.q_dim = num_q_value
    self.v_dim = num_v_value
    self.actor = actor
    self.critic = critic
    self.value = value_network
    self.tau = polyak_constant
    self.bs = batch_size
    self.gamma = gamma
    self.seed = random_seed
    self.use_cuda = use_cuda
    self.buffer = Buffer.ReplayBuffer(capacity=buffer_capacity,
                                      seed=self.seed)
    self.policy_reg_mean_weight = policy_reg_mean_weight
    self.policy_reg_std_weight = policy_reg_std_weight
    self.policy_pre_activation_weight = policy_preactivation_weight

    # Training-specific parameters
    self.num_epochs = num_epochs
    self.num_rollouts = num_rollouts
    self.num_eval_rollouts = num_eval_rollouts
    self.env = env
    self.eval_env = eval_env
    self.nb_train_steps = nb_train_steps
    self.max_episodes_per_epoch = max_episodes_per_epoch
    self.statistics = defaultdict(float)
    self.combined_statistics = defaultdict(list)
    self.verbose = verbose
    self.output_folder = output_folder
    self.plot_stats = plot_stats

    self.actor_optim = optim.Adam(lr=actor_learning_rate,
                                  params=self.actor.parameters())
    self.critic_optim = optim.Adam(lr=critic_learning_rate,
                                   params=self.critic.parameters())
    self.value_optim = optim.Adam(lr=value_learning_rate,
                                  params=self.value.parameters())

    self.target_value = target_value_network

    if self.use_cuda:
        self.actor = self.actor.cuda()
        self.critic = self.critic.cuda()
        self.value = self.value.cuda()
        self.target_value = self.target_value.cuda()

    # Initialize the target network with the standard network weights
    self.target_value.load_state_dict(self.value.state_dict())

    # Initialize a random exploration noise
    self.random_noise = random_process.OrnsteinUhlenbeckActionNoise(
        self.action_dim)
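
# A minimal standalone sketch of the Ornstein-Uhlenbeck action noise that
# `random_process.OrnsteinUhlenbeckActionNoise` above provides; the parameter
# values (theta, sigma, dt) and the class interface are common DDPG-style
# defaults assumed for illustration, not necessarily what this repository uses.
import numpy as np

class OUActionNoiseSketch:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.state = np.copy(self.mu)

    def reset(self):
        # Return the process to its mean at the start of a new episode.
        self.state = np.copy(self.mu)

    def sample(self):
        # Euler-Maruyama step: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1)
        dx = (self.theta * (self.mu - self.state) * self.dt
              + self.sigma * np.sqrt(self.dt)
              * np.random.randn(len(self.state)))
        self.state = self.state + dx
        return self.state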
def __init__(self, env, encoder, inverse_dynamics, forward_dynamics,
             source_distribution, statistics_network, target_policy_network,
             policy_network, encoder_lr, inverse_dynamics_lr,
             forward_dynamics_lr, source_d_lr, stats_lr, policy_lr,
             num_train_epochs, num_epochs, num_rollouts, size_replay_buffer,
             size_dqn_replay_buffer, random_seed, polyak_constant,
             discount_factor, batch_size, action_space, observation_space,
             model_output_folder, train_encoder=False,
             use_mine_formulation=True, use_cuda=False):
    self.encoder = encoder
    self.invd = inverse_dynamics
    self.fwd = forward_dynamics
    self.source_distribution = source_distribution
    self.stats = statistics_network
    self.use_cuda = use_cuda
    self.policy_network = policy_network
    self.target_policy_network = target_policy_network
    self.model_output_folder = model_output_folder
    self.use_mine_formulation = use_mine_formulation
    self.env = env
    self.num_epochs = num_epochs
    self.train_epochs = num_train_epochs
    self.num_rollouts = num_rollouts
    self.e_lr = encoder_lr
    self.invd_lr = inverse_dynamics_lr
    self.fwd_lr = forward_dynamics_lr
    self.source_lr = source_d_lr
    self.stats_lr = stats_lr
    self.policy_lr = policy_lr
    self.random_seed = random_seed
    self.replay_buffer = Buffer.ReplayBuffer(capacity=size_replay_buffer,
                                             seed=self.random_seed)
    self.dqn_replay_buffer = Buffer.ReplayBuffer(
        capacity=size_dqn_replay_buffer, seed=self.random_seed)
    self.tau = polyak_constant
    self.gamma = discount_factor
    self.batch_size = batch_size
    self.action_space = action_space
    self.obs_space = observation_space

    torch.manual_seed(self.random_seed)
    if self.use_cuda:
        torch.cuda.manual_seed(self.random_seed)

    if self.use_cuda:
        self.encoder = self.encoder.cuda()
        self.invd = self.invd.cuda()
        self.fwd = self.fwd.cuda()
        self.policy_network = self.policy_network.cuda()
        self.source_distribution = self.source_distribution.cuda()

    # Define the optimizers
    if train_encoder:
        self.e_optim = optim.Adam(params=self.encoder.parameters(),
                                  lr=self.e_lr)
    self.invd_optim = optim.Adam(params=self.invd.parameters(),
                                 lr=self.invd_lr)
    self.fwd_optim = optim.Adam(params=self.fwd.parameters(), lr=self.fwd_lr)
    self.policy_optim = optim.Adam(params=self.policy_network.parameters(),
                                   lr=self.policy_lr)
    self.source_optim = optim.Adam(
        params=self.source_distribution.parameters(), lr=self.source_lr)
    self.stats_optim = optim.Adam(params=self.stats.parameters(),
                                  lr=self.stats_lr)
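
# A minimal standalone sketch of one forward-dynamics update using an
# optimizer like the `fwd_optim` created above; the `fwd_model(states, actions)`
# interface that predicts the next encoded state is an assumption for
# illustration and may not match this repository's forward-dynamics network.
import torch.nn.functional as F

def forward_dynamics_step_sketch(fwd_model, fwd_optim, states, actions,
                                 next_states):
    # Predict the next (encoded) state and regress it onto the observed one.
    predicted_next = fwd_model(states, actions)
    loss = F.mse_loss(predicted_next, next_states)
    fwd_optim.zero_grad()
    loss.backward()
    fwd_optim.step()
    return loss.item()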