Example 1
    def __init__(self,
                 agent,
                 game,
                 buffer_file=None,
                 weights_file=None,
                 n_batches=0):
        self.agent = agent
        self.game = game
        self.replay_buffer = ReplayBuffer()

        if buffer_file is not None:
            with open(buffer_file, "rb") as f:
                self.replay_buffer.buffer = pickle.load(f)

        self.current_network = NestedTTTNet()
        self.control_network = NestedTTTNet()

        if weights_file is not None:
            self.control_network.load_state_dict(torch.load(weights_file))

        self.current_network.load_state_dict(self.control_network.state_dict())
        self.control_network.eval()
        self.current_network.train()

        self.agent.update_control_net(self.control_network)

        self.n_batches = n_batches

        self.optim = torch.optim.Adam(self.current_network.parameters(),
                                      lr=.01,
                                      weight_decay=10e-4)
Example 2
    def __init__(self, state_size, action_size, num_agents):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        
        # Directory where to save the model
        self.model_dir = os.path.join(os.getcwd(), "DDPG", "saved_models")
        os.makedirs(self.model_dir, exist_ok=True)
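The agent above constructs OUNoise(action_size) (the MountainCar agent further down does the same) without showing the class. A minimal sketch of an Ornstein-Uhlenbeck noise process consistent with that call is given below; the method names and the theta/sigma defaults are assumptions, not the original implementation.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start (or restart) the process at its long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then integrate one step.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state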
Example 3
    def __init__(
        self,
        env,
        gamma=0.99,
        polyak=0.995,
        act_noise=0.1,
        render=False,
        batch_size=32,
        q_lr=1e-3,
        p_lr=1e-4,
        d=2,
        buffer_capacity=5000,
        max_episodes=100,
        save_path=None,
        load_path=None,
        print_freq=1,
        start_steps=10000,
        log_dir='logs/train',
        training=True,
    ):
        self.gamma = gamma
        self.polyak = polyak
        self.act_noise = act_noise
        self.render = render
        self.batch_size = batch_size
        self.p_lr = p_lr
        self.q_lr = q_lr
        self.d = d
        self.max_episodes = max_episodes
        self.start_steps = start_steps
        self.actor, self.critic_1, self.critic_2 = create_actor_critic(
            env.observation_space.shape[0], env.action_space.shape[0],
            env.action_space.high)
        self.target_actor, self.target_critic_1, self.target_critic_2 = create_actor_critic(
            env.observation_space.shape[0], env.action_space.shape[0],
            env.action_space.high)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic_1.set_weights(self.critic_1.get_weights())
        self.target_critic_2.set_weights(self.critic_2.get_weights())
        self.env = env
        self.rewards = []
        self.print_freq = print_freq
        self.save_path = save_path

        if training:
            self.buffer = ReplayBuffer(buffer_capacity)
            self.actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.p_lr)
            self.critic_1_optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.q_lr)
            self.critic_2_optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.q_lr)
            self.summary_writer = tf.summary.create_file_writer(log_dir)
            self.mse = tf.keras.losses.MeanSquaredError()
        if load_path is not None:
            self.actor.load_weights(f'{load_path}/actor')
            self.critic_1.load_weights(f'{load_path}/critic_1')
            self.critic_2.load_weights(f'{load_path}/critic_2')
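This TD3-style trainer (and the HIRO code in Examples 8 and 10) relies on a create_actor_critic(state_dim, action_dim, action_range) factory that is not included in the listing. The sketch below shows one plausible shape for it, assuming a tanh-squashed actor rescaled to the action range and two independent critics that take [states, actions] as inputs; the layer sizes and activations are guesses, not the original code.

import tensorflow as tf
from tensorflow.keras import layers


def create_actor_critic(state_dim, action_dim, action_range,
                        hidden_sizes=(256, 256)):
    """Hypothetical factory returning (actor, critic_1, critic_2) as Keras models."""
    # Actor: state -> action, squashed by tanh and rescaled to the env's range.
    state_in = layers.Input(shape=(state_dim,))
    x = state_in
    for units in hidden_sizes:
        x = layers.Dense(units, activation='relu')(x)
    raw_action = layers.Dense(action_dim, activation='tanh')(x)
    action_out = layers.Lambda(lambda t: t * action_range)(raw_action)
    actor = tf.keras.Model(state_in, action_out)

    def build_critic():
        # Critic: (state, action) -> scalar Q-value.
        s_in = layers.Input(shape=(state_dim,))
        a_in = layers.Input(shape=(action_dim,))
        h = layers.Concatenate()([s_in, a_in])
        for units in hidden_sizes:
            h = layers.Dense(units, activation='relu')(h)
        q_out = layers.Dense(1)(h)
        return tf.keras.Model([s_in, a_in], q_out)

    return actor, build_critic(), build_critic()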
Example 4
    def __init__(self,
                 input_dims,
                 n_actions,
                 layer_sizes,
                 act_lr=0.00001,
                 crt_lr=0.0001,
                 tau=0.001,
                 gamma=0.99,
                 max_size=1000000,
                 batch_size=64,
                 chkpt_dir='tmp/ddpg',
                 name='ddpg',
                 layerNorm=True):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.layer_sizes = layer_sizes
        self.layerNorm = layerNorm
        self.gamma = gamma  # discount factor
        self.tau = tau  # target network updating weight
        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)
        self.batch_size = batch_size

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  self.layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=self.layerNorm)
        self.critic = CriticNetwork(crt_lr,
                                    self.input_dims,
                                    self.n_actions,
                                    self.layer_sizes,
                                    name='Critic_' + name,
                                    chkpt_dir=chkpt_dir,
                                    layerNorm=self.layerNorm)

        self.target_actor = ActorNetwork(act_lr,
                                         self.input_dims,
                                         self.n_actions,
                                         self.layer_sizes,
                                         name='TargetActor_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=self.layerNorm)
        self.target_critic = CriticNetwork(crt_lr,
                                           self.input_dims,
                                           self.n_actions,
                                           self.layer_sizes,
                                           name='TargetCritic_' + name,
                                           chkpt_dir=chkpt_dir,
                                           layerNorm=self.layerNorm)

        self.noise = OUActionNoise(mu=np.zeros(self.n_actions))

        self.update_network_parameters(tau=1)
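The agent builds OUActionNoise(mu=np.zeros(self.n_actions)) for exploration, and in the fuller listing of this class (Example 12) the noise object is sampled by calling it directly. A callable Ornstein-Uhlenbeck process consistent with that usage might look like the following sketch; the sigma, theta and dt defaults are assumed values.

import numpy as np


class OUActionNoise:
    """Callable Ornstein-Uhlenbeck noise; each call advances the process one step."""

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev \
            + self.theta * (self.mu - self.x_prev) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        # Restart at x0 if given, otherwise at zero.
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)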
Example 5
    def initialize(self, args):
        BaseModel.initialize(self, args)
        self.input_B = self.Tensor(args.batchSize, 3, 1024, 256)
        self.input_C = self.Tensor(args.batchSize, 1, 1024, 256)

        self.fake_Buffer = ReplayBuffer()

        self.netG_BtoC = networks.define_G(3, 1, 64, 'unet_128', 'batch',
                                           False, args.init_type, self.gpu_ids)
        self.netD_C = networks.define_D(1,
                                        64,
                                        'basic',
                                        norm='batch',
                                        use_sigmoid=False,
                                        gpu_ids=args.gpu_ids)

        self.netG_BtoC.apply(weights_init_normal)
        self.netD_C.apply(weights_init_normal)

        checkpoint_BtoC_filename = 'netG_B2C.pth'
        checkpoint_D_C_filename = 'netD_C.pth'

        checkpoint_path_BtoC = os.path.join(args.checkpoints_dir,
                                            checkpoint_BtoC_filename)
        checkpoint_path_D_C = os.path.join(args.checkpoints_dir,
                                           checkpoint_D_C_filename)

        # Load checkpoint
        # self.netG_BtoC.load_state_dict(torch.load(checkpoint_path_BtoC))
        # self.netD_C.load_state_dict(torch.load(checkpoint_path_D_C))

        # define loss
        self.criterionGAN = torch.nn.MSELoss()
        self.criterionReconstruction = torch.nn.L1Loss().cuda()

        # init optimizer
        self.optimizer_G = torch.optim.Adam(self.netG_BtoC.parameters(),
                                            lr=0.0002,
                                            betas=(0.5, 0.999))
        self.optimizer_D = torch.optim.Adam(self.netD_C.parameters(),
                                            lr=0.0002,
                                            betas=(0.5, 0.999))

        self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_G,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
        self.lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_D,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
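Here ReplayBuffer() is used as an image history pool for the discriminator rather than as a transition buffer (the same pattern appears in Example 9). A minimal sketch of such a pool follows; the push_and_pop name and the 50-image capacity are assumptions borrowed from the usual CycleGAN recipe, not code confirmed by this example.

import random

import torch


class ReplayBuffer:
    """Image pool that feeds the discriminator a mix of current and past fakes."""

    def __init__(self, max_size=50):
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, batch):
        # Store incoming fake images and return a batch of the same size in which
        # each image is either the new one or a randomly recalled old one.
        to_return = []
        for element in batch.detach():
            element = element.unsqueeze(0)
            if len(self.data) < self.max_size:
                self.data.append(element)
                to_return.append(element)
            elif random.uniform(0, 1) > 0.5:
                idx = random.randint(0, self.max_size - 1)
                to_return.append(self.data[idx].clone())
                self.data[idx] = element
            else:
                to_return.append(element)
        return torch.cat(to_return)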
Example 6
	def __init__(self, gamma=0.999, buffer_size=1e5, batch_size=1024,
                 episodes_nr=50000, tau=2e-2, gym_name='MountainCarContinuous-v0'):
         
		self.lr_actor = 5e-3				# learning rate for the actor
		self.lr_critic = 1e-3			# learning rate for the critic
		self.lr_decay = 1				# learning rate decay (per episode)
		self.l2_reg_actor = 1e-7			# L2 regularization factor for the actor
		self.l2_reg_critic = 1e-7		# L2 regularization factor for the critic
         
		self.num_episodes = episodes_nr		# number of episodes
		self.max_steps_ep = 10000	# default max number of steps per episode (unless env has a lower hardcoded limit)
		self.train_every = 1			# number of steps to run the policy (and collect experience) before updating network weights
		self.replay_memory_capacity = buffer_size	# capacity of experience replay memory
		
		self.batch_size = batch_size
		self.memory = ReplayBuffer(int(buffer_size))
		self.episodes_nr = episodes_nr
		self.gamma = gamma
		self.tau = tau
        
		self.env = gym.make(gym_name)
		assert(self.env.action_space.high == -self.env.action_space.low)
		self.action_range = self.env.action_space.high[0]
        
		self.action_dim = np.prod(np.array(self.env.action_space.shape))
		self.state_dim = np.prod(np.array(self.env.observation_space.shape))
        
		#self.noise = OUNoise(self.action_dim)
		self.action_range = self.env.action_space.high - self.env.action_space.low
		
		self.initial_noise_scale = 0.1	# scale of the exploration noise process (1.0 is the range of each action dimension)
		self.noise_decay = 1 #0.99		# decay rate (per episode) of the scale of the exploration noise process
		self.exploration_mu = 0.0	# mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
		self.exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
		self.exploration_sigma = 0.2	# sigma parameter for the exploration noise process: dXt = theta*(mu-Xt	)*dt + sigma*dWt

		self.noise = OUNoise(self.action_dim)
Example 7
	def __init__(self, task):
         
		self.lr_actor = 5e-3				# learning rate for the actor
		self.lr_critic = 1e-3			# learning rate for the critic
		#self.lr_decay = 1				# learning rate decay (per episode)
		self.l2_reg_actor = 1e-7			# L2 regularization factor for the actor
		self.l2_reg_critic = 1e-7		# L2 regularization factor for the critic
         
		#self.num_episodes = 2000		# number of episodes
		#self.max_steps_ep = 10000	# default max number of steps per episode (unless env has a lower hardcoded limit)
		
		self.batch_size = 1024
		self.memory = ReplayBuffer(int(1e5))
		#self.episodes_nr = 10000
		self.gamma = 0.999
		self.tau = 2e-2
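Examples 3, 6, 7, 8 and 10 all construct a transition-style ReplayBuffer(capacity) (sometimes with a tuple_length argument) and call add(), sample() and len() on it. A deque-based sketch consistent with those call sites is shown below; the internal layout and the column-wise numpy conversion in sample() are assumptions.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Hypothetical fixed-capacity buffer of transition tuples."""

    def __init__(self, capacity, tuple_length=4):
        # tuple_length documents how many fields each stored transition has
        # (e.g. 4 for (s, a, r, s'), 5 when a goal is added).
        self.tuple_length = tuple_length
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        # Return one numpy array per field: states, actions, rewards, ...
        batch = random.sample(self.buffer, batch_size)
        return [np.asarray(column) for column in zip(*batch)]

    def __len__(self):
        return len(self.buffer)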
Example 8
    def __init__(self,
                 env,
                 gamma=0.99,
                 polyak=0.995,
                 c=10,
                 d=2,
                 high_act_noise=0.1,
                 low_act_noise=0.1,
                 high_rew_scale=0.1,
                 low_rew_scale=1.0,
                 render=False,
                 batch_size=32,
                 q_lr=1e-3,
                 p_lr=1e-4,
                 buffer_capacity=5000,
                 max_episodes=100,
                 save_path=None,
                 load_path=None,
                 print_freq=1,
                 log_dir='logs/train',
                 training=True
                 ):
        self.gamma = gamma
        self.polyak = polyak
        self.low_act_noise = low_act_noise
        self.high_act_noise = high_act_noise
        self.low_rew_scale = low_rew_scale
        self.high_rew_scale = high_rew_scale
        self.render = render
        self.batch_size = batch_size
        self.p_lr = p_lr
        self.q_lr = q_lr
        self.max_episodes = max_episodes
        self.env = env
        self.rewards = []
        self.print_freq = print_freq
        self.save_path = save_path
        self.c = c
        self.d = d
        self.higher_buffer = ReplayBuffer(buffer_capacity, tuple_length=5)
        self.lower_buffer = ReplayBuffer(buffer_capacity, tuple_length=4)

        self.low_actor, self.low_critic_1, self.low_critic_2 = create_actor_critic(
            state_dim=2 * env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_range=env.action_space.high)

        self.low_target_actor, self.low_target_critic_1, self.low_target_critic_2 = create_actor_critic(
            state_dim=2 * env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_range=env.action_space.high)

        self.high_actor, self.high_critic_1, self.high_critic_2 = create_actor_critic(
            state_dim=env.observation_space.shape[0],
            action_dim=env.observation_space.shape[0],
            action_range=env.observation_space.high)

        self.high_target_actor, self.high_target_critic_1, self.high_target_critic_2 = create_actor_critic(
            state_dim=env.observation_space.shape[0],
            action_dim=env.observation_space.shape[0],
            action_range=env.observation_space.high)
        self.low_target_actor.set_weights(self.low_actor.get_weights())
        self.low_target_critic_1.set_weights(self.low_critic_1.get_weights())
        self.low_target_critic_2.set_weights(self.low_critic_2.get_weights())
        self.high_target_actor.set_weights(self.high_actor.get_weights())
        self.high_target_critic_1.set_weights(self.high_critic_1.get_weights())
        self.high_target_critic_2.set_weights(self.high_critic_2.get_weights())

        if training:
            self.low_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
            self.low_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.low_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.high_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
            self.high_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.high_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.mse = tf.keras.losses.MeanSquaredError()
            self.summary_writer = tf.summary.create_file_writer(log_dir)

            self.low_actor_train_fn = self.create_train_step_actor_fn(self.low_actor, self.low_critic_1,
                                                                      self.low_actor_optimizer)
            self.low_critic_train_fns = [self.create_train_step_critic_fn(critic=c, optimizer=o) for c, o in
                                         [(self.low_critic_1, self.low_critic_1_optimizer),
                                          (self.low_critic_2, self.low_critic_2_optimizer)]]

            self.high_actor_train_fn = self.create_train_step_actor_fn(self.high_actor, self.high_critic_1,
                                                                       self.high_actor_optimizer)
            self.high_critic_train_fns = [self.create_train_step_critic_fn(critic=c, optimizer=o) for c, o in
                                          [(self.high_critic_1, self.high_critic_1_optimizer),
                                           (self.high_critic_2, self.high_critic_2_optimizer)]]
        if load_path is not None:
            self.low_actor.load_weights(f'{load_path}/low/actor')
            self.low_critic_1.load_weights(f'{load_path}/low/critic_1')
            self.low_critic_2.load_weights(f'{load_path}/low/critic_2')
            self.high_actor.load_weights(f'{load_path}/high/actor')
            self.high_critic_1.load_weights(f'{load_path}/high/critic_1')
            self.high_critic_2.load_weights(f'{load_path}/high/critic_2')
Example 9
                                                         opt.n_epochs, 0,
                                                         opt.decay_epoch).step)
lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(optimizer_D_B,
                                                     lr_lambda=LambdaLR(
                                                         opt.n_epochs, 0,
                                                         opt.decay_epoch).step)

# Data
dataA = np.random.randn(3, 256, 256).astype(np.float32)
dataB = np.random.randn(3, 256, 256).astype(np.float32)
train = torch.utils.data.TensorDataset(torch.from_numpy(dataA),
                                       torch.from_numpy(dataB))
data_loader = torch.utils.data.DataLoader(train,
                                          batch_size=opt.batchSize,
                                          shuffle=True)
buffer_A = ReplayBuffer()
buffer_B = ReplayBuffer()

# targets
real_target = Tensor(opt.batchSize).fill_(1.0)
fake_target = Tensor(opt.batchSize).fill_(0.0)

# Parameters
lambda_identity = 5.0
lambda_cycle = 10.0

for epoch in range(opt.n_epochs):
    for i, data in enumerate(data_loader):
        realA, realB = data

        ### Generator
Example 10
class HIRO:
    def __init__(self,
                 env,
                 gamma=0.99,
                 polyak=0.995,
                 c=10,
                 d=2,
                 high_act_noise=0.1,
                 low_act_noise=0.1,
                 high_rew_scale=0.1,
                 low_rew_scale=1.0,
                 render=False,
                 batch_size=32,
                 q_lr=1e-3,
                 p_lr=1e-4,
                 buffer_capacity=5000,
                 max_episodes=100,
                 save_path=None,
                 load_path=None,
                 print_freq=1,
                 log_dir='logs/train',
                 training=True
                 ):
        self.gamma = gamma
        self.polyak = polyak
        self.low_act_noise = low_act_noise
        self.high_act_noise = high_act_noise
        self.low_rew_scale = low_rew_scale
        self.high_rew_scale = high_rew_scale
        self.render = render
        self.batch_size = batch_size
        self.p_lr = p_lr
        self.q_lr = q_lr
        self.max_episodes = max_episodes
        self.env = env
        self.rewards = []
        self.print_freq = print_freq
        self.save_path = save_path
        self.c = c
        self.d = d
        self.higher_buffer = ReplayBuffer(buffer_capacity, tuple_length=5)
        self.lower_buffer = ReplayBuffer(buffer_capacity, tuple_length=4)

        self.low_actor, self.low_critic_1, self.low_critic_2 = create_actor_critic(
            state_dim=2 * env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_range=env.action_space.high)

        self.low_target_actor, self.low_target_critic_1, self.low_target_critic_2 = create_actor_critic(
            state_dim=2 * env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_range=env.action_space.high)

        self.high_actor, self.high_critic_1, self.high_critic_2 = create_actor_critic(
            state_dim=env.observation_space.shape[0],
            action_dim=env.observation_space.shape[0],
            action_range=env.observation_space.high)

        self.high_target_actor, self.high_target_critic_1, self.high_target_critic_2 = create_actor_critic(
            state_dim=env.observation_space.shape[0],
            action_dim=env.observation_space.shape[0],
            action_range=env.observation_space.high)
        self.low_target_actor.set_weights(self.low_actor.get_weights())
        self.low_target_critic_1.set_weights(self.low_critic_1.get_weights())
        self.low_target_critic_2.set_weights(self.low_critic_2.get_weights())
        self.high_target_actor.set_weights(self.high_actor.get_weights())
        self.high_target_critic_1.set_weights(self.high_critic_1.get_weights())
        self.high_target_critic_2.set_weights(self.high_critic_2.get_weights())

        if training:
            self.low_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
            self.low_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.low_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.high_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
            self.high_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.high_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.mse = tf.keras.losses.MeanSquaredError()
            self.summary_writer = tf.summary.create_file_writer(log_dir)

            self.low_actor_train_fn = self.create_train_step_actor_fn(self.low_actor, self.low_critic_1,
                                                                      self.low_actor_optimizer)
            self.low_critic_train_fns = [self.create_train_step_critic_fn(critic=c, optimizer=o) for c, o in
                                         [(self.low_critic_1, self.low_critic_1_optimizer),
                                          (self.low_critic_2, self.low_critic_2_optimizer)]]

            self.high_actor_train_fn = self.create_train_step_actor_fn(self.high_actor, self.high_critic_1,
                                                                       self.high_actor_optimizer)
            self.high_critic_train_fns = [self.create_train_step_critic_fn(critic=c, optimizer=o) for c, o in
                                          [(self.high_critic_1, self.high_critic_1_optimizer),
                                           (self.high_critic_2, self.high_critic_2_optimizer)]]
        if load_path is not None:
            self.low_actor.load_weights(f'{load_path}/low/actor')
            self.low_critic_1.load_weights(f'{load_path}/low/critic_1')
            self.low_critic_2.load_weights(f'{load_path}/low/critic_2')
            self.high_actor.load_weights(f'{load_path}/high/actor')
            self.high_critic_1.load_weights(f'{load_path}/high/critic_1')
            self.high_critic_2.load_weights(f'{load_path}/high/critic_2')

    @staticmethod
    def goal_transition(state, goal, next_state):
        return state + goal - next_state

    @staticmethod
    def intrinsic_reward(state, goal, next_state):
        return - np.linalg.norm(state + goal - next_state)

    def act(self, obs, goal, noise=False):
        norm_dist = tf.random.normal(self.env.action_space.shape, stddev=0.1 * self.env.action_space.high)
        action = self.low_actor(np.concatenate((obs, goal), axis=1)).numpy()
        action = np.clip(action + (norm_dist.numpy() if noise else 0),
                         a_min=self.env.action_space.low,
                         a_max=self.env.action_space.high)
        return action

    def get_goal(self, obs, noise=False):
        norm_dist = tf.random.normal(self.env.observation_space.shape, stddev=0.1 * self.env.observation_space.high)
        action = self.high_actor(obs).numpy()
        action = np.clip(action + (norm_dist.numpy() if noise else 0),
                         a_min=self.env.observation_space.low,
                         a_max=self.env.observation_space.high)
        return action

    @tf.function
    def log_probability(self, states, actions, candidate_goal):
        goals = tf.reshape(candidate_goal, (1, -1))

        def body(curr_i, curr_goals, s):
            new_goals = tf.concat(
                (curr_goals,
                 tf.reshape(self.goal_transition(s[curr_i - 1], curr_goals[curr_i - 1], s[curr_i]), (1, -1))), axis=0)
            curr_i += 1
            return [curr_i, new_goals, s]

        def condition(curr_i, curr_goals, s):
            return curr_i < s.shape[0] and not (
                    tf.equal(tf.math.count_nonzero(s[curr_i]), 0) and tf.equal(tf.math.count_nonzero(actions[curr_i]),
                                                                               0))

        # If a state-action pair is all zero, then the episode ended before an entire sequence of length c was recorded.
        # We must remove these empty states and actions from the log probability calculation, as they could skew the
        #   argmax computation
        i = tf.constant(1)
        i, goals, states = tf.while_loop(condition, body, [i, goals, states],
                                         shape_invariants=[tf.TensorShape(None), tf.TensorShape([None, goals.shape[1]]),
                                                           states.shape])
        states = states[:i, :]
        actions = actions[:i, :]

        action_predictions = self.low_actor(tf.concat((states, goals), axis=1))
        return -(1 / 2) * tf.reduce_sum(tf.linalg.norm(actions - action_predictions, axis=1))

    @tf.function
    def off_policy_correct(self, states, goals, actions, new_states):
        first_states = tf.reshape(states, (self.batch_size, -1))[:, :new_states[0].shape[0]]
        means = new_states - first_states
        std_dev = 0.5 * (1 / 2) * tf.convert_to_tensor(self.env.observation_space.high)

        for i in range(states.shape[0]):
            # Sample eight candidate goals sampled randomly from a Gaussian centered at s_{t+c} - s_t
            # Include the original goal and a goal corresponding to the difference s_{t+c} - s_t
            # TODO: clip the random actions to lie within the high-level action range
            candidate_goals = tf.concat(
                (tf.random.normal(shape=(8, self.env.observation_space.shape[0]), mean=means[i], stddev=std_dev),
                 tf.reshape(goals[i], (1, -1)), tf.reshape(means[i], (1, -1))),
                axis=0)

            chosen_goal = tf.argmax(
                [self.log_probability(states[i], actions[i], candidate_goals[g]) for g in
                 range(candidate_goals.shape[0])])
            goals = tf.tensor_scatter_nd_update(goals, [[i]], [candidate_goals[chosen_goal]])

        return first_states, goals

    @tf.function
    def train_step_critics(self, states, actions, rewards, next_states, actor, target_critic_1,
                           target_critic_2, critic_trains_fns, target_noise,
                           scope='Policy'):
        target_goal_preds = actor(next_states)
        target_goal_preds += target_noise

        target_q_values_1 = target_critic_1([next_states, target_goal_preds])
        target_q_values_2 = target_critic_2([next_states, target_goal_preds])

        target_q_values = tf.concat((target_q_values_1, target_q_values_2), axis=1)
        target_q_values = tf.reshape(tf.reduce_min(target_q_values, axis=1), (self.batch_size, -1))
        targets = rewards + self.gamma * target_q_values

        critic_trains_fns[0](states, actions, targets, scope=scope, label='Critic 1')
        critic_trains_fns[1](states, actions, targets, scope=scope, label='Critic 2')

    def create_train_step_actor_fn(self, actor, critic, optimizer):
        @tf.function
        def train_step_actor(states, scope='policy', label='actor'):
            with tf.GradientTape() as tape:
                action_predictions = actor(states)
                q_values = critic([states, action_predictions])
                policy_loss = -tf.reduce_mean(q_values)
            gradients = tape.gradient(policy_loss, actor.trainable_variables)
            optimizer.apply_gradients(zip(gradients, actor.trainable_variables))

            with tf.name_scope(scope):
                with self.summary_writer.as_default():
                    tf.summary.scalar(f'{label} Policy Loss', policy_loss, step=optimizer.iterations)

        return train_step_actor

    def create_train_step_critic_fn(self, critic, optimizer):
        @tf.function
        def train_step_critic(states, actions, targets, scope='Policy', label='Critic'):
            with tf.GradientTape() as tape:
                q_values = critic([states, actions])
                mse_loss = self.mse(q_values, targets)
            gradients = tape.gradient(mse_loss, critic.trainable_variables)
            optimizer.apply_gradients(zip(gradients, critic.trainable_variables))

            with tf.name_scope(scope):
                with self.summary_writer.as_default():
                    tf.summary.scalar(f'{label} MSE Loss', mse_loss, step=optimizer.iterations)
                    tf.summary.scalar(f'{label} Mean Q Values', tf.reduce_mean(q_values), step=optimizer.iterations)

        return train_step_critic

    def update_lower(self):
        if len(self.lower_buffer) >= self.batch_size:
            states, actions, rewards, next_states = self.lower_buffer.sample(self.batch_size)
            rewards = rewards.reshape(-1, 1).astype(np.float32)

            self.train_step_critics(states, actions, rewards, next_states, self.low_actor, self.low_target_critic_1,
                                    self.low_target_critic_2,
                                    self.low_critic_train_fns,
                                    target_noise=tf.random.normal(actions.shape,
                                                                  stddev=0.1 * self.env.action_space.high),
                                    scope='Lower_Policy')

            if self.low_critic_1_optimizer.iterations % self.d == 0:
                self.low_actor_train_fn(states, scope='Lower_Policy', label='Actor')

                # Update target networks
                polyak_average(self.low_actor.variables, self.low_target_actor.variables, self.polyak)
                polyak_average(self.low_critic_1.variables, self.low_target_critic_1.variables, self.polyak)
                polyak_average(self.low_critic_2.variables, self.low_target_critic_2.variables, self.polyak)

    def update_higher(self):
        if len(self.higher_buffer) >= self.batch_size:
            states, goals, actions, rewards, next_states = self.higher_buffer.sample(self.batch_size)
            rewards = rewards.reshape((-1, 1))

            states, goals, actions, rewards, next_states = (tf.convert_to_tensor(states, dtype=tf.float32),
                                                            tf.convert_to_tensor(goals, dtype=tf.float32),
                                                            tf.convert_to_tensor(actions, dtype=tf.float32),
                                                            tf.convert_to_tensor(rewards, dtype=tf.float32),
                                                            tf.convert_to_tensor(next_states, dtype=tf.float32))

            states, goals = self.off_policy_correct(states=states, goals=goals, actions=actions, new_states=next_states)

            self.train_step_critics(states, goals, rewards, next_states, self.high_actor, self.high_target_critic_1,
                                    self.high_target_critic_2,
                                    self.high_critic_train_fns,
                                    target_noise=tf.random.normal(next_states.shape,
                                                                  stddev=0.1 * self.env.observation_space.high),
                                    scope='Higher_Policy')

            if self.high_critic_1_optimizer.iterations % self.d == 0:
                self.high_actor_train_fn(states, scope='Higher_Policy', label='Actor')

                # Update target networks
                polyak_average(self.high_actor.variables, self.high_target_actor.variables, self.polyak)
                polyak_average(self.high_critic_1.variables, self.high_target_critic_1.variables, self.polyak)
                polyak_average(self.high_critic_2.variables, self.high_target_critic_2.variables, self.polyak)

    def learn(self):
        # Collect experiences s_t, g_t, a_t, R_t
        mean_reward = None
        total_steps = 0

        for ep in range(self.max_episodes):
            if ep % self.print_freq == 0 and ep > 0:
                new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])

                print(f"-------------------------------------------------------")
                print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                print(f"Total Episodes: {ep}")
                print(f"Total Steps: {total_steps}")
                print(f"-------------------------------------------------------")

                total_steps = 0
                with tf.name_scope('Episodic Information'):
                    with self.summary_writer.as_default():
                        tf.summary.scalar(f'Mean {self.print_freq} Episode Reward', new_mean_reward,
                                          step=ep // self.print_freq)

                # Model saving inspired by Open AI Baseline implementation
                if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                    print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}")
                    print(f'Location: {self.save_path}')
                    mean_reward = new_mean_reward

                    self.low_actor.save_weights(f'{self.save_path}/low/actor')
                    self.low_critic_1.save_weights(f'{self.save_path}/low/critic_1')
                    self.low_critic_2.save_weights(f'{self.save_path}/low/critic_2')
                    self.high_actor.save_weights(f'{self.save_path}/high/actor')
                    self.high_critic_1.save_weights(f'{self.save_path}/high/critic_1')
                    self.high_critic_2.save_weights(f'{self.save_path}/high/critic_2')

            obs = self.env.reset()
            goal = self.get_goal(obs.reshape((1, -1)), noise=True).flatten()
            higher_goal = goal
            higher_obs = []
            higher_actions = []
            higher_reward = 0
            episode_reward = 0
            episode_intrinsic_rewards = 0
            ep_len = 0
            c = 0

            done = False
            while not done:
                if self.render:
                    self.env.render()
                action = self.act(obs.reshape((1, -1)), goal.reshape((1, -1)), noise=True).flatten()
                new_obs, rew, done, info = self.env.step(action)
                new_obs = new_obs.flatten()
                new_goal = self.goal_transition(obs, goal, new_obs)
                episode_reward += rew

                # Goals are treated as additional state information for the low level
                # policy. Store transitions in respective replay buffers
                intrinsic_reward = self.intrinsic_reward(obs, goal, new_obs) * self.low_rew_scale
                self.lower_buffer.add((np.concatenate((obs, goal)), action,
                                       intrinsic_reward,
                                       np.concatenate((new_obs, new_goal)),))
                episode_intrinsic_rewards += intrinsic_reward

                self.update_lower()

                # Fill lists for single higher level transition
                higher_obs.append(obs)
                higher_actions.append(action)
                higher_reward += self.high_rew_scale * rew

                # Only add transitions to the high level replay buffer every c steps
                c += 1
                if c == self.c or done:
                    # Need all higher level transitions to be the same length
                    # fill the rest of this transition with zeros
                    while c < self.c:
                        higher_obs.append(np.full(self.env.observation_space.shape, 0))
                        higher_actions.append(np.full(self.env.action_space.shape, 0))
                        c += 1
                    self.higher_buffer.add((higher_obs, higher_goal, higher_actions, higher_reward, new_obs))

                    self.update_higher()
                    c = 0
                    higher_obs = []
                    higher_actions = []
                    higher_reward = 0
                    goal = self.get_goal(new_obs.reshape((1, -1)), noise=True).flatten()
                    higher_goal = goal

                obs = new_obs
                goal = new_goal

            with tf.name_scope('Episodic Information'):
                with self.summary_writer.as_default():
                    tf.summary.scalar(f'Episode Environment Reward', episode_reward, step=ep)
                    tf.summary.scalar(f'Episode Intrinsic Reward', episode_intrinsic_rewards, step=ep)

            self.rewards.append(episode_reward)
            total_steps += ep_len
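update_lower and update_higher call a polyak_average helper that is not defined in this listing. Under the usual reading of the polyak argument (0.995 keeps most of the old target weights), a minimal sketch is:

def polyak_average(source_variables, target_variables, polyak):
    """Soft-update TensorFlow variables in place:
    target <- polyak * target + (1 - polyak) * source."""
    for source, target in zip(source_variables, target_variables):
        target.assign(polyak * target + (1.0 - polyak) * source)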
Example 11
class SelfPlayTrainer:
    def __init__(self,
                 agent,
                 game,
                 buffer_file=None,
                 weights_file=None,
                 n_batches=0):
        self.agent = agent
        self.game = game
        self.replay_buffer = ReplayBuffer()

        if buffer_file is not None:
            with open(buffer_file, "rb") as f:
                self.replay_buffer.buffer = pickle.load(f)

        self.current_network = NestedTTTNet()
        self.control_network = NestedTTTNet()

        if weights_file is not None:
            self.control_network.load_state_dict(torch.load(weights_file))

        self.current_network.load_state_dict(self.control_network.state_dict())
        self.control_network.eval()
        self.current_network.train()

        self.agent.update_control_net(self.control_network)

        self.n_batches = n_batches

        self.optim = torch.optim.Adam(self.current_network.parameters(),
                                      lr=.01,
                                      weight_decay=10e-4)

    def generate_self_play_data(self, n_games=100):
        for _ in range(n_games):
            turn_num = 0
            self.game.reset()
            self.agent.reset()
            result = 0
            player_num = 0

            states = []
            move_vectors = []

            while len(self.game.get_valid_moves()) > 0:
                move, move_probs = self.agent.search(self.game.copy(),
                                                     turn_num,
                                                     allotted_playouts=400)

                states.append(self.game.state.tolist())
                move_vectors.append(move_probs)

                result = self.game.make_move(move)
                if not result:
                    self.game.switch_player()
                    self.agent.take_action(move)
                    turn_num += 1
                    player_num = (player_num + 1) % 2

            if not result:
                self.replay_buffer.extend(
                    list(zip(states, move_vectors, zero_gen())))
            else:
                self.replay_buffer.extend(
                    list(
                        zip(states[::-1], move_vectors[::-1],
                            one_neg_one_gen()))[::-1])

    def compare_control_to_train(self):
        self.current_network.eval()
        old_agent = AlphaMCTSAgent(control_net=self.control_network)
        new_agent = AlphaMCTSAgent(control_net=self.current_network)

        agents = [old_agent, new_agent]

        wins = 0
        ties = 0

        game = self.game.copy()

        for game_num in range(100):
            game.reset()
            agents[0].reset()
            agents[1].reset()
            result = 0
            player_num = game_num // 50  #Both take first turn 50 times
            turn_num = 100  #Turn down the temperature

            while len(game.get_valid_moves()) > 0:
                move, _ = agents[player_num].search(game.copy(),
                                                    turn_num,
                                                    allotted_playouts=800)
                _, _ = agents[1 - player_num].search(game.copy(),
                                                     turn_num,
                                                     allotted_playouts=800)

                result = game.make_move(move)
                if not result:
                    game.switch_player()
                    agents[0].take_action(move)
                    agents[1].take_action(move)
                    player_num = (player_num + 1) % 2

            if not result:
                ties += 1
            elif result and player_num == 1:
                wins += 1

            print("After {} games, {} wins and {} ties".format(
                game_num + 1, wins, ties))

        if wins + .5 * ties >= 55:
            print(
                "Challenger network won {} games and tied {} games; it becomes new control network"
                .format(wins, ties))
            torch.save(self.current_network.state_dict(),
                       "control_weights_{}.pth".format(self.n_batches))
            self.control_network.load_state_dict(
                self.current_network.state_dict())
        else:
            print(
                "Challenger network not sufficiently better; {} wins and {} ties"
                .format(wins, ties))

        self.control_network.eval()
        self.current_network.train()

    def train_on_batch(self, batch_size=32):
        if len(self.replay_buffer) < batch_size:
            return

        self.current_network.train()

        sample = self.replay_buffer.sample(batch_size)
        states, probs, rewards = zip(*sample)
        states = torch.FloatTensor(states).requires_grad_(True)
        probs = torch.FloatTensor(probs).requires_grad_(True)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).requires_grad_(True)
        self.optim.zero_grad()

        ps, vs = self.current_network(states)

        loss = torch.nn.functional.mse_loss(
            vs, rewards) - (ps.log() * probs).sum()
        loss.backward()

        self.optim.step()

        self.n_batches += 1

        return loss.item()

    def run(self,
            total_runs=10,
            self_play_games=100,
            training_batches=200,
            batch_size=32):
        losses = []
        for run_num in range(1, total_runs + 1):
            print("Run {} of {}".format(run_num, total_runs))
            for selfplay_num in range(1, self_play_games + 1):
                self.generate_self_play_data(1)
                print("\tFinished self-play game {} of {} (Buffer size {})".
                      format(selfplay_num, self_play_games,
                             len(self.replay_buffer)))
            print("Finished {} self-play games".format(self_play_games))
            for _ in range(training_batches):
                losses.append(self.train_on_batch(batch_size))
                if len(losses) == 5:
                    print("\tLoss for last 5 batches: {}".format(sum(losses)))
                    losses = []

        self.compare_control_to_train()
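generate_self_play_data zips the recorded states against zero_gen() and one_neg_one_gen(), neither of which appears in this listing. A plausible sketch, assuming drawn games are labelled 0 and decisive games alternate +1/-1 from the final position backwards, is:

import itertools


def zero_gen():
    # Endless stream of 0 value targets for drawn games.
    return itertools.repeat(0)


def one_neg_one_gen():
    # Alternating +1/-1 value targets; zipped against the reversed move list
    # so the winner's positions score +1 and the loser's score -1.
    return itertools.cycle((1, -1))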
Example 12
class Agent(object):
    def __init__(self,
                 input_dims,
                 n_actions,
                 layer_sizes,
                 act_lr=0.00001,
                 crt_lr=0.0001,
                 tau=0.001,
                 gamma=0.99,
                 max_size=1000000,
                 batch_size=64,
                 chkpt_dir='tmp/ddpg',
                 name='ddpg',
                 layerNorm=True):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.layer_sizes = layer_sizes
        self.layerNorm = layerNorm
        self.gamma = gamma  # discount factor
        self.tau = tau  # target network updating weight
        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)
        self.batch_size = batch_size

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  self.layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=self.layerNorm)
        self.critic = CriticNetwork(crt_lr,
                                    self.input_dims,
                                    self.n_actions,
                                    self.layer_sizes,
                                    name='Critic_' + name,
                                    chkpt_dir=chkpt_dir,
                                    layerNorm=self.layerNorm)

        self.target_actor = ActorNetwork(act_lr,
                                         self.input_dims,
                                         self.n_actions,
                                         self.layer_sizes,
                                         name='TargetActor_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=self.layerNorm)
        self.target_critic = CriticNetwork(crt_lr,
                                           self.input_dims,
                                           self.n_actions,
                                           self.layer_sizes,
                                           name='TargetCritic_' + name,
                                           chkpt_dir=chkpt_dir,
                                           layerNorm=self.layerNorm)

        self.noise = OUActionNoise(mu=np.zeros(self.n_actions))

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = T.tensor(observation,
                               dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise() * 0.05, dtype=T.float).to(
            self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.critic.device)
        # done = T.tensor(done).to(self.critic.device)
        new_state = T.tensor(new_state, dtype=T.float).to(self.critic.device)
        action = T.tensor(action, dtype=T.float).to(self.critic.device)
        state = T.tensor(state, dtype=T.float).to(self.critic.device)

        # calculate target
        self.target_actor.eval()
        self.target_critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state,
                                                   target_actions).view(-1)
        # critic_value_[done] = 0.0    # In building context, terminal state does not have value of 0
        target = reward + self.gamma * critic_value_

        # train critic
        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_value = self.critic.forward(state, action).view(-1)
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        # train actor
        self.critic.eval()
        self.actor.train()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

        return critic_loss.item(), actor_loss.item()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        updated_actor = update_single_target_network_parameters(
            self.actor, self.target_actor, tau)
        updated_critic = update_single_target_network_parameters(
            self.critic, self.target_critic, tau)

        self.target_actor.load_state_dict(updated_actor)
        self.target_critic.load_state_dict(updated_critic)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        # self.target_actor.save_checkpoint(modelName)
        self.critic.save_checkpoint()
        # self.target_critic.save_checkpoint(modelName)

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        # self.target_actor.load_checkpoint(modelName)
        self.critic.load_checkpoint()
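Both this agent and the TD3 agent that follows delegate their soft target updates to update_single_target_network_parameters, which returns a state dict that is then loaded into the target network. A PyTorch sketch matching that contract (the parameter handling details are assumptions) is:

def update_single_target_network_parameters(network, target_network, tau):
    """Return a soft-updated state dict:
    new_target = tau * network + (1 - tau) * target_network."""
    params = dict(network.named_parameters())
    target_params = dict(target_network.named_parameters())

    updated = {}
    for name in params:
        updated[name] = tau * params[name].clone() + \
            (1 - tau) * target_params[name].clone()
    return updated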
Example 13
class Agent():
    def __init__(self,
                 input_dims,
                 n_actions,
                 layer_sizes,
                 act_lr=0.00001,
                 crt_lr=0.0001,
                 tau=0.001,
                 gamma=0.99,
                 max_size=1000000,
                 batch_size=64,
                 update_actor_interval=2,
                 noise=0.1,
                 noise_targetAct=0.2,
                 chkpt_dir='tmp/td3',
                 name='td3',
                 layerNorm=True):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.tau = tau
        self.max_action = 1
        self.min_action = -1
        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=layerNorm)

        self.critic_1 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      layer_sizes,
                                      name='Critic1_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=layerNorm)
        self.critic_2 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      layer_sizes,
                                      name='Critic2_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=layerNorm)

        self.target_actor = ActorNetwork(act_lr,
                                         self.input_dims,
                                         self.n_actions,
                                         layer_sizes,
                                         name='TargetActor_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=layerNorm)
        self.target_critic_1 = CriticNetwork(crt_lr,
                                             self.input_dims,
                                             self.n_actions,
                                             layer_sizes,
                                             name='TargetCritic1_' + name,
                                             chkpt_dir=chkpt_dir,
                                             layerNorm=layerNorm)
        self.target_critic_2 = CriticNetwork(crt_lr,
                                             self.input_dims,
                                             self.n_actions,
                                             layer_sizes,
                                             name='TargetCritic2_' + name,
                                             chkpt_dir=chkpt_dir,
                                             layerNorm=layerNorm)

        self.noise = noise
        self.noise_targetAct = noise_targetAct
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.tensor(observation, dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)

        mu_prime = T.clamp(mu_prime, self.min_action, self.max_action)

        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        # done = T.tensor(done).to(self.critic_1.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device)
        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)

        target_actions = self.target_actor.forward(state_)
        target_actions = target_actions + \
            T.clamp(T.tensor(np.random.normal(
                scale=self.noise_targetAct)), -0.5, 0.5)
        target_actions = T.clamp(target_actions, self.min_action,
                                 self.max_action)

        q1_ = self.target_critic_1.forward(state_, target_actions).view(-1)
        q2_ = self.target_critic_2.forward(state_, target_actions).view(-1)
        # q1_[done] = 0.0   # In building context, the terminal state does not have 0 value
        # q2_[done] = 0.0
        critic_value_ = T.min(q1_, q2_)
        target = reward + self.gamma * critic_value_

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q1 = self.critic_1.forward(state, action).view(-1)
        q2 = self.critic_2.forward(state, action).view(-1)
        q1_loss = F.mse_loss(target, q1)
        q2_loss = F.mse_loss(target, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1
        # if self.learn_step_cntr % self.update_actor_iter != 0:
        #     return

        self.actor.optimizer.zero_grad()
        actor_q1_loss = self.critic_1.forward(
            state, self.actor.forward(state))  # can also use the mean
        # of actor_q1_loss and actor_q2_loss, but it would be slower and does not really matter
        actor_loss = -T.mean(actor_q1_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

        return critic_loss.item(), actor_loss.item()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        updated_actor = update_single_target_network_parameters(
            self.actor, self.target_actor, tau)
        updated_critic_1 = update_single_target_network_parameters(
            self.critic_1, self.target_critic_1, tau)
        updated_critic_2 = update_single_target_network_parameters(
            self.critic_2, self.target_critic_2, tau)

        self.target_actor.load_state_dict(updated_actor)
        self.target_critic_1.load_state_dict(updated_critic_1)
        self.target_critic_2.load_state_dict(updated_critic_2)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        # self.target_actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        # self.target_critic_1.save_checkpoint()
        # self.target_critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        # self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
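
The Polyak soft-update helper update_single_target_network_parameters is called in update_network_parameters() above but never shown. A minimal sketch of such a helper, assuming it returns a blended state dict that the caller then loads into the target network (an illustration, not this project's actual implementation):

def update_single_target_network_parameters(network, target_network, tau):
    # theta_target <- tau * theta + (1 - tau) * theta_target, applied to every
    # entry of the state dict (parameters and buffers alike)
    params = network.state_dict()
    target_params = target_network.state_dict()
    updated = {}
    for name in params:
        updated[name] = tau * params[name].clone() + \
            (1 - tau) * target_params[name].clone()
    return updated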
Esempio n. 14
0
    def __init__(self,
                 input_dims,
                 n_actions,
                 layer_sizes,
                 act_lr=0.00001,
                 crt_lr=0.0001,
                 tau=0.001,
                 gamma=0.99,
                 max_size=1000000,
                 batch_size=64,
                 update_actor_interval=2,
                 noise=0.1,
                 noise_targetAct=0.2,
                 chkpt_dir='tmp/td3',
                 name='td3',
                 layerNorm=True):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.tau = tau
        self.max_action = 1
        self.min_action = -1
        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=layerNorm)

        self.critic_1 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      layer_sizes,
                                      name='Critic1_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=layerNorm)
        self.critic_2 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      layer_sizes,
                                      name='Critic2_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=layerNorm)

        self.target_actor = ActorNetwork(act_lr,
                                         self.input_dims,
                                         self.n_actions,
                                         layer_sizes,
                                         name='TargetActor_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=layerNorm)
        self.target_critic_1 = CriticNetwork(crt_lr,
                                             self.input_dims,
                                             self.n_actions,
                                             layer_sizes,
                                             name='TargetCritic1_' + name,
                                             chkpt_dir=chkpt_dir,
                                             layerNorm=layerNorm)
        self.target_critic_2 = CriticNetwork(crt_lr,
                                             self.input_dims,
                                             self.n_actions,
                                             layer_sizes,
                                             name='TargetCritic2_' + name,
                                             chkpt_dir=chkpt_dir,
                                             layerNorm=layerNorm)

        self.noise = noise
        self.noise_targetAct = noise_targetAct
        self.update_network_parameters(tau=1)
Esempio n. 15
0
class CycleGanModel(BaseModel):
    def name(self):
        return 'TrainCycleGanModel'

    def initialize(self, args):
        BaseModel.initialize(self, args)
        self.input_A = self.Tensor(args.batchSize, 3, 1024, 256)
        self.input_B = self.Tensor(args.batchSize, 3, 1024, 256)

        self.fake_A_Buffer = ReplayBuffer()
        self.fake_B_Buffer = ReplayBuffer()

        self.netG_AtoB = networks.define_G(3, 3, 64, 'resnet_9blocks',
                                           'instance', False, args.init_type,
                                           self.gpu_ids)
        self.netG_BtoA = networks.define_G(3, 3, 64, 'resnet_9blocks',
                                           'instance', False, args.init_type,
                                           self.gpu_ids)
        self.netD_A = networks.define_D(3,
                                        64,
                                        'basic',
                                        norm='instance',
                                        use_sigmoid=False,
                                        gpu_ids=args.gpu_ids)
        self.netD_B = networks.define_D(3,
                                        64,
                                        'basic',
                                        norm='instance',
                                        use_sigmoid=False,
                                        gpu_ids=args.gpu_ids)

        self.netG_AtoB.apply(weights_init_normal)
        self.netG_BtoA.apply(weights_init_normal)
        self.netD_A.apply(weights_init_normal)
        self.netD_B.apply(weights_init_normal)

        checkpoint_AtoB_filename = 'netG_A2B.pth'
        checkpoint_BtoA_filename = 'netG_B2A.pth'

        checkpoint_D_A_filename = 'netD_A.pth'
        checkpoint_D_B_filename = 'netD_B.pth'

        checkpoint_path_AtoB = os.path.join(args.checkpoints_dir,
                                            checkpoint_AtoB_filename)
        checkpoint_path_BtoA = os.path.join(args.checkpoints_dir,
                                            checkpoint_BtoA_filename)

        checkpoint_path_D_A = os.path.join(args.checkpoints_dir,
                                           checkpoint_D_A_filename)
        checkpoint_path_D_B = os.path.join(args.checkpoints_dir,
                                           checkpoint_D_B_filename)

        # Load checkpoint
        # self.netG_AtoB.load_state_dict(torch.load(checkpoint_path_AtoB))
        # self.netG_BtoA.load_state_dict(torch.load(checkpoint_path_BtoA))
        # self.netD_A.load_state_dict(torch.load(checkpoint_path_D_A))
        # self.netD_B.load_state_dict(torch.load(checkpoint_path_D_B))

        # define loss
        # self.criterionGAN = networks.GANLoss().to(self.device)
        self.criterionGAN = torch.nn.MSELoss().cuda()
        self.criterionCycle = torch.nn.L1Loss().cuda()
        self.criterionIdentity = torch.nn.L1Loss().cuda()

        # init optimizer
        self.optimizer_G = torch.optim.Adam(itertools.chain(
            self.netG_AtoB.parameters(), self.netG_BtoA.parameters()),
                                            lr=0.0001,
                                            betas=(0.5, 0.999))
        self.optimizer_D_a = torch.optim.Adam(self.netD_A.parameters(),
                                              lr=0.0001,
                                              betas=(0.5, 0.999))
        self.optimizer_D_b = torch.optim.Adam(self.netD_B.parameters(),
                                              lr=0.0001,
                                              betas=(0.5, 0.999))

        self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_G,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
        self.lr_scheduler_D_a = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_D_a,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
        self.lr_scheduler_D_b = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_D_b,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)

    def set_input(self, input_real, input_fake):
        self.image_real_sizes = input_real['A_sizes']

        input_A = input_real['A']
        self.input_A.resize_(input_A.size()).copy_(input_A)
        self.image_real_paths = input_real['A_paths']

        self.size_real = (int(self.image_real_sizes[0]), int(self.image_real_sizes[1]))

        self.image_fake_sizes = input_fake['B_sizes']

        input_B = input_fake['B']
        self.input_B.resize_(input_B.size()).copy_(input_B)
        self.image_fake_paths = input_fake['B_paths']

        self.size_fake = (int(self.image_fake_sizes[0]), int(self.image_fake_sizes[1]))

    def train(self):
        real_A = Variable(self.input_A)
        real_B = Variable(self.input_B)
        target_real = Variable(self.Tensor(real_B.size(0), 1, 14,
                                           62).fill_(1.0),
                               requires_grad=False)
        target_fake = Variable(self.Tensor(real_B.size(0), 1, 14,
                                           62).fill_(0.0),
                               requires_grad=False)
        loss_gan = self.criterionGAN
        loss_cycle = self.criterionCycle
        loss_identity = self.criterionIdentity

        self.optimizer_G.zero_grad()

        i_b = self.netG_AtoB(real_B)
        loss_identity_B = loss_identity(i_b, real_B) * 0.5
        i_a = self.netG_BtoA(real_A)
        loss_identity_A = loss_identity(i_a, real_A) * 0.5

        fake_B = self.netG_AtoB(real_A)
        pred_fake = self.netD_B(fake_B)
        loss_gan_A2B = loss_gan(pred_fake, target_real)
        fake_A = self.netG_BtoA(real_B)
        pred_fake = self.netD_A(fake_A)
        loss_gan_B2A = loss_gan(pred_fake, target_real)

        recovered_a = self.netG_BtoA(fake_B)
        loss_cycle_A = loss_cycle(recovered_a, real_A) * 10.0
        recovered_b = self.netG_AtoB(fake_A)
        loss_cycle_B = loss_cycle(recovered_b, real_B) * 10.0

        loss_G = loss_identity_A + loss_identity_B + loss_gan_A2B + loss_gan_B2A + loss_cycle_A + loss_cycle_B
        loss_G.backward()

        self.optimizer_G.step()

        self.optimizer_D_a.zero_grad()

        pred_real = self.netD_A(real_A)
        loss_d_real = loss_gan(pred_real, target_real)
        fake_A = self.fake_A_Buffer.push_and_pop(fake_A)
        pred_fake = self.netD_A(fake_A.detach())
        loss_d_fake = loss_gan(pred_fake, target_fake)

        loss_d_a = (loss_d_real + loss_d_fake) * 0.5
        loss_d_a.backward()

        self.optimizer_D_a.step()

        self.optimizer_D_b.zero_grad()

        pred_real = self.netD_B(real_B)
        loss_d_real = loss_gan(pred_real, target_real)
        fake_B = self.fake_B_Buffer.push_and_pop(fake_B)
        pred_fake = self.netD_B(fake_B.detach())
        loss_d_fake = loss_gan(pred_fake, target_fake)

        loss_d_b = (loss_d_real + loss_d_fake) * 0.5
        loss_d_b.backward()

        self.optimizer_D_b.step()

        print(
            'Generator Total Loss : {a:.3f},   Generator Identity Loss : {b:.3f},   Generator GAN Loss : {c:.3f},   '
            'Generator Cycle Loss : {d:.3f}'.format(
                a=loss_G,
                b=loss_identity_A + loss_identity_B,
                c=loss_gan_A2B + loss_gan_B2A,
                d=loss_cycle_A + loss_cycle_B))
        print('Discriminator Loss : {a:.3f}'.format(a=loss_d_a + loss_d_b))

    def update_learning_rate(self):
        self.lr_scheduler_G.step()
        self.lr_scheduler_D_a.step()
        self.lr_scheduler_D_b.step()

    def save_checkpoint(self):
        torch.save(self.netG_AtoB.state_dict(), './checkpoints/netG_A2B.pth')
        torch.save(self.netG_BtoA.state_dict(), './checkpoints/netG_B2A.pth')
        torch.save(self.netD_A.state_dict(), './checkpoints/netD_A.pth')
        torch.save(self.netD_B.state_dict(), './checkpoints/netD_B.pth')

    def forward(self):
        self.real_A = Variable(self.input_A)
        self.fake_B = self.netG_AtoB(self.real_A)

    def get_image_paths(self):
        return self.image_real_paths, self.image_fake_paths

    def get_image_sizes(self):
        return self.size_real, self.size_fake

    def get_current_visuals(self):
        real_A = util.tensor2im(self.real_A.data)
        fake_B = util.tensor2im(self.fake_B.data)

        return OrderedDict([('original', real_A), ('restyled', fake_B)])
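
Both GAN examples rely on a ReplayBuffer that exposes push_and_pop, i.e. the classic CycleGAN image pool. A sketch of that buffer under common assumptions (a pool of 50 images and a 50% chance of returning an older fake); the defaults are assumptions, not taken from this repository:

import random
import torch

class ReplayBuffer:
    def __init__(self, max_size=50):
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, batch):
        # store new fakes and return a mix of new and previously generated
        # ones, which stabilises discriminator training
        to_return = []
        for element in batch.detach():
            element = element.unsqueeze(0)
            if len(self.data) < self.max_size:
                self.data.append(element)
                to_return.append(element)
            elif random.uniform(0, 1) > 0.5:
                i = random.randint(0, self.max_size - 1)
                to_return.append(self.data[i].clone())
                self.data[i] = element
            else:
                to_return.append(element)
        return torch.cat(to_return)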
Esempio n. 16
0
    def train(self, env):
        # Memory
        memory = ReplayBuffer(capacity=self.replay_size)

        # Training Loop
        total_numsteps = 0
        updates = 0
        # defaults so the logging below does not fail before the first update
        q1_loss = q2_loss = policy_loss = alpha_loss = 0.0

        for i_episode in itertools.count(1):
            episode_reward = 0
            episode_steps = 0
            done = False
            state = env.reset()

            while not done:
                if total_numsteps < self.start_steps:
                    action = env.action_space.sample()  # Sample random action
                else:
                    # Sample action from policy
                    action = self.select_action(state)

                if len(memory) > self.batch_size:
                    # Number of updates per step in environment
                    for i in range(self.updates_per_step):
                        # Update parameters of all the networks
                        q1_loss, q2_loss, policy_loss, alpha_loss = self.update_parameters(
                            memory, self.batch_size, updates)
                        updates += 1

                next_state, reward, done, _ = env.step(action)  # Step
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward

                if self.render:
                    env.render()

                # Ignore the "done" signal if it comes from hitting the time horizon.
                # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
                done = 0 if episode_steps == env._max_episode_steps else done

                memory.push(state, action, reward, next_state,
                            done)  # Append transition to memory

                state = next_state

            logger.info('UPDATE')
            logger.record_tabular('q1_loss', q1_loss)
            logger.record_tabular('q2_loss', q2_loss)
            logger.record_tabular('policy_loss', policy_loss)
            logger.record_tabular('alpha_loss', alpha_loss)
            logger.dump_tabular()

            logger.info('STATUS')
            logger.record_tabular('i_episode', i_episode)
            logger.record_tabular('episode_steps', episode_steps)
            logger.record_tabular('total_numsteps', total_numsteps)
            logger.record_tabular('episode_reward', episode_reward)
            logger.dump_tabular()

            if i_episode % 100 == 0:
                logger.info('SAVE')
                self.save_model('../saved/sac')

            if total_numsteps > self.num_steps:
                return
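
The done-masking above is the usual time-limit bootstrapping trick: a timeout is not a true terminal state, so the target should still bootstrap from the next state. Inside update_parameters (not shown) the masked flag typically enters the soft Q target roughly as in the sketch below; the function name and arguments are illustrative only, not this project's code.

def soft_q_target(reward, done, next_q_min, next_log_pi, gamma, alpha):
    # done == 0 for time-limit cutoffs, so those transitions still bootstrap
    return reward + (1.0 - done) * gamma * (next_q_min - alpha * next_log_pi)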
Esempio n. 17
0
class Agent(BaseAgent):
    def __init__(self, env, **kwargs):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.obs_space = env.observation_space
        self.action_space = env.action_space
        super(Agent, self).__init__(env.action_space)
        mask = kwargs.get('mask', 2)
        mask_hi = kwargs.get('mask_hi', 19)
        self.rule = kwargs.get('rule', 'c')
        self.danger = kwargs.get('danger', 0.9)
        self.bus_thres = kwargs.get('threshold', 0.1)
        self.max_low_len = kwargs.get('max_low_len', 19)
        self.converter = graphGoalConverter(env, mask, mask_hi, self.danger,
                                            self.device, self.rule)
        self.thermal_limit = env._thermal_limit_a
        self.convert_obs = self.converter.convert_obs
        self.action_dim = self.converter.n
        self.order_dim = len(self.converter.masked_sorted_sub)
        self.node_num = env.dim_topo
        self.delay_step = 2
        self.update_step = 0
        self.k_step = 1
        self.nheads = kwargs.get('head_number', 8)
        self.target_update = kwargs.get('target_update', 1)
        self.hard_target = kwargs.get('hard_target', False)
        self.use_order = (self.rule == 'o')

        self.gamma = kwargs.get('gamma', 0.99)
        self.tau = kwargs.get('tau', 1e-3)
        self.dropout = kwargs.get('dropout', 0.)
        self.memlen = kwargs.get('memlen', int(1e5))
        self.batch_size = kwargs.get('batch_size', 128)
        self.update_start = self.batch_size * 8
        self.actor_lr = kwargs.get('actor_lr', 5e-5)
        self.critic_lr = kwargs.get('critic_lr', 5e-5)
        self.embed_lr = kwargs.get('embed_lr', 5e-5)
        self.alpha_lr = kwargs.get('alpha_lr', 5e-5)

        self.state_dim = kwargs.get('state_dim', 128)
        self.n_history = kwargs.get('n_history', 6)
        self.input_dim = self.converter.n_feature * self.n_history

        print(
            f'N: {self.node_num}, O: {self.input_dim}, S: {self.state_dim}, A: {self.action_dim}, ({self.order_dim})'
        )
        print(kwargs)
        self.emb = EncoderLayer(self.input_dim, self.state_dim, self.nheads,
                                self.node_num, self.dropout).to(self.device)
        self.temb = EncoderLayer(self.input_dim, self.state_dim, self.nheads,
                                 self.node_num, self.dropout).to(self.device)
        self.Q = DoubleSoftQ(self.state_dim, self.nheads, self.node_num,
                             self.action_dim, self.use_order, self.order_dim,
                             self.dropout).to(self.device)
        self.tQ = DoubleSoftQ(self.state_dim, self.nheads, self.node_num,
                              self.action_dim, self.use_order, self.order_dim,
                              self.dropout).to(self.device)
        self.actor = Actor(self.state_dim, self.nheads, self.node_num,
                           self.action_dim, self.use_order, self.order_dim,
                           self.dropout).to(self.device)

        # copy parameters
        self.tQ.load_state_dict(self.Q.state_dict())
        self.temb.load_state_dict(self.emb.state_dict())

        # entropy
        self.target_entropy = -self.action_dim * 3 if not self.use_order else -3 * (
            self.action_dim + self.order_dim)
        self.log_alpha = torch.FloatTensor([-3]).to(self.device)
        self.log_alpha.requires_grad = True

        # optimizers
        self.Q.optimizer = optim.Adam(self.Q.parameters(), lr=self.critic_lr)
        self.actor.optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.emb.optimizer = optim.Adam(self.emb.parameters(),
                                        lr=self.embed_lr)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.alpha_lr)

        self.memory = ReplayBuffer(max_size=self.memlen)
        self.Q.eval()
        self.tQ.eval()
        self.emb.eval()
        self.temb.eval()
        self.actor.eval()

    def is_safe(self, obs):
        for ratio, limit in zip(obs.rho, self.thermal_limit):
            # Separate big lines from small lines (small lines get a stricter margin)
            if (limit < 400.00
                    and ratio >= self.danger - 0.05) or ratio >= self.danger:
                return False
        return True

    def load_mean_std(self, mean, std):
        self.state_mean = mean
        self.state_std = std.masked_fill(std < 1e-5, 1.)
        self.state_mean[0, sum(self.obs_space.shape[:20]):] = 0
        self.state_std[0, sum(self.action_space.shape[:20]):] = 1

    def state_normalize(self, s):
        s = (s - self.state_mean) / self.state_std
        return s

    def reset(self, obs):
        self.converter.last_topo = np.ones(self.node_num, dtype=int)
        self.topo = None
        self.goal = None
        self.goal_list = []
        self.low_len = -1
        self.adj = None
        self.stacked_obs = []
        self.low_actions = []
        self.save = False

    def cache_stat(self):
        cache = {
            'last_topo': self.converter.last_topo,
            'topo': self.topo,
            'goal': self.goal,
            'goal_list': self.goal_list,
            'low_len': self.low_len,
            'adj': self.adj,
            'stacked_obs': self.stacked_obs,
            'low_actions': self.low_actions,
            'save': self.save,
        }
        return cache

    def load_cache_stat(self, cache):
        self.converter.last_topo = cache['last_topo']
        self.topo = cache['topo']
        self.goal = cache['goal']
        self.goal_list = cache['goal_list']
        self.low_len = cache['low_len']
        self.adj = cache['adj']
        self.stacked_obs = cache['stacked_obs']
        self.low_actions = cache['low_actions']
        self.save = cache['save']

    def hash_goal(self, goal):
        hashed = ''
        for i in goal.view(-1):
            hashed += str(int(i.item()))
        return hashed

    def stack_obs(self, obs):
        obs_vect = obs.to_vect()
        obs_vect = torch.FloatTensor(obs_vect).unsqueeze(0)
        obs_vect, self.topo = self.convert_obs(self.state_normalize(obs_vect))
        if len(self.stacked_obs) == 0:
            for _ in range(self.n_history):
                self.stacked_obs.append(obs_vect)
        else:
            self.stacked_obs.pop(0)
            self.stacked_obs.append(obs_vect)
        self.adj = (torch.FloatTensor(obs.connectivity_matrix()) +
                    torch.eye(int(obs.dim_topo))).to(self.device)
        self.converter.last_topo = np.where(obs.topo_vect == -1,
                                            self.converter.last_topo,
                                            obs.topo_vect)

    def reconnect_line(self, obs):
        # if the agent can reconnect a powerline that is not attached to a
        # controllable substation, return that action; otherwise, return None
        dislines = np.where(obs.line_status == False)[0]
        for i in dislines:
            act = None
            if obs.time_next_maintenance[
                    i] != 0 and i in self.converter.lonely_lines:
                sub_or = self.action_space.line_or_to_subid[i]
                sub_ex = self.action_space.line_ex_to_subid[i]
                if obs.time_before_cooldown_sub[sub_or] == 0:
                    act = self.action_space(
                        {'set_bus': {
                            'lines_or_id': [(i, 1)]
                        }})
                if obs.time_before_cooldown_sub[sub_ex] == 0:
                    act = self.action_space(
                        {'set_bus': {
                            'lines_ex_id': [(i, 1)]
                        }})
                if obs.time_before_cooldown_line[i] == 0:
                    status = self.action_space.get_change_line_status_vect()
                    status[i] = True
                    act = self.action_space({'change_line_status': status})
                if act is not None:
                    return act
        return None

    def get_current_state(self):
        return torch.cat(self.stacked_obs + [self.topo], dim=-1)

    def act(self, obs, reward, done):
        sample = (reward is None)
        self.stack_obs(obs)
        is_safe = self.is_safe(obs)
        self.save = False

        # reconnect powerline when the powerline in uncontrollable substations is disconnected
        if False in obs.line_status:
            act = self.reconnect_line(obs)
            if act is not None:
                return act

        # generate goal if it is initial or previous goal has been reached
        if self.goal is None or (not is_safe and self.low_len == -1):
            goal, bus_goal, low_actions, order, Q1, Q2 = self.generate_goal(
                sample, obs, not sample)
            if len(low_actions) == 0:
                act = self.action_space()
                if self.goal is None:
                    self.update_goal(goal, bus_goal, low_actions, order, Q1,
                                     Q2)
                return self.action_space()
            self.update_goal(goal, bus_goal, low_actions, order, Q1, Q2)

        act = self.pick_low_action(obs)
        return act

    def pick_low_action(self, obs):
        # Grid is safe and there are no queued low-level actions: do nothing
        if self.is_safe(obs) and self.low_len == -1:
            act = self.action_space()
            return act

        # optimize low actions every step
        self.low_actions = self.optimize_low_actions(obs, self.low_actions)
        self.low_len += 1

        # queue became empty after optimization: do nothing
        if len(self.low_actions) == 0:
            act = self.action_space()
            self.low_len = -1

        # normally execute low action from low actions queue
        else:
            sub_id, new_topo = self.low_actions.pop(0)[:2]
            act = self.converter.convert_act(sub_id, new_topo, obs.topo_vect)

        # when the maximum number of low-level action steps is reached, reset
        if self.max_low_len <= self.low_len:
            self.low_len = -1
        return act

    def high_act(self, stacked_state, adj, sample=True):
        order, Q1, Q2 = None, 0, 0
        with torch.no_grad():
            # stacked_state # B, N, F
            stacked_t, stacked_x = stacked_state[...,
                                                 -1:], stacked_state[..., :-1]
            emb_input = stacked_x
            state = self.emb(emb_input, adj).detach()
            actor_input = [state, stacked_t.squeeze(-1)]
            if sample:
                action, std = self.actor.sample(actor_input, adj)
                if self.use_order:
                    action, order = action
                critic_input = action
                Q1, Q2 = self.Q(state, critic_input, adj, order)
                Q1, Q2 = Q1.detach()[0].item(), Q2.detach()[0].item()
                if self.use_order:
                    std, order_std = std
            else:
                action = self.actor.mean(actor_input, adj)
                if self.use_order:
                    action, order = action
        if order is not None: order = order.detach().cpu()
        return action.detach().cpu(), order, Q1, Q2

    def make_candidate_goal(self, stacked_state, adj, sample, obs):
        goal, order, Q1, Q2 = self.high_act(stacked_state, adj, sample)
        bus_goal = torch.zeros_like(goal).long()
        bus_goal[goal > self.bus_thres] = 1
        low_actions = self.converter.plan_act(
            bus_goal, obs.topo_vect, order[0] if order is not None else None)
        low_actions = self.optimize_low_actions(obs, low_actions)
        return goal, bus_goal, low_actions, order, Q1, Q2

    def generate_goal(self, sample, obs, nosave=False):
        stacked_state = self.get_current_state().to(self.device)
        adj = self.adj.unsqueeze(0)
        goal, bus_goal, low_actions, order, Q1, Q2 = self.make_candidate_goal(
            stacked_state, adj, sample, obs)
        return goal, bus_goal, low_actions, order, Q1, Q2

    def update_goal(self, goal, bus_goal, low_actions, order=None, Q1=0, Q2=0):
        self.order = order
        self.goal = goal
        self.bus_goal = bus_goal
        self.low_actions = low_actions
        self.low_len = 0
        self.save = True
        self.goal_list.append(self.hash_goal(bus_goal))

    def optimize_low_actions(self, obs, low_actions):
        # remove overlapped action
        optimized = []
        cooldown_list = obs.time_before_cooldown_sub
        if self.max_low_len != 1 and self.rule == 'c':
            low_actions = self.converter.heuristic_order(obs, low_actions)
        for low_act in low_actions:
            sub_id, sub_goal = low_act[:2]
            sub_goal, same = self.converter.inspect_act(
                sub_id, sub_goal, obs.topo_vect)
            if not same:
                optimized.append((sub_id, sub_goal, cooldown_list[sub_id]))

        # sort by cooldown_sub
        if self.max_low_len != 1 and self.rule != 'o':
            optimized = sorted(optimized, key=lambda x: x[2])

        # if current action has cooldown, then discard
        if len(optimized) > 0 and optimized[0][2] > 0:
            optimized = []
        return optimized

    def append_sample(self, s, m, a, r, s2, m2, d, order):
        if self.use_order:
            self.memory.append((s, m, a, r, s2, m2, int(d), order))
        else:
            self.memory.append((s, m, a, r, s2, m2, int(d)))

    def unpack_batch(self, batch):
        if self.use_order:
            states, adj, actions, rewards, states2, adj2, dones, orders = list(
                zip(*batch))
            orders = torch.cat(orders, 0)
        else:
            states, adj, actions, rewards, states2, adj2, dones = list(
                zip(*batch))
        states = torch.cat(states, 0)
        states2 = torch.cat(states2, 0)
        adj = torch.stack(adj, 0)
        adj2 = torch.stack(adj2, 0)
        actions = torch.cat(actions, 0)
        rewards = torch.FloatTensor(rewards).unsqueeze(1)
        dones = torch.FloatTensor(dones).unsqueeze(1)
        if self.use_order:
            return states.to(self.device), adj.to(self.device), actions.to(self.device), rewards.to(self.device), \
                states2.to(self.device), adj2.to(self.device), dones.to(self.device), orders.to(self.device)
        else:
            return states.to(self.device), adj.to(self.device), actions.to(self.device), \
                rewards.to(self.device), states2.to(self.device), adj2.to(self.device), dones.to(self.device)

    def update(self):
        self.update_step += 1
        batch = self.memory.sample(self.batch_size)
        orders = None
        if self.use_order:
            stacked_states, adj, actions, rewards, stacked_states2, adj2, dones, orders = self.unpack_batch(
                batch)
        else:
            stacked_states, adj, actions, rewards, stacked_states2, adj2, dones = self.unpack_batch(
                batch)

        self.Q.train()
        self.emb.train()
        self.actor.eval()

        # critic loss
        stacked_t, stacked_x = stacked_states[...,
                                              -1:], stacked_states[..., :-1]
        stacked2_t, stacked2_x = stacked_states2[..., -1:], stacked_states2[
            ..., :-1]
        emb_input = stacked_x
        emb_input2 = stacked2_x
        states = self.emb(emb_input, adj)
        states2 = self.emb(emb_input2, adj2)
        actor_input2 = [states2, stacked2_t.squeeze(-1)]
        with torch.no_grad():
            tstates2 = self.temb(emb_input2, adj2).detach()
            action2, log_pi2 = self.actor.rsample(actor_input2, adj2)
            order2 = None
            if self.use_order:
                action2, order2 = action2
                log_pi2 = log_pi2[0] + log_pi2[1]
            critic_input2 = action2
            targets = self.tQ.min_Q(tstates2, critic_input2, adj2,
                                    order2) - self.log_alpha.exp() * log_pi2

        targets = rewards + (1 - dones) * self.gamma * targets.detach()

        critic_input = actions
        predQ1, predQ2 = self.Q(states, critic_input, adj, orders)

        Q1_loss = F.mse_loss(predQ1, targets)
        Q2_loss = F.mse_loss(predQ2, targets)

        loss = Q1_loss + Q2_loss
        self.Q.optimizer.zero_grad()
        self.emb.optimizer.zero_grad()
        loss.backward()
        self.emb.optimizer.step()
        self.Q.optimizer.step()

        self.Q.eval()

        if self.update_step % self.delay_step == 0:
            # actor loss
            self.actor.train()
            states = self.emb(emb_input, adj)
            actor_input = [states, stacked_t.squeeze(-1)]
            action, log_pi = self.actor.rsample(actor_input, adj)
            order = None
            if self.use_order:
                action, order = action
                log_pi = log_pi[0] + log_pi[1]
            critic_input = action
            actor_loss = (
                self.log_alpha.exp() * log_pi -
                self.Q.min_Q(states, critic_input, adj, order)).mean()

            self.emb.optimizer.zero_grad()
            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.emb.optimizer.step()
            self.actor.optimizer.step()

            self.actor.eval()

            # target update
            if self.hard_target:
                self.tQ.load_state_dict(self.Q.state_dict())
                self.temb.load_state_dict(self.emb.state_dict())
            else:
                for tp, p in zip(self.tQ.parameters(), self.Q.parameters()):
                    tp.data.copy_(self.tau * p + (1 - self.tau) * tp)
                for tp, p in zip(self.temb.parameters(),
                                 self.emb.parameters()):
                    tp.data.copy_(self.tau * p + (1 - self.tau) * tp)

            # alpha loss
            alpha_loss = self.log_alpha * (-log_pi.detach() -
                                           self.target_entropy).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
        self.emb.eval()

        return predQ1.detach().mean().item(), predQ2.detach().mean().item()

    def save_model(self, path, name):
        torch.save(self.actor.state_dict(),
                   os.path.join(path, f'{name}_actor.pt'))
        torch.save(self.emb.state_dict(), os.path.join(path, f'{name}_emb.pt'))
        torch.save(self.Q.state_dict(), os.path.join(path, f'{name}_Q.pt'))

    def load_model(self, path, name=None):
        head = ''
        if name is not None:
            head = name + '_'
        self.actor.load_state_dict(
            torch.load(os.path.join(path, f'{head}actor.pt'),
                       map_location=self.device))
        self.emb.load_state_dict(
            torch.load(os.path.join(path, f'{head}emb.pt'),
                       map_location=self.device))
        self.Q.load_state_dict(
            torch.load(os.path.join(path, f'{head}Q.pt'),
                       map_location=self.device))
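
This agent only assumes a very small replay-buffer interface: append a transition tuple, sample a batch, and report the current length. A minimal sketch compatible with the calls above; the deque implementation is an assumption, not the project's actual buffer:

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=int(max_size))

    def append(self, transition):
        # transition is the tuple built in append_sample()
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)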
Esempio n. 18
0
lr_scheduler_D_A = torch.optim.lr_scheduler.LambdaLR(
    optimizer_D_A,
    lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)
lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(
    optimizer_D_B,
    lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step)

# Inputs & targets memory allocation
Tensor = torch.cuda.FloatTensor if opt.cuda else torch.Tensor
input_A = Tensor(opt.batchSize, opt.input_nc, opt.size, opt.size)
input_B = Tensor(opt.batchSize, opt.output_nc, opt.size, opt.size)
target_real = Variable(Tensor(opt.batchSize, 1).fill_(1.0),
                       requires_grad=False)
target_fake = Variable(Tensor(opt.batchSize, 1).fill_(0.0),
                       requires_grad=False)

fake_A_buffer = ReplayBuffer()
fake_Am_buffer = ReplayBuffer()
fake_B_buffer = ReplayBuffer()
fake_Bm_buffer = ReplayBuffer()

# Dataset loader
transforms_ = [
    transforms.Resize(int(opt.size * 1.12), Image.BICUBIC),
    transforms.RandomCrop(opt.size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
]
dataloader = DataLoader(Dataset(opt.dataroot,
                                transforms_=transforms_,
                                unaligned=True),
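
The schedulers here and in the GAN model examples wrap a LambdaLR helper object whose step method returns a multiplicative learning-rate factor. A sketch of the usual linear-decay helper with the argument order used above (n_epochs, offset, decay_start_epoch); the implementation itself is an assumption:

class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, \
            "decay must start before the end of training"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        # constant factor until decay_start_epoch, then linear decay to zero
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (
            self.n_epochs - self.decay_start_epoch)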
Esempio n. 19
0
class DDPGAgent():
    
    def __init__(self, state_size, action_size, num_agents):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        
        # Directory where to save the model
        self.model_dir = os.getcwd() + "/DDPG/saved_models"
        os.makedirs(self.model_dir, exist_ok=True)

    def step(self, states, actions, rewards, next_states, dones):
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])

        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
        
    def act(self, states, add_noise=True):
        states = torch.from_numpy(states).float().to(device)

        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        
        if add_noise:
            actions += self.noise.sample()
        
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # adds gradient clipping to stabilize learning
        self.critic_optimizer.step()
        
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        
    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
            
    def save_model(self):
        torch.save(
            self.actor_local.state_dict(), 
            os.path.join(self.model_dir, 'actor_params.pth')
        )
        torch.save(
            self.actor_optimizer.state_dict(), 
            os.path.join(self.model_dir, 'actor_optim_params.pth')
        )
        torch.save(
            self.critic_local.state_dict(), 
            os.path.join(self.model_dir, 'critic_params.pth')
        )
        torch.save(
            self.critic_optimizer.state_dict(), 
            os.path.join(self.model_dir, 'critic_optim_params.pth')
        )

    def load_model(self):
        """Loads weights from saved model."""
        self.actor_local.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'actor_params.pth'))
        )
        self.actor_optimizer.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'actor_optim_params.pth'))
        )
        self.critic_local.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'critic_params.pth'))
        )
        self.critic_optimizer.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'critic_optim_params.pth'))
        )
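
The DDPG agent above (and the TensorFlow agent further below) draws exploration noise from an OUNoise object. A sketch of a typical Ornstein-Uhlenbeck process with the same reset/sample interface; the mu, theta and sigma defaults are common choices, not values taken from this project:

import numpy as np

class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        # discretised dx_t = theta * (mu - x_t) + sigma * dW_t
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state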
Esempio n. 21
0
class GanModel(BaseModel):
    def name(self):
        return 'TrainGanModel'

    def initialize(self, args):
        BaseModel.initialize(self, args)
        self.input_B = self.Tensor(args.batchSize, 3, 1024, 256)
        self.input_C = self.Tensor(args.batchSize, 1, 1024, 256)

        self.fake_Buffer = ReplayBuffer()

        self.netG_BtoC = networks.define_G(3, 1, 64, 'unet_128', 'batch',
                                           False, args.init_type, self.gpu_ids)
        self.netD_C = networks.define_D(1,
                                        64,
                                        'basic',
                                        norm='batch',
                                        use_sigmoid=False,
                                        gpu_ids=args.gpu_ids)

        self.netG_BtoC.apply(weights_init_normal)
        self.netD_C.apply(weights_init_normal)

        checkpoint_BtoC_filename = 'netG_B2C.pth'
        checkpoint_D_C_filename = 'netD_C.pth'

        checkpoint_path_BtoC = os.path.join(args.checkpoints_dir,
                                            checkpoint_BtoC_filename)
        checkpoint_path_D_C = os.path.join(args.checkpoints_dir,
                                           checkpoint_D_C_filename)

        # Load checkpoint
        # self.netG_BtoC.load_state_dict(torch.load(checkpoint_path_BtoC))
        # self.netD_C.load_state_dict(torch.load(checkpoint_path_D_C))

        # define loss
        self.criterionGAN = torch.nn.MSELoss()
        self.criterionReconstruction = torch.nn.L1Loss().cuda()

        # init optimizer
        self.optimizer_G = torch.optim.Adam(self.netG_BtoC.parameters(),
                                            lr=0.0002,
                                            betas=(0.5, 0.999))
        self.optimizer_D = torch.optim.Adam(self.netD_C.parameters(),
                                            lr=0.0002,
                                            betas=(0.5, 0.999))

        self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_G,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
        self.lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_D,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)

    def set_input(self, input):
        self.image_syn_sizes = input['B_sizes']

        input_B = input['B']
        save_image(input_B[0], './input_check/rgb.jpg')
        self.input_B.resize_(input_B.size()).copy_(input_B)
        self.image_syn_paths = input['B_paths']

        self.size_syn = (int(self.image_syn_sizes[0]), int(self.image_syn_sizes[1]))

        self.image_dep_sizes = input['C_sizes']

        input_C = input['C']
        save_image(input_C[0], './input_check/depth.jpg')
        self.input_C.resize_(input_C.size()).copy_(input_C)
        self.image_dep_paths = input['C_paths']

        self.size_dep = (int(self.image_dep_sizes[0]), int(self.image_dep_sizes[1]))

    def train(self):
        syn_data = Variable(self.input_B)
        dep_data = Variable(self.input_C)
        target_real = Variable(self.Tensor(syn_data.size(0), 1, 14,
                                           62).fill_(1.0),
                               requires_grad=False)
        target_fake = Variable(self.Tensor(syn_data.size(0), 1, 14,
                                           62).fill_(0.0),
                               requires_grad=False)
        loss_gan = self.criterionGAN
        loss_rec = self.criterionReconstruction

        self.optimizer_G.zero_grad()

        fake_dep = self.netG_BtoC(syn_data)
        loss_r = loss_rec(fake_dep, dep_data)
        loss_g = loss_gan(self.netD_C(fake_dep), target_real)
        loss_G = 0.01 * loss_g + 0.99 * loss_r
        # loss_G = loss_g
        loss_G.backward()

        self.optimizer_G.step()

        self.optimizer_D.zero_grad()

        pred_real = self.netD_C(dep_data)
        loss_real = loss_gan(pred_real, target_real)
        fake_A = self.fake_Buffer.push_and_pop(fake_dep)
        pred_fake = self.netD_C(fake_A)
        loss_fake = loss_gan(pred_fake, target_fake)

        loss_D = (loss_real + loss_fake) * 0.5
        loss_D.backward()

        self.optimizer_D.step()

        print(
            'Generator Loss : {loss_G:.5f}, Discriminator Loss : {loss_D:.5f}'.
            format(loss_G=loss_G, loss_D=loss_D))

    def update_learning_rate(self):
        self.lr_scheduler_G.step()
        self.lr_scheduler_D.step()

    def save_checkpoint(self):
        torch.save(self.netG_BtoC.state_dict(), './checkpoints/netG_B2C.pth')
        torch.save(self.netD_C.state_dict(), './checkpoints/netD_C.pth')

    def forward(self):
        self.syn_data = Variable(self.input_B)
        self.pred_depth = self.netG_BtoC(self.syn_data)

    def get_image_paths(self):
        return self.image_syn_paths, self.image_dep_paths

    def get_image_sizes(self):
        return self.size_syn, self.size_dep

    def get_current_visuals(self):
        syn_d = util.tensor2im(self.syn_data.data)
        pred_d = util.tensor2im(self.pred_depth.data)

        return OrderedDict([('original', syn_d), ('depth', pred_d)])
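
Both GAN models call weights_init_normal on their networks. A sketch of the usual DCGAN-style initializer with that name; the 0.02 standard deviation is the conventional choice and an assumption here:

import torch.nn as nn

def weights_init_normal(m):
    # initialise conv weights from N(0, 0.02) and batch-norm affine
    # parameters around 1, as in the common CycleGAN recipe
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm2d') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0.0)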
Esempio n. 22
0
class Agent:
	def __init__(self, gamma=0.999, buffer_size=1e5, batch_size=1024,
                 episodes_nr=50000, tau=2e-2, gym_name='MountainCarContinuous-v0'):
         
		self.lr_actor = 5e-3				# learning rate for the actor
		self.lr_critic = 1e-3			# learning rate for the critic
		self.lr_decay = 1				# learning rate decay (per episode)
		self.l2_reg_actor = 1e-7			# L2 regularization factor for the actor
		self.l2_reg_critic = 1e-7		# L2 regularization factor for the critic
         
		self.num_episodes = episodes_nr		# number of episodes
		self.max_steps_ep = 10000	# default max number of steps per episode (unless env has a lower hardcoded limit)
		self.train_every = 1			# number of steps to run the policy (and collect experience) before updating network weights
		self.replay_memory_capacity = buffer_size	# capacity of experience replay memory
		
		self.batch_size = batch_size
		self.memory = ReplayBuffer(int(buffer_size))
		self.episodes_nr = episodes_nr
		self.gamma = gamma
		self.tau = tau
        
		self.env = gym.make(gym_name)
		assert(self.env.action_space.high == -self.env.action_space.low)
		self.action_range = self.env.action_space.high[0]
        
		self.action_dim = np.prod(np.array(self.env.action_space.shape))
		self.state_dim = np.prod(np.array(self.env.observation_space.shape))
        
		#self.noise = OUNoise(self.action_dim)
		self.action_range = self.env.action_space.high - self.env.action_space.low
		
		self.initial_noise_scale = 0.1	# scale of the exploration noise process (1.0 is the range of each action dimension)
		self.noise_decay = 1 #0.99		# decay rate (per episode) of the scale of the exploration noise process
		self.exploration_mu = 0.0	# mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
		self.exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
		self.exploration_sigma = 0.2	# sigma parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt

		self.noise = OUNoise(self.action_dim)
		
	def run(self):
		
		tf.reset_default_graph()
		
		state_ph = tf.placeholder(dtype=tf.float32, shape=[None,self.state_dim])
		action_ph = tf.placeholder(dtype=tf.float32, shape=[None,self.action_dim])
		reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
		next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,self.state_dim])
		is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation)
		
		# episode counter
		episodes = tf.Variable(0.0, trainable=False, name='episodes')
		episode_inc_op = episodes.assign_add(1)

        
		
		actions = Actor(state_ph, self.action_range, self.action_dim, "local").out
		target_actions = tf.stop_gradient(Actor(next_state_ph, self.action_range, self.action_dim, "target").out)
		
		q_det = Critic(action_ph, state_ph, "local", reuse=False).q
		q_inf = Critic(actions, state_ph, "local", reuse=True).q
		
		target_critic = tf.stop_gradient(Critic(target_actions, next_state_ph, "target").q)
		 
		actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor_local')
		slow_target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor_target')
		critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic_local')
		slow_target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic_target')
		
		update_targets_ops = []
		for i, slow_target_actor_var in enumerate(slow_target_actor_vars):
			update_slow_target_actor_op = slow_target_actor_var.assign(self.tau*actor_vars[i]+(1-self.tau)*slow_target_actor_var)
			update_targets_ops.append(update_slow_target_actor_op)

		for i, slow_target_var in enumerate(slow_target_critic_vars):
			update_slow_target_critic_op = slow_target_var.assign(self.tau*critic_vars[i]+(1-self.tau)*slow_target_var)
			update_targets_ops.append(update_slow_target_critic_op)

		update_slow_targets_op = tf.group(*update_targets_ops, name='update_slow_targets')
		
		targets = tf.expand_dims(reward_ph, 1) + tf.expand_dims(is_not_terminal_ph, 1) * self.gamma * target_critic
		
		td_errors = targets - q_det
		
		critic_loss = tf.reduce_mean(tf.square(td_errors))
		for var in critic_vars:
			if not 'bias' in var.name:
				critic_loss += self.l2_reg_critic * 0.5 * tf.nn.l2_loss(var)

		# critic optimizer
		critic_train_op = tf.train.AdamOptimizer(self.lr_critic*self.lr_decay**episodes).minimize(critic_loss)

		# actor loss function (mean Q-values under current policy with regularization)
		actor_loss = -1*tf.reduce_mean(q_inf)
		for var in actor_vars:
			if not 'bias' in var.name:
				actor_loss += self.l2_reg_actor * 0.5 * tf.nn.l2_loss(var)

		# actor optimizer
		# the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed)
		actor_train_op = tf.train.AdamOptimizer(self.lr_actor*self.lr_decay**episodes).minimize(actor_loss, var_list=actor_vars)

		# initialize session
		sess = tf.Session()	
		sess.run(tf.global_variables_initializer())
		
		
		total_steps = 0
		for ep in range(self.num_episodes):

			total_reward = 0
			steps_in_ep = 0
			
			#noise_process = np.zeros(self.action_dim)
			#noise_scale = (self.initial_noise_scale * self.noise_decay**ep) * self.action_range


			# Initial state
			observation = self.env.reset()
			if ep%1 == 0: self.env.render()
	

			for t in range(self.max_steps_ep):

				# choose action based on deterministic policy
				action_for_state, = sess.run(actions, feed_dict = {state_ph: observation[None]})

				# add temporally-correlated exploration noise to action (using an Ornstein-Uhlenbeck process)
				# print(action_for_state)
				#noise_process = self.exploration_theta*(self.exploration_mu - noise_process) + self.exploration_sigma*np.random.randn(self.action_dim)
				# print(noise_scale*noise_process)
				action_for_state += self.noise.sample() #noise_process #*noise_scale

				# take step
				next_observation, reward, done, _info = self.env.step(action_for_state)
				if ep%1 == 0: self.env.render()
				total_reward += reward

				self.memory.add_to_memory((observation, action_for_state, reward, next_observation, 0.0 if done else 1.0))

				# update network weights to fit a minibatch of experience
				if total_steps%self.train_every == 0 and self.memory.len() >= self.batch_size:

					# grab N (s,a,r,s') tuples from replay memory
					minibatch = self.memory.sample_from_memory(self.batch_size)

					# update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
					_, _ = sess.run([critic_train_op, actor_train_op], 
						feed_dict = {
							state_ph: np.asarray([elem[0] for elem in minibatch]),
							action_ph: np.asarray([elem[1] for elem in minibatch]),
							reward_ph: np.asarray([elem[2] for elem in minibatch]),
							next_state_ph: np.asarray([elem[3] for elem in minibatch]),
							is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])})

					# update slow actor and critic targets towards current actor and critic
					_ = sess.run(update_slow_targets_op)

				observation = next_observation
				total_steps += 1
				steps_in_ep += 1
	
				if done: 
					# Increment episode counter
					_ = sess.run(episode_inc_op)
					break
		
			print('Episode %2i, Reward: %7.3f, Steps: %i'%(ep,total_reward,steps_in_ep))

		self.env.close()
class Agent():
    def __init__(
        self,
        input_dims,
        n_actions,
        layer_sizes,
        act_lr=0.00003,
        crt_lr=0.0003,
        gamma=0.99,
        max_size=1000000,
        tau=0.005,
        batch_size=64,
        reward_scale=1,
        name='sac',
        chkpt_dir='tmp/ddpg',
        layerNorm=True,
    ):
        '''Higher reward scale means higher weight given to rewards rather than to entropy'''
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.input_dims = input_dims
        self.n_actions = n_actions
        # The env actions are scaled to [-1, 1]
        self.max_action = np.ones(self.n_actions)
        # env.action_space.high is not used here because it does not reflect the real action bounds
        self.layer_sizes = layer_sizes
        self.layerNorm = layerNorm

        self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions)

        self.actor = ActorNetwork(act_lr,
                                  self.input_dims,
                                  self.n_actions,
                                  self.max_action,
                                  fc_dims=self.layer_sizes,
                                  name='Actor_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=self.layerNorm)

        self.critic_1 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      self.layer_sizes,
                                      name='critic1_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=self.layerNorm)
        self.critic_2 = CriticNetwork(crt_lr,
                                      self.input_dims,
                                      self.n_actions,
                                      self.layer_sizes,
                                      name='critic2_' + name,
                                      chkpt_dir=chkpt_dir,
                                      layerNorm=self.layerNorm)

        self.value = ValueNetwork(crt_lr,
                                  self.input_dims,
                                  self.layer_sizes,
                                  name='value_' + name,
                                  chkpt_dir=chkpt_dir,
                                  layerNorm=self.layerNorm)
        self.target_value = ValueNetwork(crt_lr,
                                         self.input_dims,
                                         self.layer_sizes,
                                         name='target_value_' + name,
                                         chkpt_dir=chkpt_dir,
                                         layerNorm=self.layerNorm)

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        updated_value = update_single_target_network_parameters(
            self.value, self.target_value, tau)

        self.target_value.load_state_dict(updated_value)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        #        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        #        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        # Update the value network
        self.value.optimizer.zero_grad()

        value = self.value.forward(state).view(-1)

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        # Use the action from the current policy, rather than the one stored in the buffer
        q1_new_policy = self.critic_1.forward(state, actions).view(-1)
        q2_new_policy = self.critic_2.forward(state, actions).view(-1)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        value_target = critic_value - log_probs  # - log_probs is entropy

        value_loss = F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        # Update the critic network
        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        # action and state are from replay buffer generated by old policy
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)

        value_ = self.target_value.forward(state_).view(-1)
        # value_[done] = 0.0  # left disabled: in this environment the terminal state does not have zero value
        q_hat = self.scale * reward + self.gamma * value_

        critic_1_loss = F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        # Update the actor network
        self.actor.optimizer.zero_grad()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        # Use the action from the current policy, rather than the one stored in the buffer
        q1_new_policy = self.critic_1.forward(state, actions).view(-1)
        q2_new_policy = self.critic_2.forward(state, actions).view(-1)
        critic_value = T.min(q1_new_policy, q2_new_policy)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.update_network_parameters()

        return critic_loss.item(), actor_loss.item()
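update_network_parameters above delegates to update_single_target_network_parameters, which is imported from elsewhere and not shown in this example. A plausible sketch of such a helper, assuming it blends the two networks' parameters with the usual soft-update rule and returns the blended dict for load_state_dict (the actual implementation may differ):

def update_single_target_network_parameters(online_net, target_net, tau):
    # new_target = tau * online + (1 - tau) * target, returned as a dict
    # keyed like a state_dict (assumes the networks register no buffers)
    online_params = dict(online_net.named_parameters())
    target_params = dict(target_net.named_parameters())

    updated = {}
    for name in online_params:
        updated[name] = tau * online_params[name].clone() + \
            (1 - tau) * target_params[name].clone()
    return updated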
Esempio n. 25
0
class DDPG:
    def __init__(
        self,
        env,
        gamma=0.99,
        polyak=0.995,
        act_noise=0.1,
        render=False,
        batch_size=32,
        q_lr=1e-3,
        p_lr=1e-4,
        buffer_capacity=5000,
        max_episodes=100,
        save_path=None,
        load_path=None,
        print_freq=1,
        start_steps=10000,
        log_dir='logs/train',
        training=True,
    ):
        self.gamma = gamma
        self.polyak = polyak
        self.act_noise = act_noise
        self.render = render
        self.batch_size = batch_size
        self.p_lr = p_lr
        self.q_lr = q_lr
        self.max_episodes = max_episodes
        self.start_steps = start_steps
        self.actor, self.critic = create_actor_critic(
            env.observation_space.shape[0], env.action_space.shape[0],
            env.action_space.high)
        self.target_actor, self.target_critic = create_actor_critic(
            env.observation_space.shape[0], env.action_space.shape[0],
            env.action_space.high)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        self.env = env
        self.rewards = []
        self.print_freq = print_freq
        self.save_path = save_path

        if training:
            self.buffer = ReplayBuffer(buffer_capacity)
            self.actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.p_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.q_lr)
            self.summary_writer = tf.summary.create_file_writer(log_dir)
            self.mse = tf.keras.losses.MeanSquaredError()
        if load_path is not None:
            self.actor.load_weights(f'{load_path}/actor')
            self.critic.load_weights(f'{load_path}/critic')

    @tf.function
    def train_step(self, states, actions, targets):
        with tf.GradientTape() as tape:
            action_predictions = self.actor(states)
            q_values = self.critic([states, action_predictions])
            policy_loss = -tf.reduce_mean(q_values)
        actor_gradients = tape.gradient(policy_loss,
                                        self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))

        with tf.GradientTape() as tape:
            q_values = self.critic([states, actions])
            mse_loss = self.mse(q_values, targets)
        critic_gradients = tape.gradient(mse_loss,
                                         self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_gradients, self.critic.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('Policy Loss',
                              policy_loss,
                              step=self.critic_optimizer.iterations)
            tf.summary.scalar('MSE Loss',
                              mse_loss,
                              step=self.critic_optimizer.iterations)
            tf.summary.scalar('Estimated Q Value',
                              tf.reduce_mean(q_values),
                              step=self.critic_optimizer.iterations)

    def update(self):
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(
                self.batch_size)
            dones = dones.reshape(-1, 1)
            rewards = rewards.reshape(-1, 1)

            # Set the target for learning
            target_action_preds = self.target_actor(next_states)
            target_q_values = self.target_critic(
                [next_states, target_action_preds])
            targets = rewards + self.gamma * target_q_values * (1 - dones)

            # update critic by minimizing the MSE loss
            # update the actor policy using the sampled policy gradient
            self.train_step(states, actions, targets)

            # Update target networks
            polyak_average(self.actor.variables, self.target_actor.variables,
                           self.polyak)
            polyak_average(self.critic.variables, self.target_critic.variables,
                           self.polyak)

    def act(self, obs, noise=False):
        # Initialize a random process N for action exploration
        norm_dist = tf.random.normal(self.env.action_space.shape,
                                     stddev=self.act_noise)

        action = self.actor(np.expand_dims(obs, axis=0))
        action = np.clip(action.numpy() + (norm_dist.numpy() if noise else 0),
                         a_min=self.env.action_space.low,
                         a_max=self.env.action_space.high)
        return action

    def learn(self):
        mean_reward = None
        total_steps = 0
        overall_steps = 0
        for ep in range(self.max_episodes):
            if ep % self.print_freq == 0 and ep > 0:
                new_mean_reward = np.mean(self.rewards[-self.print_freq:])

                print(
                    "-------------------------------------------------------")
                print(
                    f"Mean {self.print_freq} Episode Reward: {new_mean_reward}"
                )
                print(f"Mean Steps: {total_steps / self.print_freq}")
                print(f"Total Episodes: {ep}")
                print(f"Total Steps: {overall_steps}")
                print(
                    "-------------------------------------------------------")

                total_steps = 0
                with self.summary_writer.as_default():
                    tf.summary.scalar(f'Mean {self.print_freq} Episode Reward',
                                      new_mean_reward,
                                      step=ep)

                # Model saving inspired by Open AI Baseline implementation
                if (mean_reward is None or new_mean_reward >= mean_reward
                    ) and self.save_path is not None:
                    print(
                        f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}"
                    )
                    print(f'Location: {self.save_path}')
                    mean_reward = new_mean_reward

                    self.actor.save_weights(f'{self.save_path}/actor')
                    self.critic.save_weights(f'{self.save_path}/critic')

            # Receive initial observation state s_1
            obs = self.env.reset()
            done = False
            episode_reward = 0
            ep_len = 0
            while not done:
                # Display the environment
                if self.render:
                    self.env.render()

                # Execute action and observe reward and observe new state
                if self.start_steps > 0:
                    self.start_steps -= 1
                    action = self.env.action_space.sample()
                else:
                    # Select action according to policy and exploration noise
                    action = self.act(obs, noise=True).flatten()
                new_obs, rew, done, info = self.env.step(action)
                new_obs = new_obs.flatten()
                episode_reward += rew

                # Store transition in R
                self.buffer.add((obs, action, rew, new_obs, done))

                # Perform a single learning step
                self.update()

                obs = new_obs
                ep_len += 1

            with self.summary_writer.as_default():
                tf.summary.scalar('Episode Reward', episode_reward, step=ep)

            self.rewards.append(episode_reward)
            total_steps += ep_len
            overall_steps += ep_len
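update() above relies on a polyak_average helper (and __init__ on create_actor_critic), both defined elsewhere in the original source. A minimal sketch of the soft-update helper under the convention used here, where polyak is the fraction of the target network that is kept:

def polyak_average(online_vars, target_vars, polyak):
    # target <- polyak * target + (1 - polyak) * online
    # Assumes both variable lists come from identically structured Keras models,
    # so the variables line up one-to-one.
    for online, target in zip(online_vars, target_vars):
        target.assign(polyak * target + (1.0 - polyak) * online)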
Esempio n. 26
0
    def initialize(self, args):
        BaseModel.initialize(self, args)
        self.input_A = self.Tensor(args.batchSize, 3, 1024, 256)
        self.input_B = self.Tensor(args.batchSize, 3, 1024, 256)

        self.fake_A_Buffer = ReplayBuffer()
        self.fake_B_Buffer = ReplayBuffer()

        self.netG_AtoB = networks.define_G(3, 3, 64, 'resnet_9blocks',
                                           'instance', False, args.init_type,
                                           self.gpu_ids)
        self.netG_BtoA = networks.define_G(3, 3, 64, 'resnet_9blocks',
                                           'instance', False, args.init_type,
                                           self.gpu_ids)
        self.netD_A = networks.define_D(3,
                                        64,
                                        'basic',
                                        norm='instance',
                                        use_sigmoid=False,
                                        gpu_ids=args.gpu_ids)
        self.netD_B = networks.define_D(3,
                                        64,
                                        'basic',
                                        norm='instance',
                                        use_sigmoid=False,
                                        gpu_ids=args.gpu_ids)

        self.netG_AtoB.apply(weights_init_normal)
        self.netG_BtoA.apply(weights_init_normal)
        self.netD_A.apply(weights_init_normal)
        self.netD_B.apply(weights_init_normal)

        checkpoint_AtoB_filename = 'netG_A2B.pth'
        checkpoint_BtoA_filename = 'netG_B2A.pth'

        checkpoint_D_A_filename = 'netD_A.pth'
        checkpoint_D_B_filename = 'netD_B.pth'

        checkpoint_path_AtoB = os.path.join(args.checkpoints_dir,
                                            checkpoint_AtoB_filename)
        checkpoint_path_BtoA = os.path.join(args.checkpoints_dir,
                                            checkpoint_BtoA_filename)

        checkpoint_path_D_A = os.path.join(args.checkpoints_dir,
                                           checkpoint_D_A_filename)
        checkpoint_path_D_B = os.path.join(args.checkpoints_dir,
                                           checkpoint_D_B_filename)

        # Load checkpoint
        # self.netG_AtoB.load_state_dict(torch.load(checkpoint_path_AtoB))
        # self.netG_BtoA.load_state_dict(torch.load(checkpoint_path_BtoA))
        # self.netD_A.load_state_dict(torch.load(checkpoint_path_D_A))
        # self.netD_B.load_state_dict(torch.load(checkpoint_path_D_B))

        # define loss
        # self.criterionGAN = networks.GANLoss().to(self.device)
        self.criterionGAN = torch.nn.MSELoss().cuda()
        self.criterionCycle = torch.nn.L1Loss().cuda()
        self.criterionIdentity = torch.nn.L1Loss().cuda()

        # init optimizer
        self.optimizer_G = torch.optim.Adam(itertools.chain(
            self.netG_AtoB.parameters(), self.netG_BtoA.parameters()),
                                            lr=0.0001,
                                            betas=(0.5, 0.999))
        self.optimizer_D_a = torch.optim.Adam(self.netD_A.parameters(),
                                              lr=0.0001,
                                              betas=(0.5, 0.999))
        self.optimizer_D_b = torch.optim.Adam(self.netD_B.parameters(),
                                              lr=0.0001,
                                              betas=(0.5, 0.999))

        self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_G,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
        self.lr_scheduler_D_a = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_D_a,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
        self.lr_scheduler_D_b = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer_D_b,
            lr_lambda=LambdaLR(args.n_epochs, args.epoch,
                               args.decay_epoch).step)
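The three schedulers above take LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step as their lr_lambda; that helper is not shown here. In CycleGAN-style training it normally keeps the learning rate constant until decay_epoch and then decays it linearly to zero at n_epochs; a sketch under that assumption (the example's actual class may differ):

class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        # offset shifts the epoch index when training is resumed mid-run
        assert (n_epochs - decay_start_epoch) > 0, "decay must start before training ends"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        # multiplicative factor applied to the base learning rate
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (
            self.n_epochs - self.decay_start_epoch)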
Esempio n. 27
0
def main():
    time_str = time.strftime("%Y%m%d-%H%M%S")
    print('time_str: ', time_str)

    exp_count = 0

    if args.experiment == 'a|s':
        direc_name_ = '_'.join([args.env, args.experiment])
    else:
        direc_name_ = '_'.join(
            [args.env, args.experiment, 'bp2VAE',
             str(args.bp2VAE)])

    direc_name_exist = True

    while direc_name_exist:
        exp_count += 1
        direc_name = '/'.join([direc_name_, str(exp_count)])
        direc_name_exist = os.path.exists(direc_name)

    try:
        os.makedirs(direc_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    if args.tensorboard_dir is None:
        logger = Logger('/'.join([direc_name, time_str]))
    else:
        logger = Logger(args.tensorboard_dir)

    env = gym.make(args.env)

    if args.wrapper:
        if args.video_dir is None:
            args.video_dir = '/'.join([direc_name, 'videos'])
        env = gym.wrappers.Monitor(env, args.video_dir, force=True)

    print('observation_space: ', env.observation_space)
    print('action_space: ', env.action_space)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.experiment == 'a|s':
        dim_x = env.observation_space.shape[0]
    elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' or \
            args.experiment == 'a|z(a_prev, s, s_next)':
        dim_x = args.z_dim

    policy = ActorCritic(input_size=dim_x,
                         hidden1_size=3 * dim_x,
                         hidden2_size=6 * dim_x,
                         action_size=env.action_space.n)

    if args.use_cuda:
        Tensor = torch.cuda.FloatTensor
        torch.cuda.manual_seed_all(args.seed)
        policy.cuda()
    else:
        Tensor = torch.FloatTensor

    policy_optimizer = optim.Adam(policy.parameters(), lr=args.policy_lr)

    if args.experiment != 'a|s':
        from util import ReplayBuffer, vae_loss_function

        dim_s = env.observation_space.shape[0]

        if args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)':
            from model import VAE
            vae = VAE(input_size=dim_s,
                      hidden1_size=3 * args.z_dim,
                      hidden2_size=args.z_dim)

        elif args.experiment == 'a|z(a_prev, s, s_next)':
            from model import CVAE
            vae = CVAE(input_size=dim_s,
                       class_size=1,
                       hidden1_size=3 * args.z_dim,
                       hidden2_size=args.z_dim)

        if args.use_cuda:
            vae.cuda()
        vae_optimizer = optim.Adam(vae.parameters(), lr=args.vae_lr)

        if args.experiment == 'a|z(s)':
            from util import Transition_S2S as Transition
        elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
            from util import Transition_S2SNext as Transition

        buffer = ReplayBuffer(args.buffer_capacity, Transition)

        update_vae = True

    if args.experiment == 'a|s':
        from util import Record_S
    elif args.experiment == 'a|z(s)':
        from util import Record_S2S
    elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
        from util import Record_S2SNext

    def train_actor_critic(n):
        saved_info = policy.saved_info

        R = 0
        cum_returns_ = []

        for r in policy.rewards[::-1]:
            R = r + args.gamma * R
            cum_returns_.insert(0, R)

        cum_returns = Tensor(cum_returns_)
        cum_returns = (cum_returns - cum_returns.mean()) \
                      / (cum_returns.std() + np.finfo(np.float32).eps)
        cum_returns = Variable(cum_returns, requires_grad=False).unsqueeze(1)

        batch_info = SavedInfo(*zip(*saved_info))
        batch_log_prob = torch.cat(batch_info.log_prob)
        batch_value = torch.cat(batch_info.value)

        batch_adv = cum_returns - batch_value
        policy_loss = -torch.sum(batch_log_prob * batch_adv)
        value_loss = F.smooth_l1_loss(batch_value,
                                      cum_returns,
                                      size_average=False)

        policy_optimizer.zero_grad()
        total_loss = policy_loss + value_loss
        total_loss.backward()
        policy_optimizer.step()

        if args.use_cuda:
            logger.scalar_summary('value_loss', value_loss.data.cpu()[0], n)
            logger.scalar_summary('policy_loss', policy_loss.data.cpu()[0], n)

            all_value_loss.append(value_loss.data.cpu()[0])
            all_policy_loss.append(policy_loss.data.cpu()[0])
        else:
            logger.scalar_summary('value_loss', value_loss.data[0], n)
            logger.scalar_summary('policy_loss', policy_loss.data[0], n)

            all_value_loss.append(value_loss.data[0])
            all_policy_loss.append(policy_loss.data[0])

        del policy.rewards[:]
        del policy.saved_info[:]

    if args.experiment != 'a|s':

        def train_vae(n):

            train_times = (n // args.vae_update_frequency -
                           1) * args.vae_update_times

            for i in range(args.vae_update_times):
                train_times += 1

                sample = buffer.sample(args.batch_size)
                batch = Transition(*zip(*sample))
                state_batch = torch.cat(batch.state)

                if args.experiment == 'a|z(s)':
                    recon_batch, mu, log_var = vae.forward(state_batch)

                    mse_loss, kl_loss = vae_loss_function(
                        recon_batch,
                        state_batch,
                        mu,
                        log_var,
                        logger,
                        train_times,
                        kl_discount=args.kl_weight,
                        mode=args.experiment)

                elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                    next_state_batch = Variable(torch.cat(batch.next_state),
                                                requires_grad=False)
                    predicted_batch, mu, log_var = vae.forward(state_batch)
                    mse_loss, kl_loss = vae_loss_function(
                        predicted_batch,
                        next_state_batch,
                        mu,
                        log_var,
                        logger,
                        train_times,
                        kl_discount=args.kl_weight,
                        mode=args.experiment)

                vae_loss = mse_loss + kl_loss

                vae_optimizer.zero_grad()
                vae_loss.backward()
                vae_optimizer.step()

                logger.scalar_summary('vae_loss', vae_loss.data[0],
                                      train_times)
                all_vae_loss.append(vae_loss.data[0])
                all_mse_loss.append(mse_loss.data[0])
                all_kl_loss.append(kl_loss.data[0])

    # To store cum_reward, value_loss and policy_loss from each episode
    all_cum_reward = []
    all_last_hundred_average = []
    all_value_loss = []
    all_policy_loss = []

    if args.experiment != 'a|s':
        # Store each vae_loss calculated
        all_vae_loss = []
        all_mse_loss = []
        all_kl_loss = []

    for episode in count(1):
        done = False
        state_ = torch.Tensor([env.reset()])
        cum_reward = 0

        if args.experiment == 'a|z(a_prev, s, s_next)':
            action = random.randint(0, 2)
            state_, reward, done, info = env.step(action)
            cum_reward += reward
            state_ = torch.Tensor([np.append(state_, action)])

        while not done:
            if args.experiment == 'a|s':
                state = Variable(state_, requires_grad=False)
            elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' \
                    or args.experiment == 'a|z(a_prev, s, s_next)':
                state_ = Variable(state_, requires_grad=False)
                mu, log_var = vae.encode(state_)

                if args.bp2VAE and update_vae:
                    state = vae.reparametrize(mu, log_var)
                else:
                    state = vae.reparametrize(mu, log_var).detach()

            action_ = policy.select_action(state)

            if args.use_cuda:
                action = action_.cpu()[0, 0]
            else:
                action = action_[0, 0]

            next_state_, reward, done, info = env.step(action)
            next_state_ = torch.Tensor([next_state_])
            cum_reward += reward

            if args.render:
                env.render()

            policy.rewards.append(reward)

            if args.experiment == 'a|z(s)':
                buffer.push(state_)
            elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                if not done:
                    buffer.push(state_, next_state_)

            if args.experiment == 'a|z(a_prev, s, s_next)':
                next_state_ = torch.cat(
                    [next_state_, torch.Tensor([[action]])], 1)

            state_ = next_state_

        train_actor_critic(episode)
        last_hundred_average = sum(all_cum_reward[-100:]) / 100

        logger.scalar_summary('cum_reward', cum_reward, episode)
        logger.scalar_summary('last_hundred_average', last_hundred_average,
                              episode)

        all_cum_reward.append(cum_reward)
        all_last_hundred_average.append(last_hundred_average)

        if args.experiment != 'a|s' and update_vae:
            if episode % args.vae_update_frequency == 0:
                assert len(buffer) >= args.batch_size
                train_vae(episode)

            # stop updating the VAE once its loss has plateaued
            if len(all_vae_loss) > 1000:
                if abs(
                        sum(all_vae_loss[-500:]) / 500 -
                        sum(all_vae_loss[-1000:-500]) /
                        500) < args.vae_update_threshold:
                    update_vae = False

        if episode % args.log_interval == 0:
            print(
                'Episode {}\tLast cum return: {:.2f}\t100-episode average cum return: {:.2f}'
                .format(episode, cum_reward, last_hundred_average))

        if episode > args.num_episodes:
            print("100-episodes average cum return is now {} and "
                  "the last episode runs to {} time steps!".format(
                      last_hundred_average, cum_reward))
            env.close()
            torch.save(policy, '/'.join([direc_name, 'model']))

            if args.experiment == 'a|s':
                record = Record_S(
                    policy_loss=all_policy_loss,
                    value_loss=all_value_loss,
                    cum_reward=all_cum_reward,
                    last_hundred_average=all_last_hundred_average)
            elif args.experiment == 'a|z(s)':
                record = Record_S2S(
                    policy_loss=all_policy_loss,
                    value_loss=all_value_loss,
                    cum_reward=all_cum_reward,
                    last_hundred_average=all_last_hundred_average,
                    mse_recon_loss=all_mse_loss,
                    kl_loss=all_kl_loss,
                    vae_loss=all_vae_loss)
            elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)':
                record = Record_S2SNext(
                    policy_loss=all_policy_loss,
                    value_loss=all_value_loss,
                    cum_reward=all_cum_reward,
                    last_hundred_average=all_last_hundred_average,
                    mse_pred_loss=all_mse_loss,
                    kl_loss=all_kl_loss,
                    vae_loss=all_vae_loss)

            pickle.dump(record, open('/'.join([direc_name, 'record']), 'wb'))

            break