#!/usr/bin/env python
# encoding: utf-8
"""
@author: Young
@license: (C) Copyright 2013-2017
@contact: [email protected]
@file: test_memory.py
@time: 2018/1/16 21:37
"""
import numpy as np

from agent.memory import Memory

M = Memory()

# Build a single dummy transition for a 24-dim state / 4-dim action space.
state = np.random.normal(size=24)
action = np.random.normal(size=4)
reward = np.random.normal()
done = bool(np.random.randint(0, 2))
next_state = state

# Fill the replay buffer with one million copies of the transition.
# Memory is called directly here, which assumes it implements __call__
# as an alias for the append() used by the Agent.
for _ in range(int(1e6)):
    M(state, action, reward, done, next_state)

# Draw a mini-batch and check the array shapes.
states, actions, rewards, next_states = M.sample(128)
print(states.shape)
print(actions.shape)
print(rewards.shape)
print(next_states.shape)
import numpy as np
import torch
import torch.nn.functional as nf
from copy import deepcopy
from torch.autograd import Variable
from torch.optim import Adam

# Project-local helpers and hyperparameters (module paths assumed; adjust to the repo layout).
from agent.memory import Memory
from agent.noise import Noise
from agent.networks import ActorNet, CriticNet
from agent.utils import soft_update, hard_update
from agent.config import MEMORY_SIZE, LEARNING_RATE, BATCH_SIZE, GAMMA, TAU


class Agent(object):
    def __init__(self, state_size, action_size, action_limits=1.):
        self.state_size = state_size
        self.action_size = action_size
        self.action_limits = action_limits

        self.memory = Memory(MEMORY_SIZE)
        self.noise = Noise(action_size)

        # Actor (policy) network and its target copy.
        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LEARNING_RATE)

        # Critic (Q-value) network and its target copy.
        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LEARNING_RATE)

    def append(self, *args):
        self.memory.append(*args)

    def sample(self, *args):
        return self.memory.sample(*args)

    def get_exploitation_policy(self, state):
        # Deterministic action from the target actor (used for evaluation).
        state = Variable(torch.from_numpy(np.float32(state)))
        action = self.target_actor(state).detach()
        return action.data.numpy()

    def get_exploration_policy(self, state):
        # Action from the current actor plus scaled exploration noise.
        state = Variable(torch.from_numpy(np.float32(state)))
        action = self.actor(state).detach()
        return action.data.numpy() + (self.noise() * self.action_limits)

    def optimize(self, batch_size=BATCH_SIZE):
        batch = self.sample(batch_size)
        state, action, reward, next_state = \
            [Variable(torch.from_numpy(i)) for i in batch]

        # Critic update: regress Q(s, a) towards r + gamma * Q'(s', pi'(s')).
        next_action = self.target_actor.forward(next_state).detach()
        next_value = torch.squeeze(
            self.target_critic(next_state, next_action).detach())
        target_value = reward + GAMMA * next_value
        value = torch.squeeze(self.critic(state, action))
        loss_critic = nf.smooth_l1_loss(value, target_value)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Actor update: maximise the critic's value of the actor's actions.
        policy_action = self.actor(state)
        loss_actor = -1 * torch.sum(self.critic(state, policy_action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Polyak-average the target networks towards the online networks.
        soft_update(self.target_actor, self.actor, TAU)
        soft_update(self.target_critic, self.critic, TAU)

    def restore_models(self, num_episode):
        self.actor.load_state_dict(
            torch.load("./Models/{}_actor.pkl".format(num_episode)))
        self.critic.load_state_dict(
            torch.load("./Models/{}_critic.pkl".format(num_episode)))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def save_models(self, num_episode):
        # Save under the same paths that restore_models reads from.
        torch.save(self.target_actor.state_dict(),
                   "./Models/{}_actor.pkl".format(num_episode))
        torch.save(self.target_critic.state_dict(),
                   "./Models/{}_critic.pkl".format(num_episode))
        print('Models saved successfully')
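A minimal training-loop sketch showing how the Agent above could be driven. The environment object `env` (Gym-style reset/step API), the module path `agent.agent`, and the constants MAX_EPISODES and MAX_STEPS are assumptions for illustration, not part of the original code; the state and action sizes match those in test_memory.py.

import numpy as np

from agent.agent import Agent  # module path assumed

MAX_EPISODES = 1000  # assumed value
MAX_STEPS = 1000     # assumed value

agent = Agent(state_size=24, action_size=4)

for episode in range(MAX_EPISODES):
    state = env.reset()  # `env` is an assumed Gym-style environment
    for step in range(MAX_STEPS):
        # Act with the noisy exploration policy and store the transition.
        action = agent.get_exploration_policy(np.float32(state))
        next_state, reward, done, _ = env.step(action)
        agent.append(state, action, reward, done, next_state)
        state = next_state

        # One gradient step on actor and critic per environment step
        # (in practice, wait until the buffer holds at least BATCH_SIZE transitions).
        agent.optimize()
        if done:
            break

    if episode % 100 == 0:
        agent.save_models(episode)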