import os
from datetime import datetime

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms

# Memory (the replay buffer) and Network (the convolutional Q-network) are
# project-local modules assumed to be imported from elsewhere in this repo.


class DQN:
    def __init__(self,
                 env_name,
                 env,
                 batch_size=32,
                 replay_memory_size=1e6,
                 history_size=4,
                 target_net_update_frequency=1e4,
                 gamma=0.99,
                 action_repeat=4,
                 lr=0.00025,
                 gradient_momentum=0.95,
                 initial_epsilon=1,
                 final_epsilon=0.1,
                 epsilon_decay_step=1e6,
                 warmup_step=5e4,
                 save_model_frequency=20,
                 eval_frequency=1):
        self.env_name = env_name
        self.env = env
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.history_size = history_size
        self.target_net_update_frequency = target_net_update_frequency
        self.gamma = gamma
        self.action_repeat = action_repeat
        self.lr = lr
        self.gradient_momentum = gradient_momentum
        self.initial_epsilon = initial_epsilon
        self.final_epsilon = final_epsilon
        self.epsilon_decay_step = epsilon_decay_step
        self.warmup_step = warmup_step
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency
        self.epsilon = self.initial_epsilon

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)

        self.memory = Memory(int(replay_memory_size), batch_size)
        self.net = Network(self.env.action_space.n).to(self.device)
        print(self.net)
        self.target_net = Network(self.env.action_space.n).to(self.device)
        self.update_model(self.target_net, self.net)
        self.opt = optim.RMSprop(self.net.parameters(),
                                 lr=self.lr,
                                 alpha=self.gradient_momentum)
        self.writer = SummaryWriter('./logs/DQN_{}_{}'.format(
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), self.env_name))
        self.loss_fn = F.mse_loss
        self.total_step = 0

    def select_action(self, state, is_test=False):
        # epsilon-greedy: a small fixed epsilon during evaluation, the decayed
        # epsilon during training
        epsilon = 0.05 if is_test else self.epsilon
        if np.random.uniform(0, 1) < epsilon:
            a = self.env.action_space.sample()
        else:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0).to(self.device)
            a = self.net(state).cpu().detach().numpy().argmax()
        return a

    def learn(self):
        batch = self.memory.sample()
        loss = self.compute_loss(batch)
        self.opt.zero_grad()
        loss.backward()
        # clip gradients to [-1, 1] as in the DQN paper
        for p in filter(lambda p: p.grad is not None, self.net.parameters()):
            p.grad.data.clamp_(min=-1, max=1)
        self.opt.step()

    def compute_loss(self, batch):
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
            batch.state, batch.action, batch.reward, batch.next_state, batch.done
        batch_state = torch.tensor(batch_state, dtype=torch.float).to(self.device)
        batch_action = torch.tensor(batch_action, dtype=torch.long).to(self.device)
        batch_reward = torch.tensor(batch_reward, dtype=torch.float).to(self.device)
        batch_next_state = torch.tensor(batch_next_state, dtype=torch.float).to(self.device)
        batch_mask = torch.tensor([not i for i in batch_done],
                                  dtype=torch.float).to(self.device)

        # Q(s, a) for the actions actually taken; gather keeps the computation
        # graph intact so gradients reach self.net
        pred_q = self.net(batch_state)
        q = pred_q.gather(1, batch_action.view(-1, 1)).squeeze(1)

        # bootstrapped target from the frozen target network; the mask only
        # zeroes the bootstrap term, not the reward, on terminal transitions
        with torch.no_grad():
            max_next_q = self.target_net(batch_next_state).max(dim=1)[0]
            target_q = batch_reward + self.gamma * max_next_q * batch_mask

        loss = self.loss_fn(q, target_q)
        # self.writer.add_scalar('loss', loss, self.total_step)
        return loss

    def train(self, epochs):
        for epoch in range(epochs):
            s = self.env.reset()
            s = self.preprocess(s)
            s = np.stack((s[0], s[0], s[0], s[0]), axis=0)
            while True:
                # self.env.render()
                if self.total_step < self.warmup_step:
                    a = self.env.action_space.sample()
                else:
                    a = self.select_action(s)
                s_, r, done, _ = self.env.step(a)
                s_ = self.preprocess(s_)
                s_ = np.stack((s[1], s[2], s[3], s_[0]), axis=0)

                # clip rewards to {-1, 0, 1}
                if r > 0:
                    r = 1
                elif r < 0:
                    r = -1

                self.memory.push(s, a, r, s_, done)
                s = s_
                self.total_step += 1

                # linear epsilon decay from initial_epsilon to final_epsilon
                self.epsilon = self.final_epsilon if self.total_step > self.epsilon_decay_step \
                    else self.initial_epsilon - (self.initial_epsilon - self.final_epsilon) \
                    * self.total_step / self.epsilon_decay_step

                if self.total_step % self.target_net_update_frequency == 0:
                    self.update_model(self.target_net, self.net)
                if len(self.memory) >= self.batch_size:
                    self.learn()
                if done:
                    break

            if (epoch + 1) % self.save_model_frequency == 0:
                self.save_model(self.net, 'model/model_DQN_{}'.format(epoch))
            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'reward', eval_r)
                self.writer.add_scalar('reward', eval_r, epoch)

    def preprocess(self, img):
        # convert an RGB frame to a single 84x84 grayscale channel
        img = Image.fromarray(img)
        img_preprocess = transforms.Compose([
            transforms.Grayscale(1),
            transforms.Resize((84, 84)),
            transforms.CenterCrop(84),
            transforms.ToTensor(),
        ])
        img = img_preprocess(img)
        return img.numpy()

    def save_model(self, model, path):
        p = os.path.dirname(path)
        if not os.path.exists(p):
            os.makedirs(p)
        torch.save(model.state_dict(), path)

    def load_model(self, model, path):
        model.load_state_dict(torch.load(path))

    def update_model(self, target_model, model, tau=1):
        # tau=1 copies the online network; tau<1 would be a soft (Polyak) update
        for target_param, param in zip(target_model.parameters(), model.parameters()):
            target_param.data.copy_(target_param.data * (1 - tau) + param.data * tau)

    def evaluate(self, epochs=3):
        total_r = 0
        for _ in range(epochs):
            s = self.env.reset()
            s = self.preprocess(s)
            s = np.stack((s[0], s[0], s[0], s[0]), axis=0)
            while True:
                a = self.select_action(s, is_test=True)
                s_, r, done, _ = self.env.step(a)
                s_ = self.preprocess(s_)
                s_ = np.stack((s[1], s[2], s[3], s_[0]), axis=0)
                total_r += r
                s = s_
                if done:
                    break
        return total_r / epochs
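# Usage sketch (not from the original file; the environment id and epoch count
# are assumptions): DQN expects a Gym environment with image observations and a
# discrete action space, e.g. an Atari game under the classic Gym API where
# env.step returns (obs, reward, done, info).
if __name__ == '__main__':
    import gym

    env = gym.make('PongNoFrameskip-v4')
    agent = DQN('Pong', env)
    agent.train(epochs=10000)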
# DeterministicActor, Critic, OUNoise, Memory, update_model and save_model are
# project-local helpers assumed to be imported from elsewhere in this repo.


class DDPG:
    def __init__(self,
                 env_name,
                 env,
                 actor_lr=3e-4,
                 critic_lr=3e-3,
                 gamma=0.99,
                 batch_size=32,
                 replay_memory_size=1e6,
                 is_test=False,
                 save_model_frequency=200,
                 eval_frequency=10):
        self.env_name = env_name
        self.env = env
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)
        if not is_test:
            self.writer = SummaryWriter('./logs/DDPG_{}'.format(self.env_name))
        self.loss_fn = F.mse_loss
        self.memory = Memory(int(replay_memory_size), batch_size)

        n_state, n_action = env.observation_space.shape[0], env.action_space.shape[0]
        # Ornstein-Uhlenbeck noise for exploration
        self.noise = OUNoise(n_action)

        self.actor = DeterministicActor(
            n_state, n_action,
            action_scale=int(env.action_space.high[0])).to(self.device)
        self.target_actor = DeterministicActor(
            n_state, n_action,
            action_scale=int(env.action_space.high[0])).to(self.device)
        update_model(self.target_actor, self.actor)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)

        self.critic = Critic(n_state + n_action).to(self.device)
        self.target_critic = Critic(n_state + n_action).to(self.device)
        update_model(self.target_critic, self.critic)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=self.critic_lr)

        print(self.actor)
        print(self.critic)

    def select_action(self, state, is_test=False):
        state = torch.tensor(state, dtype=torch.float).to(self.device)
        if is_test:
            a = self.actor(state)
        else:
            # add exploration noise during training
            a = self.actor(state) + torch.tensor(self.noise(), dtype=torch.float).to(self.device)
        a = a.clip(-self.actor.action_scale, self.actor.action_scale)
        return a.cpu().detach().numpy()

    def train(self, epochs):
        best_eval = -1e6
        for epoch in range(epochs):
            s = self.env.reset()
            policy_loss, critic_loss = 0, 0
            while True:
                self.env.render()
                a = self.select_action(s)
                s_, r, done, _ = self.env.step(a)
                self.memory.push(s, a, r, s_, done)
                if len(self.memory) > self.batch_size:
                    policy_loss, critic_loss = self.learn()
                s = s_
                if done:
                    break

            self.writer.add_scalar('loss/actor_loss', policy_loss, epoch)
            self.writer.add_scalar('loss/critic_loss', critic_loss, epoch)

            if (epoch + 1) % self.save_model_frequency == 0:
                save_model(self.critic, 'model/{}_model/critic_{}'.format(self.env_name, epoch))
                save_model(self.actor, 'model/{}_model/actor_{}'.format(self.env_name, epoch))
            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'evaluate reward', eval_r)
                self.writer.add_scalar('reward', eval_r, epoch)
                if eval_r > best_eval:
                    best_eval = eval_r
                    save_model(self.critic, 'model/{}_model/best_critic'.format(self.env_name))
                    save_model(self.actor, 'model/{}_model/best_actor'.format(self.env_name))

    def learn(self):
        batch = self.memory.sample()
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
            batch.state, batch.action, batch.reward, batch.next_state, batch.done
        batch_state = torch.tensor(batch_state, dtype=torch.float).to(self.device)
        batch_action = torch.tensor(batch_action, dtype=torch.float).reshape(self.batch_size, -1).to(self.device)
        batch_reward = torch.tensor(batch_reward, dtype=torch.float).reshape(self.batch_size, -1).to(self.device)
        batch_next_state = torch.tensor(batch_next_state, dtype=torch.float).to(self.device)
        batch_mask = torch.tensor([not i for i in batch_done],
                                  dtype=torch.float).reshape(self.batch_size, -1).to(self.device)

        # update critic: regress Q(s, a) toward r + gamma * Q'(s', mu'(s'))
        pred_q = self.critic(torch.cat((batch_state, batch_action), dim=-1))
        with torch.no_grad():
            next_action = self.target_actor(batch_next_state)
            next_q = self.target_critic(torch.cat((batch_next_state, next_action), dim=-1))
            target_q = batch_reward + batch_mask * self.gamma * next_q
        critic_loss = self.loss_fn(pred_q, target_q)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor: maximize Q(s, mu(s))
        policy_loss = -self.critic(torch.cat((batch_state, self.actor(batch_state)), dim=-1)).mean()
        self.actor_opt.zero_grad()
        policy_loss.backward()
        self.actor_opt.step()

        # soft-update target networks
        update_model(self.target_critic, self.critic, 0.05)
        update_model(self.target_actor, self.actor, 0.05)

        return policy_loss.item(), critic_loss.item()

    def evaluate(self, epochs=3, is_render=False):
        eval_r = 0
        for _ in range(epochs):
            s = self.env.reset()
            while True:
                if is_render:
                    self.env.render()
                with torch.no_grad():
                    a = self.select_action(s, is_test=True)
                s_, r, done, _ = self.env.step(a)
                s = s_
                eval_r += r
                if done:
                    break
        return eval_r / epochs
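# Usage sketch (not part of the original file; the environment id is an
# assumption): DDPG assumes a continuous-action Gym environment with a flat
# observation vector under the classic Gym API, e.g. Pendulum.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v0')
    agent = DDPG('Pendulum-v0', env)
    agent.train(epochs=1000)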
# SACGaussianActor, TwinCritic, ZFilter, Memory, update_model and save_model are
# project-local helpers assumed to be imported from elsewhere in this repo.


class SAC:
    def __init__(self,
                 env_name,
                 env,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 gamma=0.99,
                 batch_size=64,
                 replay_memory_size=1e6,
                 update_frequency=2,
                 warmup_step=1e3,
                 tau=0.005,
                 alpha=None,
                 is_test=False,
                 save_model_frequency=200,
                 eval_frequency=10,
                 save_log_frequency=10):
        self.env_name = env_name
        self.env = env
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.alpha_lr = alpha_lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_memory_size = replay_memory_size
        self.update_frequency = update_frequency
        self.warmup_step = warmup_step
        self.tau = tau
        self.save_model_frequency = save_model_frequency
        self.eval_frequency = eval_frequency
        self.save_log_frequency = save_log_frequency
        self.total_step = 0

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('Train on device:', self.device)
        if not is_test:
            self.writer = SummaryWriter('./logs/SAC_{}'.format(self.env_name))
        self.loss_fn = F.mse_loss
        self.memory = Memory(int(replay_memory_size), batch_size)

        n_state, n_action = env.observation_space.shape[0], env.action_space.shape[0]
        # running mean/std normalization of observations
        self.state_normalize = ZFilter(n_state)

        if alpha is None:
            # automatic entropy-temperature tuning: learn log_alpha so that the
            # policy entropy tracks the target entropy -dim(A)
            self.auto_tune_alpha = True
            self.target_entropy = -torch.prod(torch.Tensor(env.action_space.shape)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_opt = optim.Adam([self.log_alpha], lr=self.alpha_lr)
            print('Auto adjust alpha')
        else:
            self.auto_tune_alpha = False
            self.log_alpha = torch.log(torch.tensor(alpha, dtype=torch.float)).to(self.device)
            print('Fixed alpha')

        self.actor = SACGaussianActor(
            n_state, n_action, 256,
            action_scale=int(env.action_space.high[0])).to(self.device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=self.actor_lr)

        self.critic = TwinCritic(n_state + n_action, 256).to(self.device)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.target_critic = TwinCritic(n_state + n_action, 256).to(self.device)
        update_model(self.target_critic, self.critic)

        print(self.actor)
        print(self.critic)

    def select_action(self, state, is_test=False):
        state = torch.tensor(state, dtype=torch.float).to(self.device)
        # BUG: it would be better to unsqueeze the state here so its dimensions
        # match the batched input the actor sees during learning
        a, log_prob = self.actor.sample(state, is_test)
        return a.cpu().detach().numpy(), log_prob.cpu().detach().numpy()

    def train(self, epochs):
        best_eval = -1e6
        for epoch in range(epochs):
            s = self.env.reset()
            s = self.state_normalize(s)
            policy_loss, critic_loss, alpha_loss = 0, 0, 0
            while True:
                self.env.render()
                a, _ = self.select_action(s)
                s_, r, done, _ = self.env.step(a)
                s_ = self.state_normalize(s_)
                self.memory.push(s, a, r, s_, done)
                self.total_step += 1
                if len(self.memory) > self.batch_size and self.total_step > self.warmup_step:
                    policy_loss, critic_loss, alpha_loss = self.learn()
                s = s_
                if done:
                    break

            if (epoch + 1) % self.save_log_frequency == 0:
                self.writer.add_scalar('loss/critic_loss', critic_loss, self.total_step)
                self.writer.add_scalar('loss/policy_loss', policy_loss, self.total_step)
                self.writer.add_scalar('alpha', self.log_alpha.exp().item(), self.total_step)
                self.writer.add_scalar('loss/alpha_loss', alpha_loss, self.total_step)
            if (epoch + 1) % self.save_model_frequency == 0:
                save_model(self.critic, 'model/{}_model/critic_{}'.format(self.env_name, epoch))
                save_model(self.actor, 'model/{}_model/actor_{}'.format(self.env_name, epoch))
                ZFilter.save(self.state_normalize, 'model/{}_model/rs_{}'.format(self.env_name, epoch))
            if (epoch + 1) % self.eval_frequency == 0:
                eval_r = self.evaluate()
                print('epoch', epoch, 'evaluate reward', eval_r)
                self.writer.add_scalar('reward', eval_r, self.total_step)
                if eval_r > best_eval:
                    best_eval = eval_r
                    save_model(self.critic, 'model/{}_model/best_critic'.format(self.env_name))
                    save_model(self.actor, 'model/{}_model/best_actor'.format(self.env_name))
                    ZFilter.save(self.state_normalize, 'model/{}_model/best_rs'.format(self.env_name))

    def learn(self):
        batch = self.memory.sample()
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
            batch.state, batch.action, batch.reward, batch.next_state, batch.done
        batch_state = torch.tensor(batch_state, dtype=torch.float).to(self.device)
        batch_action = torch.tensor(batch_action, dtype=torch.float).reshape(self.batch_size, -1).to(self.device)
        batch_reward = torch.tensor(batch_reward, dtype=torch.float).reshape(self.batch_size, -1).to(self.device)
        batch_next_state = torch.tensor(batch_next_state, dtype=torch.float).to(self.device)
        batch_mask = torch.tensor([not i for i in batch_done],
                                  dtype=torch.float).reshape(self.batch_size, -1).to(self.device)

        alpha = self.log_alpha.exp()

        # update critic: soft Bellman backup using the minimum of the twin target Qs
        with torch.no_grad():
            next_action, next_log_prob = self.actor.sample(batch_next_state)
            next_log_prob = next_log_prob.sum(1, keepdim=True)
            next_input = torch.cat([batch_next_state, next_action], dim=-1)
            target_q1, target_q2 = self.target_critic(next_input)
            target_q = batch_reward + batch_mask * self.gamma * (
                torch.min(target_q1, target_q2) - alpha * next_log_prob)
        q1, q2 = self.critic(torch.cat([batch_state, batch_action], dim=-1))
        critic_loss = self.loss_fn(q1, target_q) + self.loss_fn(q2, target_q)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor: maximize E[min(Q1, Q2) - alpha * log pi]
        batch_pi, batch_pi_log_prob = self.actor.sample(batch_state)
        q1, q2 = self.critic(torch.cat([batch_state, batch_pi], dim=-1))
        batch_pi_log_prob = batch_pi_log_prob.sum(1, keepdim=True)
        policy_loss = (alpha * batch_pi_log_prob - torch.min(q1, q2)).mean()
        self.actor_opt.zero_grad()
        policy_loss.backward()
        self.actor_opt.step()

        # update the temperature alpha toward the target entropy
        if self.auto_tune_alpha:
            alpha_loss = -(self.log_alpha *
                           (batch_pi_log_prob + self.target_entropy).detach()).mean()
            self.alpha_opt.zero_grad()
            alpha_loss.backward()
            self.alpha_opt.step()
        else:
            alpha_loss = torch.tensor(0)

        # delayed soft update of the target critic
        if (self.total_step + 1) % self.update_frequency == 0:
            update_model(self.target_critic, self.critic, self.tau)

        return policy_loss.item(), critic_loss.item(), alpha_loss.item()

    def evaluate(self, epochs=3, is_render=False):
        eval_r = 0
        for _ in range(epochs):
            s = self.env.reset()
            s = self.state_normalize(s, update=False)
            while True:
                if is_render:
                    self.env.render()
                a, _ = self.select_action(s, is_test=True)
                s_, r, done, _ = self.env.step(a)
                s_ = self.state_normalize(s_, update=False)
                s = s_
                eval_r += r
                if done:
                    break
        return eval_r / epochs
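# Usage sketch (not part of the original file; the environment id is an
# assumption): SAC targets continuous-control tasks under the classic Gym API;
# leaving alpha=None enables automatic entropy tuning.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v0')
    agent = SAC('Pendulum-v0', env, alpha=None)
    agent.train(epochs=1000)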
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size):
    """Roll out `policy` in `env` until at least `min_batch_size` steps are
    collected, then return (or put on `queue`) the filled Memory and a dict of
    episode statistics. `Memory` and `tensor` are project-local helpers."""
    if pid > 0:
        # give each worker process its own random seeds
        torch.manual_seed(torch.randint(0, 5000, (1,)) * pid)
        if hasattr(env, 'np_random'):
            env.np_random.seed(env.np_random.randint(5000) * pid)
        if hasattr(env, 'env') and hasattr(env.env, 'np_random'):
            env.env.np_random.seed(env.env.np_random.randint(5000) * pid)

    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    # deterministic action: the mean of the policy distribution
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # per-episode stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
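# Usage sketch (an assumption about how this worker is usually driven, not code
# from the original file): spawn several worker processes that each call
# collect_samples with a shared multiprocessing queue, collect the main
# process's own share directly, then merge the results. `env_factory`, `policy`,
# and the `Memory.append` merge method are assumed project-local pieces.
import torch.multiprocessing as multiprocessing


def collect_samples_parallel(env_factory, policy, num_workers, min_batch_size,
                             running_state=None):
    queue = multiprocessing.Queue()
    per_worker_batch = min_batch_size // num_workers
    workers = []
    for pid in range(1, num_workers):
        args = (pid, queue, env_factory(), policy, None, False, False,
                running_state, per_worker_batch)
        workers.append(multiprocessing.Process(target=collect_samples, args=args))
    for w in workers:
        w.start()

    # pid 0 runs in the main process and returns directly instead of using the queue
    memory, log = collect_samples(0, None, env_factory(), policy, None, False,
                                  False, running_state, per_worker_batch)

    worker_logs = [None] * len(workers)
    for _ in workers:
        worker_pid, worker_memory, worker_log = queue.get()
        memory.append(worker_memory)  # assumed Memory API for merging buffers
        worker_logs[worker_pid - 1] = worker_log
    for w in workers:
        w.join()
    return memory, log, worker_logs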