import numpy as np
from gym.envs.classic_control import CartPoleEnv  # classic Gym API: reset() returns obs, step() returns a 4-tuple


class CartPole:
    def __init__(self, gravity):
        self.dim = 5  # 4 policy weights + 1 bias
        self.env = CartPoleEnv()
        self.env.gravity = gravity

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def action(self, observation, x):
        x = x * 10 - 5  # rescale parameters from [0, 1] to [-5, 5]
        w = x[:4]
        b = x[4]
        # Linear policy over the 4D observation, squashed to a binary action
        return int(self.sigmoid(np.sum(observation * w) + b) > 0.5)

    def fitness(self, x):
        fitness = 0
        observation = self.env.reset()
        for t in range(200):
            action = self.action(observation, x)
            observation, reward, done, info = self.env.step(action)
            fitness += reward
            if done:
                break
        # Negate the episode return so that lower fitness is better (minimisation)
        return -fitness

    def __del__(self):
        self.env.close()
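As a quick sanity check, the negated-return fitness can be fed to any black-box optimiser over the 5-dimensional unit cube; the snippet below is a minimal sketch using plain random search (the evaluation budget and the standard 9.8 gravity value are illustrative choices, not part of the original class).

# Minimal sketch: random search over the 5D parameter cube (illustrative only)
import numpy as np

problem = CartPole(gravity=9.8)
best_x, best_f = None, float('inf')
for _ in range(100):  # small, arbitrary evaluation budget
    x = np.random.rand(problem.dim)  # candidate parameters in [0, 1]^5
    f = problem.fitness(x)  # negative episode return, so lower is better
    if f < best_f:
        best_x, best_f = x, f
print('best episode return:', -best_f)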
        ppo_update(agent, policy_trajectories, agent_optimiser, args.ppo_clip, epoch,
                   args.value_loss_coeff, args.entropy_loss_coeff)

    # Evaluate agent and plot metrics
    if step % args.evaluation_interval == 0:
        metrics['test_steps'].append(step)
        metrics['test_returns'].append(evaluate_agent(agent, args.evaluation_episodes, seed=args.seed))
        lineplot(metrics['test_steps'], metrics['test_returns'], 'test_returns')
        if args.imitation != 'BC':
            lineplot(metrics['train_steps'], metrics['train_returns'], 'train_returns')


if args.save_trajectories:
    # Store trajectories from agent after training
    _, trajectories = evaluate_agent(agent, args.evaluation_episodes, return_trajectories=True, seed=args.seed)
    torch.save(trajectories, os.path.join('results', 'trajectories.pth'))

# Save agent and metrics
torch.save(agent.state_dict(), os.path.join('results', 'agent.pth'))
if args.imitation in ['AIRL', 'GAIL']:
    torch.save(discriminator.state_dict(), os.path.join('results', 'discriminator.pth'))
torch.save(metrics, os.path.join('results', 'metrics.pth'))

env.close()
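For completeness, the saved artefacts can be reloaded later for evaluation or plotting; the sketch below assumes the same results directory, and the agent class name and constructor arguments (ActorCritic, state_size, action_size) are hypothetical placeholders that must match whatever architecture was actually saved.

# Minimal reload sketch; ActorCritic and its constructor arguments are assumptions,
# not part of the original training script.
import os
import torch

metrics = torch.load(os.path.join('results', 'metrics.pth'))
trajectories = torch.load(os.path.join('results', 'trajectories.pth'))

agent = ActorCritic(state_size, action_size)  # hypothetical constructor
agent.load_state_dict(torch.load(os.path.join('results', 'agent.pth')))
agent.eval()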