Example 1
    def compute(self, config, budget, working_directory, *args, **kwargs):
            """
            Simple example for a compute function using a feed forward network.
            It is trained on the MNIST dataset.
            The input parameter "config" (dictionary) contains the sampled
            configurations passed by the bohb optimizer
            """
            env = ContinuousCartPoleEnv(reward_function=smooth_reward)
            state_dim = env.observation_space.shape[0]
            # Try to ensure determinism
            ############################
            torch.manual_seed(config['seed'])
            env.seed(config['seed'])
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            ############################
            # conf dictionary to control the training run
            conf = {'lr':config['lr'], 'bs':64, 'loss':nn.MSELoss(),
                    'hidden_dim':config['hidden_dim'],
                    'mem_size':50000, 'activation':config['activation'],
                    'epsilon':config['epsilon'],
                    'eps_scheduler':'exp', 'n_episodes':budget,
                    'dropout_rate': config['dropout_rate'], 'n_cycles': 1,
                    'decay_rate': config['decay_rate']
                  }
            ############################
            # create dqn object and train it
            dqn = DQN(state_dim, config['action_dim'],
                      gamma=config['gamma'], conf=conf)
            time_steps = 1000
            stats = dqn.train(int(budget), time_steps, env, conf)
            # plot_episode_stats(stats, noshow=True)
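            # Evaluate the trained agent for 5 episodes and average the collected reward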
            final_reward = 0
            for _ in range(5):
                s = env.reset()
                for _ in range(time_steps):
                    # env.render()
                    action = dqn.get_action(s, 0.)
                    s, r, d, _ = env.step(dqn.action.act(action))
                    final_reward += r
                    if d:
                        break
            env.close()
            ############################
            return ({
                     # remember: HpBandSter always minimizes!
                    'loss': - (final_reward / 5),
                    'info': {'max_len_train': max(stats.episode_lengths),
                             'max_reward_train': max(stats.episode_rewards),
                             'avg_final': (final_reward / 5) }
            })
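
For reference, the compute method above follows HpBandSter's Worker interface. A minimal sketch of the surrounding worker and a matching search space is shown below, assuming the ConfigSpace package; the class name DQNWorker and all value ranges are illustrative assumptions, only the hyperparameter keys are taken from the config lookups above.

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
from hpbandster.core.worker import Worker

class DQNWorker(Worker):
    # compute(self, config, budget, working_directory, *args, **kwargs) as above

    @staticmethod
    def get_configspace():
        # Illustrative search space covering the keys read from config in compute()
        cs = CS.ConfigurationSpace()
        cs.add_hyperparameters([
            CSH.UniformFloatHyperparameter('lr', lower=1e-5, upper=1e-1, log=True),
            CSH.UniformIntegerHyperparameter('hidden_dim', lower=32, upper=512),
            CSH.CategoricalHyperparameter('activation', ['relu', 'tanh']),
            CSH.UniformFloatHyperparameter('epsilon', lower=0.05, upper=1.0),
            CSH.UniformFloatHyperparameter('dropout_rate', lower=0.0, upper=0.5),
            CSH.UniformFloatHyperparameter('decay_rate', lower=0.90, upper=0.999),
            CSH.UniformFloatHyperparameter('gamma', lower=0.90, upper=0.999),
            CSH.Constant('action_dim', 11),
            CSH.Constant('seed', 42),
        ])
        return cs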
Example 2
        # When the episode is done, record its stats and stop
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            rewards_list.append(total_reward)
            steps_list.append(t)
            break
    print("Total reward in this episode: ", total_reward)

print("")
print("========== RESULTS ==========")
print("=============================")
print("Total number of episodes: ", num_episodes)
print("Average reward of all episodes: ", np.average(rewards_list))
print("")

env.close()

############################
# Plots
############################
'''
# summarize history for loss
fig1 = plt.figure()
plt.plot(loss_list)
plt.title('Model Loss')
plt.ylabel('loss')
plt.xlabel('steps')
#plt.show()
fig1.savefig('model_loss.png')
'''
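
The rewards gathered in rewards_list above could be summarized the same way; a minimal sketch mirroring the commented-out loss plot (the output file name is illustrative):

# summarize per-episode rewards collected in rewards_list
fig2 = plt.figure()
plt.plot(rewards_list)
plt.title('Episode Reward')
plt.ylabel('total reward')
plt.xlabel('episode')
fig2.savefig('episode_reward.png')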