Example #1
            else:
                # Add experience to memory
                memory.add((state, action, reward, next_state))
                state = next_state
                t += 1
            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            states = np.array([each[0] for each in batch])
            actions = np.array([each[1] for each in batch])
            rewards = np.array([each[2] for each in batch])
            next_states = np.array([each[3] for each in batch])
            # Train network

            loss = agent.learn(states, actions, rewards, gamma, next_states)

        test_rewards_list.extend(
            test_agent(agent, env, test_max_steps=convergence_reward + 25))
        cur_compute_len = min(100, len(test_rewards_list))
        mean_reward = np.mean(test_rewards_list[len(test_rewards_list) -
                                                cur_compute_len:])
        print(
            'Episode: {}'.format(ep),
            'Mean test reward: {:.1f}'.format(mean_reward),
        )
        if mean_reward > convergence_reward:
            print(ep, "solved")
            break
    agent.save(model_name + ".h5")
    np.save(model_name + "_train_rewards.npy", train_rewards_list)
    np.save(model_name + "_test_rewards.npy", test_rewards_list)
Example #2
                 device=device,
                 use_boosting=use_boosting,
                 use_double_dqn=use_double_dqn)
 if conditional_update:
     print("comparing old and new weights...")
     w_adv_last_dict_after = copy.deepcopy(
         net.fc2_adv.state_dict())
     w_val_last_dict_after = copy.deepcopy(
         net.fc2_val.state_dict())
     test_agent = copy.deepcopy(agent)
     # test original
     test_agent.dqn_model.fc2_adv.load_state_dict(
         w_adv_last_dict_before)
     test_agent.dqn_model.fc2_val.load_state_dict(
         w_val_last_dict_before)
     before_reward = utils.test_agent(
         test_env, test_agent)
     # test new
     test_agent.dqn_model.fc2_adv.load_state_dict(
         w_adv_last_dict_after)
     test_agent.dqn_model.fc2_val.load_state_dict(
         w_val_last_dict_after)
     after_reward = utils.test_agent(
         test_env, test_agent)
     print(
         "average reward:: original: %.3f" %
         before_reward,
         " least-squares: %.3f" % after_reward)
     if after_reward > before_reward:
         net.fc2_adv.load_state_dict(
             w_adv_last_dict_after)
         net.fc2_val.load_state_dict(
             w_val_last_dict_after)
Example #3
 if debug_change:
     w_adv_last_debug = copy.deepcopy(
         net.fc2_adv.state_dict())
 if conditional_update:
     print("comparing old and new weights...")
     w_adv_last_dict_after = copy.deepcopy(
         net.fc2_adv.state_dict())
     w_val_last_dict_after = copy.deepcopy(
         net.fc2_val.state_dict())
     test_agent = copy.deepcopy(agent)
     # test original
     test_agent.dqn_model.fc2_adv.load_state_dict(
         w_adv_last_dict_before)
     test_agent.dqn_model.fc2_val.load_state_dict(
         w_val_last_dict_before)
     before_reward = utils.test_agent(test_env, test_agent)
     # test new
     test_agent.dqn_model.fc2_adv.load_state_dict(
         w_adv_last_dict_after)
     test_agent.dqn_model.fc2_val.load_state_dict(
         w_val_last_dict_after)
     after_reward = utils.test_agent(test_env, test_agent)
     print(
         "average reward:: original: %.3f" % before_reward,
         " least-squares: %.3f" % after_reward)
     if (after_reward > before_reward) and (
             abs(after_reward - before_reward) <
             change_threshold):
         net.fc2_adv.load_state_dict(w_adv_last_dict_after)
         net.fc2_val.load_state_dict(w_val_last_dict_after)
         print("using updated weights.")
Example #4
    # Network parameters
    hidden_size = 20  # number of units in each Q-network hidden layer
    learning_rate = 0.01  # Q-network learning rate
    # Memory parameters
    memory_size = 10000  # memory capacity
    batch_size = 32  # experience mini-batch size
    pretrain_length = batch_size  # number of experiences used to pretrain the memory
    memory = Memory(max_size=memory_size)

    # Initialize the simulation
    env = gym.make('CartPole-v1')

    # TODO: specify the network parameters and the model name
    agent = DQNAgent(env,
                     explore_start,
                     explore_stop,
                     decay_rate,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     use_targetQ=True,
                     C=20,
                     use_dueling=False,
                     lr=learning_rate)
    model_name = "nips"
    agent.load_model(model_name + '.h5')
    ans = test_agent(agent, env, 500, 10, False)
    ans = np.array(ans)
    print('Mean: {:.1f}'.format(ans.mean()), 'Std: {:.1f}'.format(ans.std()),
          'Max: {:.1f}'.format(ans.max()), 'Min: {:.1f}'.format(ans.min()))
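The `test_agent` helper called in Examples #1 and #4 is also not shown on this page; Example #1 passes `test_max_steps` as a keyword and extends a list with the returned episode rewards, while Example #4 passes max steps, episode count, and a render flag positionally. A plausible sketch under those assumptions (classic Gym API, and a greedy `agent.act` method whose name is itself an assumption) is:

def test_agent(agent, env, test_max_steps=500, n_episodes=1, render=False):
    # Sketch of the evaluation helper used in Examples #1 and #4; the real
    # function and the agent's greedy-action method name are assumptions here.
    episode_rewards = []
    for _ in range(n_episodes):
        state = env.reset()
        total_reward, steps, done = 0.0, 0, False
        while not done and steps < test_max_steps:
            if render:
                env.render()
            action = agent.act(state)  # assumed greedy action method
            state, reward, done, _ = env.step(action)
            total_reward += reward
            steps += 1
        episode_rewards.append(total_reward)
    # One total reward per evaluation episode, as consumed by both examples
    return episode_rewards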