        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state
            t += 1

        # Sample mini-batch from memory
        batch = memory.sample(batch_size)
        states = np.array([each[0] for each in batch])
        actions = np.array([each[1] for each in batch])
        rewards = np.array([each[2] for each in batch])
        next_states = np.array([each[3] for each in batch])

        # Train network
        loss = agent.learn(states, actions, rewards, gamma, next_states)

    test_rewards_list.extend(
        test_agent(agent, env, test_max_steps=convergence_reward + 25))
    cur_compute_len = min(100, len(test_rewards_list))
    mean_reward = np.mean(test_rewards_list[-cur_compute_len:])
    print('Episode: {}'.format(ep),
          'Mean test reward: {:.1f}'.format(mean_reward))

    if mean_reward > convergence_reward:
        print(ep, "solved")
        break

agent.save(model_name + ".h5")
np.save(model_name + "_train_rewards.npy", train_rewards_list)
np.save(model_name + "_test_rewards.npy", test_rewards_list)
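# NOTE (assumed helper): the Memory class used above is not defined in this
# snippet. The sketch below is a minimal replay buffer consistent with the
# calls Memory(max_size=...), memory.add(experience) and
# memory.sample(batch_size); the actual implementation may differ.
from collections import deque
import random

class Memory:
    def __init__(self, max_size=10000):
        # Bounded FIFO buffer: oldest experiences are evicted first
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        # experience is expected to be a (state, action, reward, next_state) tuple
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniformly sample a mini-batch of stored transitions without replacement
        idx = random.sample(range(len(self.buffer)), batch_size)
        return [self.buffer[i] for i in idx]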
            device=device, use_boosting=use_boosting, use_double_dqn=use_double_dqn)

        if conditional_update:
            print("comparing old and new weights...")
            w_adv_last_dict_after = copy.deepcopy(net.fc2_adv.state_dict())
            w_val_last_dict_after = copy.deepcopy(net.fc2_val.state_dict())
            test_agent = copy.deepcopy(agent)

            # test original (pre-update) last-layer weights
            test_agent.dqn_model.fc2_adv.load_state_dict(w_adv_last_dict_before)
            test_agent.dqn_model.fc2_val.load_state_dict(w_val_last_dict_before)
            before_reward = utils.test_agent(test_env, test_agent)

            # test new (least-squares) last-layer weights
            test_agent.dqn_model.fc2_adv.load_state_dict(w_adv_last_dict_after)
            test_agent.dqn_model.fc2_val.load_state_dict(w_val_last_dict_after)
            after_reward = utils.test_agent(test_env, test_agent)

            print("average reward:: original: %.3f" % before_reward,
                  " least-squares: %.3f" % after_reward)

            # Keep the new weights only if they improve the average reward
            if after_reward > before_reward:
                net.fc2_adv.load_state_dict(w_adv_last_dict_after)
                net.fc2_val.load_state_dict(w_val_last_dict_after)
        if debug_change:
            w_adv_last_debug = copy.deepcopy(net.fc2_adv.state_dict())

        if conditional_update:
            print("comparing old and new weights...")
            w_adv_last_dict_after = copy.deepcopy(net.fc2_adv.state_dict())
            w_val_last_dict_after = copy.deepcopy(net.fc2_val.state_dict())
            test_agent = copy.deepcopy(agent)

            # test original
            test_agent.dqn_model.fc2_adv.load_state_dict(w_adv_last_dict_before)
            test_agent.dqn_model.fc2_val.load_state_dict(w_val_last_dict_before)
            before_reward = utils.test_agent(test_env, test_agent)

            # test new
            test_agent.dqn_model.fc2_adv.load_state_dict(w_adv_last_dict_after)
            test_agent.dqn_model.fc2_val.load_state_dict(w_val_last_dict_after)
            after_reward = utils.test_agent(test_env, test_agent)

            print("average reward:: original: %.3f" % before_reward,
                  " least-squares: %.3f" % after_reward)

            if (after_reward > before_reward) and (
                    abs(after_reward - before_reward) < change_threshold):
                net.fc2_adv.load_state_dict(w_adv_last_dict_after)
                net.fc2_val.load_state_dict(w_val_last_dict_after)
                print("using updated weights.")
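# NOTE (assumed helper): utils.test_agent(test_env, agent) above appears to
# return a single scalar (average episode reward) used for the before/after
# comparison. A minimal sketch under that assumption; the episode count, the
# agent.dqn_model forward pass and the classic gym step API below are not
# confirmed by the original code.
import torch

def test_agent(env, agent, episodes=10):
    total_reward = 0.0
    for _ in range(episodes):
        state, done = env.reset(), False
        while not done:
            with torch.no_grad():
                q_values = agent.dqn_model(
                    torch.as_tensor(state, dtype=torch.float32).unsqueeze(0))
            action = int(q_values.argmax(dim=1).item())
            state, reward, done, _ = env.step(action)
            total_reward += reward
    # Average reward over the evaluation episodes
    return total_reward / episodes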
# Network parameters
hidden_size = 20          # number of units in each Q-network hidden layer
learning_rate = 0.01      # Q-network learning rate

# Memory parameters
memory_size = 10000       # memory capacity
batch_size = 32           # experience mini-batch size
pretrain_length = batch_size  # number of experiences to pretrain the memory

memory = Memory(max_size=memory_size)

# Initialize the simulation
env = gym.make('CartPole-v1')

# TODO: specify the network parameters and the model name
agent = DQNAgent(env, explore_start, explore_stop, decay_rate,
                 state_size=state_size, action_size=action_size,
                 hidden_size=hidden_size, use_targetQ=True, C=20,
                 use_dueling=False, lr=learning_rate)
model_name = "nips"

agent.load_model(model_name + '.h5')
ans = test_agent(agent, env, 500, 10, False)
ans = np.array(ans)
print('Mean: {:.1f}'.format(ans.mean()),
      'Std: {:.1f}'.format(ans.std()),
      'Max: {:.1f}'.format(ans.max()),
      'Min: {:.1f}'.format(ans.min()))
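# NOTE (assumed helper): test_agent(agent, env, ...) is not defined in this
# snippet. Judging by the calls test_agent(agent, env, 500, 10, False) and
# test_agent(agent, env, test_max_steps=...), it runs greedy evaluation
# episodes and returns a list of per-episode rewards. The sketch below assumes
# that signature and an agent.act(state) greedy-action method, neither of
# which is confirmed by the original code.
def test_agent(agent, env, test_max_steps=500, test_episodes=10, render=False):
    episode_rewards = []
    for _ in range(test_episodes):
        state = env.reset()
        total_reward, done, t = 0.0, False, 0
        while not done and t < test_max_steps:
            if render:
                env.render()
            action = agent.act(state)              # assumed greedy-action method
            state, reward, done, _ = env.step(action)
            total_reward += reward
            t += 1
        episode_rewards.append(total_reward)
    return episode_rewards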