# Shared imports for the experiments below (standalone Keras API, as used throughout
# the original code). The project-local helpers -- DQNAgent, EnsemblerAgent,
# EnsemblerType, RingBuffer, accuracy, pre_processing and the global `seed` -- are
# assumed to come from the repository's own modules.
import time
import random as ran

import gym
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.layers import Dense, Conv2D, Flatten
from keras.initializers import VarianceScaling
from keras.optimizers import Adam
from tqdm import tqdm


def train_and_test(experiments):
    """Train every (model name, training episodes, agent) triple, then test the saved model."""
    df = pd.DataFrame(columns=['model name', 'episode number', 'train mean score',
                               'train mean steps', 'test accuracy', 'test mean score',
                               'test mean steps'])

    for model_name, steps, train_agent in experiments:
        # Train
        train_res = experiment(steps, agent_config=train_agent)
        train_res["agent"].save_model(model_name)
        training_mean_steps = train_res["steps"].mean()
        training_mean_score = train_res["scores"].mean()
        np.savetxt("results/training/ddqn.csv", train_res["steps"], delimiter=',')

        # Test the saved model with a near-greedy policy
        test_agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True,
                              model_filename=model_name, epsilon=0.01,
                              epsilon_lower_bound=0.01, learn_thresh=0)
        test_res = experiment(500, default_policy=True, policy=model_name,
                              agent_config=test_agent)
        testing_accuracy = accuracy(test_res["results"])
        testing_mean_steps = test_res["steps"].mean()
        testing_mean_score = test_res["scores"].mean()
        np.savetxt("results/testing/ddqn.csv", test_res["steps"], delimiter=',')

        df.loc[len(df)] = [model_name, len(train_res["steps"]), training_mean_score,
                           training_mean_steps, testing_accuracy, testing_mean_score,
                           testing_mean_steps]

    df.to_csv('experiments.csv')
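# `accuracy` and `seed` are used in this listing but not defined in it. Below is a
# minimal sketch of both, assuming `results` is the [losses, victories] pair returned
# by `experiment` and that accuracy is expected as a percentage (the early-stopping
# check below compares it to 100). The seed value itself is not shown in the listing.
seed = 0  # assumed module-level seed for the Gym environments


def accuracy(results):
    """Percentage of victories over all recorded episodes (sketch)."""
    return results[1] / results.sum() * 100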
# --- Pong (PongDeterministic-v4), single DDQN with a convolutional network ---
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing.

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: filename of a trained model to load
        render: enable OpenAI Gym graphical rendering

    Returns:
        Dictionary with:
            cumulative experiment outcomes
            list of steps per episode
            list of cumulative rewards
            trained agent
    """
    with tf.device('/gpu:0'):
        res = [0, 0]  # results accumulator: [0] losses, [1] victories
        scores = []   # cumulative rewards
        steps = []    # steps per episode
        reward_list = RingBuffer(100)

        env = gym.make('PongDeterministic-v4')
        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n

        if default_policy:
            agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True,
                             model_filename=policy, epsilon=0.05,
                             epsilon_lower_bound=0.05)
        else:
            layers = [Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                             input_shape=(84, 84, 4),
                             kernel_initializer=VarianceScaling(scale=2.0)),
                      Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                             kernel_initializer=VarianceScaling(scale=2.0)),
                      Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                             kernel_initializer=VarianceScaling(scale=2.0)),
                      Flatten(),
                      Dense(512, activation='relu'),
                      Dense(output_dim)]
            agent = DQNAgent(output_dim, layers, use_ddqn=True, memory_size=700000,
                             gamma=0.99, learn_thresh=50000, epsilon_lower_bound=0.02,
                             epsilon_decay_function=lambda e: e - (0.98 / 950000),
                             update_rate=10000, optimizer=Adam(0.00025))

        gathered_frame = 0
        for episode_number in tqdm(range(n_episodes), desc="Episode"):
            frame = env.reset()
            state = pre_processing(frame)
            empty_state = np.zeros(state.shape, dtype="uint8")
            cumulative_reward = 0
            has_lost_life = True
            t = 0

            while True:
                if has_lost_life:
                    # After a point is scored, take action 1 for a random number of
                    # frames and restart from an empty frame stack.
                    next_action = 1  # [1, 4, 5][ran.randint(0, 2)]
                    stack = np.stack((empty_state, empty_state, empty_state,
                                      empty_state), axis=2)
                    stack = np.reshape([stack], (1, 84, 84, 4))
                    for _ in range(ran.randint(1, 10)):
                        gathered_frame += 1
                        frame, reward, end, _ = env.step(next_action)
                        new_state = np.reshape(pre_processing(frame), (1, 84, 84, 1))
                        new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
                        stack = new_stack
                        if render:
                            env.render()
                    has_lost_life = False

                next_action = agent.act(stack)
                new_state, reward, end, _ = env.step(next_action)
                if render:
                    env.render()
                    time.sleep(0.02)

                reward = np.clip(reward, -1., 1.)
                if reward != 0:  # a point was scored on either side
                    has_lost_life = True
                cumulative_reward += reward

                new_state = np.reshape(pre_processing(new_state), (1, 84, 84, 1))
                new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
                # Store the stacked next state so it matches the network input shape.
                agent.memoise((stack, next_action, reward, new_stack, has_lost_life))
                stack = new_stack
                gathered_frame += 1

                if end:
                    reward_list.append(cumulative_reward)
                    if cumulative_reward > 0:
                        res[1] += 1
                        print("You Won!, steps:", t, "reward:", reward_list.mean(),
                              "frames:", gathered_frame)
                    else:
                        res[0] += 1
                        print("You Lost!, steps:", t, "reward:", reward_list.mean(),
                              "frames:", gathered_frame)
                    steps.append(t)
                    break

                agent.learn()
                t += 1

            scores.append(cumulative_reward)
            if episode_number >= 50 and episode_number % 10 == 0:
                model_name = "partial_model_pong" + str(episode_number)
                agent.save_model(model_name)

        env.close()
        return {"results": np.array(res), "steps": np.array(steps),
                "scores": np.array(scores), "agent": agent}
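# `pre_processing` and `RingBuffer` are used above but not part of this listing.
# Below are minimal sketches: the usual Atari preprocessing (grayscale + resize to
# 84x84 uint8), assuming OpenCV is available, and a fixed-size buffer exposing only
# the `append` and `mean` operations the Pong loop relies on. The real helpers in
# the repository may crop, normalise, or buffer differently.
from collections import deque

import cv2


def pre_processing(frame):
    """Convert a raw 210x160x3 Atari frame into an 84x84 grayscale uint8 image (sketch)."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype("uint8")


class RingBuffer:
    """Keep the last `maxlen` cumulative rewards for the running average (sketch)."""

    def __init__(self, maxlen):
        self.data = deque(maxlen=maxlen)

    def append(self, value):
        self.data.append(value)

    def mean(self):
        return sum(self.data) / len(self.data) if self.data else 0.0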
# --- CartPole-v0, ensemble of ten DDQN agents with trust-based voting ---
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing.

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: filename of a trained model to load
        render: enable OpenAI Gym graphical rendering

    Returns:
        Dictionary with:
            cumulative experiment outcomes
            list of steps per episode
            list of cumulative rewards
    """
    res = [0, 0]  # results accumulator: [0] losses, [1] victories
    scores = []   # cumulative rewards
    steps = []    # steps per episode

    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.seed(seed)
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    layer1 = Dense(10, input_dim=input_dim, activation='relu')
    layer2 = Dense(output_dim)

    if default_policy:
        agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True,
                         model_filename=policy, epsilon=0, epsilon_lower_bound=0,
                         learn_thresh=0)
    else:
        # Ten identically configured DDQN agents (the two layer objects are shared,
        # as in the original listing).
        agents = [DQNAgent(output_dim, [layer1, layer2], use_ddqn=True,
                           learn_thresh=2000, update_rate=100,
                           epsilon_decay_function=lambda e: e - 0.0001,
                           epsilon_lower_bound=0.1,
                           optimizer=keras.optimizers.RMSprop(0.001),
                           memory_size=2000, tb_dir=None)
                  for _ in range(10)]
        agentE = EnsemblerAgent(output_dim, agents, EnsemblerType.TRUST_BASED)

    for i_ep in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0
        state = np.reshape(state, [1, 4])
        t = 0

        while True:
            if render:
                env.render()
                time.sleep(0.1)

            next_action = agentE.act(state)
            new_state, reward, end, _ = env.step(next_action)
            x, x_dot, theta, theta_dot = new_state
            new_state = np.reshape(new_state, [1, 4])

            # Reward shaping (r1 is computed but not used below)
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r3 = -abs(theta_dot)

            # Odd-numbered agents learn from the pole-angle reward, even-numbered
            # ones from the angular-velocity reward (same pairing as the original).
            for i, agent in enumerate(agents):
                shaped_reward = r2 if i % 2 == 0 else r3
                agent.memoise((state, next_action, shaped_reward, new_state, end))

            if end or t > 199:
                if t < 195:
                    res[0] += 1
                else:
                    res[1] += 1
                    print("Solved!,", t, "steps", "reward: ", cumulative_reward)
                steps.append(t)
                if i_ep % 100 == 0:
                    # Periodic evaluation for early stopping
                    if evaluate(env, agentE):
                        cumulative_reward += reward
                        scores.append(cumulative_reward)
                        env.close()
                        return {"results": np.array(res), "steps": np.array(steps),
                                "scores": np.array(scores)}
                break
            else:
                state = new_state
                cumulative_reward += reward
                for agent in agentE.agents:
                    agent.learn()
                t += 1

        cumulative_reward += reward
        scores.append(cumulative_reward)

    env.close()
    return {"results": np.array(res), "steps": np.array(steps),
            "scores": np.array(scores)}
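# `evaluate(env, agentE)` is called above for early stopping but is not part of this
# listing. A minimal sketch, assuming it runs a batch of greedy episodes and reports
# whether the ensemble reaches the CartPole-v0 solve threshold (195 steps) every
# time; the episode count and threshold parameters are illustrative, not original.
def evaluate(env, ensemble, n_episodes=100, solve_steps=195):
    """Return True only if every evaluation episode lasts at least `solve_steps` steps (sketch)."""
    for _ in range(n_episodes):
        state = np.reshape(env.reset(), [1, 4])
        for t in range(200):
            state, _, end, _ = env.step(ensemble.act(state))
            state = np.reshape(state, [1, 4])
            if end:
                break
        if t < solve_steps:
            return False
    return True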
# --- MountainCar-v0, ensemble of ten DDQN agents with majority voting ---
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing.

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: filename of a trained model to load
        render: enable OpenAI Gym graphical rendering

    Returns:
        Dictionary with:
            cumulative experiment outcomes
            list of steps per episode
            list of cumulative rewards
    """
    res = [0, 0]  # results accumulator: [0] losses, [1] victories
    scores = []   # cumulative rewards
    steps = []    # steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    layer1 = Dense(15, input_dim=input_dim, activation='relu')
    layer2 = Dense(output_dim)

    # Ten identically configured DDQN agents (the two layer objects are shared,
    # as in the original listing).
    agents = [DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000,
                       update_rate=300, epsilon_decay_function=lambda e: e - 0.001,
                       epsilon_lower_bound=0.01,
                       optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
              for _ in range(10)]
    agentE = EnsemblerAgent(env.action_space.n, agents,
                            EnsemblerType.MAJOR_VOTING_BASED)

    evaluate = False
    for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0
        state = np.reshape(state, [1, 2])

        if i_episode > 0 and i_episode % 120 == 0:
            evaluate = True

        if evaluate == False:
            # Training episode
            for t in range(env._max_episode_steps):
                if render:
                    env.render()

                next_action = agentE.act(state)
                new_state, reward, end, _ = env.step(next_action)
                original_state = new_state
                # r1 = reward + 0.1 * original_state[0]
                # r2 = reward + 0.2 * np.sin(3 * original_state[0])
                # r3 = reward + 0.7 * (original_state[1] * original_state[1])

                # Reward shaping; np.sin(3 * x) follows the MountainCar hill profile
                # (r3 and r4 are computed but not used below)
                r1 = reward + original_state[0]
                r2 = reward + np.sin(3 * original_state[0])
                r3 = reward + (original_state[1] * original_state[1])
                r4 = abs(new_state[0] - (-0.5))  # r in [0, 1]
                new_state = np.reshape(new_state, [1, 2])

                # Odd-numbered agents learn from the position-based reward,
                # even-numbered ones from the sine-shaped reward.
                for i, agent in enumerate(agents):
                    shaped_reward = r1 if i % 2 == 0 else r2
                    agent.memoise((state, next_action, shaped_reward, new_state, end))

                if end:
                    if t == env._max_episode_steps - 1:
                        res[0] += 1
                    else:
                        res[1] += 1
                        print("Solved!,", t, "steps", "reward: ", cumulative_reward)
                    steps.append(t)
                    break
                else:
                    state = new_state
                    cumulative_reward += reward
                    for agent in agentE.agents:
                        agent.learn()

            cumulative_reward += reward
            scores.append(cumulative_reward)
        else:
            # Model validation for early stopping
            evaluate = False
            eval_res = [0, 0]  # results accumulator: [0] losses, [1] victories
            eval_scores = []   # cumulative rewards
            eval_steps = []    # steps per episode

            for eval_episode in range(100):
                state = env.reset()
                state = np.reshape(state, [1, 2])
                cumulative_reward = 0

                for t in range(env._max_episode_steps):
                    if render:
                        env.render()

                    next_action = agentE.act(state)
                    new_state, reward, end, _ = env.step(next_action)
                    new_state = np.reshape(new_state, [1, 2])

                    if end:
                        if t == env._max_episode_steps - 1:
                            eval_res[0] += 1
                        else:
                            eval_res[1] += 1
                        eval_steps.append(t)
                        break
                    else:
                        state = new_state
                        cumulative_reward += reward

                cumulative_reward += reward
                eval_scores.append(cumulative_reward)

            testing_accuracy = accuracy(np.array(eval_res))
            testing_mean_steps = np.array(eval_steps).mean()
            testing_mean_score = np.array(eval_scores).mean()
            print("\nTraining episodes:", len(steps),
                  "Training mean score:", np.array(scores).mean(),
                  "Training mean steps:", np.array(steps).mean(),
                  "\nAccuracy:", testing_accuracy,
                  "Test mean score:", testing_mean_score,
                  "Test mean steps:", testing_mean_steps)

    env.close()
    return {"results": np.array(res), "steps": np.array(steps),
            "scores": np.array(scores)}
# --- MountainCar-v0, single DDQN with an optional externally supplied agent ---
def experiment(n_episodes, default_policy=False, policy=None, render=False,
               agent_config=None):
    """
    Run a RL experiment that can be either training or testing.

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: filename of a trained model to load
        render: enable OpenAI Gym graphical rendering
        agent_config: preconfigured DQNAgent object (overrides the defaults below)

    Returns:
        Dictionary with:
            cumulative experiment outcomes
            list of steps per episode
            list of cumulative rewards
            trained agent
    """
    res = [0, 0]  # results accumulator: [0] losses, [1] victories
    scores = []   # cumulative rewards
    steps = []    # steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    if agent_config is None:
        if default_policy:
            agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True,
                             model_filename=policy, epsilon=0, epsilon_lower_bound=0,
                             learn_thresh=0)
        else:
            layer1 = Dense(15, input_dim=input_dim, activation='relu')
            layer2 = Dense(output_dim)
            agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True,
                             learn_thresh=1000, update_rate=300,
                             epsilon_decay_function=lambda e: e * 0.95,
                             epsilon_lower_bound=0.01,
                             optimizer=keras.optimizers.RMSprop(0.001))
    else:
        agent = agent_config

    for i_episode in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        # Model validation for early stopping
        if i_episode > 0 and (i_episode % 100) == 0 and not default_policy:
            agent.save_model("tmp_model")
            evaluation_result = experiment(500, default_policy=True,
                                           policy="tmp_model")
            acc = accuracy(evaluation_result["results"])
            if acc == 100:
                break
            else:
                print("Accuracy:", acc, "Episode:", i_episode)

        state = np.reshape(state, [1, 2])
        for t in range(env._max_episode_steps):
            if render:
                env.render()

            next_action = agent.act(state)
            new_state, reward, end, _ = env.step(next_action)
            # Reward shaping: distance from the valley bottom at x = -0.5, r in [0, 1]
            reward = abs(new_state[0] - (-0.5))
            new_state = np.reshape(new_state, [1, 2])
            agent.memoise((state, next_action, reward, new_state, end))

            if end:
                if t == env._max_episode_steps - 1:
                    res[0] += 1
                else:
                    res[1] += 1
                    # print("Solved!,", t, "steps")
                steps.append(t)
                break
            else:
                state = new_state
                cumulative_reward += reward

            agent.learn()

        cumulative_reward += reward
        scores.append(cumulative_reward)

    env.close()
    return {"results": np.array(res), "steps": np.array(steps),
            "scores": np.array(scores), "agent": agent}
# Driver: build the list of (model name, training episodes, agent) experiments.
# experiment(10, render=True, default_policy=True, policy="model1")
input_dim = 2
output_dim = 3
experiments = []

layer1 = Dense(15, input_dim=input_dim, activation='relu')
layer2 = Dense(output_dim)
layers = [layer1, layer2]
experiments.append(("model23", 25000,
                    DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000,
                             update_rate=300,
                             epsilon_decay_function=lambda e: e * 0.995,
                             epsilon_lower_bound=0.01,
                             optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
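# A plausible end of the driver, assuming the script finishes by running the
# experiment list through the `train_and_test` routine defined at the top of
# this listing.
train_and_test(experiments)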