import numpy as np

# ActionMap, DQNAgent, calculate_mean_reward and env are project-local;
# their import paths depend on the repository layout.

dim_action = 40
dim_state = 3
am = ActionMap(dim_action)
agent = DQNAgent(dim_state, dim_action, am)

# observation bounds of the environment (not used further below)
high = np.array([1., 1., 8.])
low = -high

# print("mean 100 episode reward before learning: {}".format(
#     calculate_mean_reward(agent, env)))

episodes = 1000
for i in range(episodes):
    print(i)
    observation = env.reset()
    while True:
        env.render(mode="human")
        # choose an action, apply it and store the observed transition
        action = agent.act(observation)
        new_observation, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, new_observation, done)
        observation = new_observation
        if done:
            agent.replay(100)
            break

print("mean 100 episode reward after learning: {}".format(
    calculate_mean_reward(agent, env)))
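# Hedged sketch (not taken from the project) of the calculate_mean_reward()
# helper referenced above: it is assumed to roll out the current policy for
# 100 episodes and average the per-episode return, matching the
# "mean 100 episode reward" wording of the print statements. The real helper
# and its signature may differ.
def calculate_mean_reward(agent, env, num_episodes=100):
    mean_reward = 0.0
    for _ in range(num_episodes):
        observation = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            action = agent.act(observation)
            observation, reward, done, _ = env.step(action)
            episode_reward += reward
        mean_reward += episode_reward / num_episodes
    return mean_reward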
import os
import time

# DQNAgent and RobotArmEnvironment are project-local classes; their import
# paths depend on the repository layout.


def run(env: RobotArmEnvironment, agent: DQNAgent, num_episodes: int,
        max_num_steps: int, batch_size: int, directory_path: str,
        random_run: bool = False):
    reward_history_file_name = directory_path + "reward.csv"
    action_history_file_name = directory_path + "action.csv"
    max_q_history_file_name = directory_path + "max-q.csv"
    state_history_file_name = directory_path + "state.csv"
    # Parse these files with:
    #   with open(file_name, "r") as f:
    #       reader = csv.reader(f, delimiter=" ")
    #       for row in reader:
    #           for col in row:
    #               col = ast.literal_eval(col)
    # (nan values have to be checked for)

    previous = time.time()
    for episode_idx in range(num_episodes):
        state = env.reset()
        for step_idx in range(max_num_steps):
            with open(state_history_file_name, "a") as f:
                f.write(("(" + ("{}," * 6) + ") ").format(*env.simulation.state))

            if episode_idx % 100 == 0:
                env.render()
                time.sleep(1 / 10)

            # take an action
            if random_run:
                action = env.action_space.sample()
            else:
                max_q, action, prediction = agent.act(state)
                with open(max_q_history_file_name, "a") as f:
                    f.write("{} ".format(max_q))
            with open(action_history_file_name, "a") as f:
                f.write("({},{}) ".format(env.action_map.get(int(action))[0],
                                          env.action_map.get(int(action))[1]))

            # observe effect of action and remember
            new_state, reward, done, info = env.step(action)
            if not random_run:
                agent.remember(state, action, reward, new_state, done)
            with open(reward_history_file_name, "a") as f:
                f.write("{} ".format(float(reward)))

            # store new state
            state = new_state
            if done:
                break

        if not random_run:
            agent.replay(batch_size)

        # new line in all data files
        with open(action_history_file_name, "a") as f:
            f.write("\n")
        with open(reward_history_file_name, "a") as f:
            f.write("\n")
        if not random_run:
            with open(max_q_history_file_name, "a") as f:
                f.write("\n")
        with open(state_history_file_name, "a") as f:
            f.write(("(" + ("{}," * 6) + ") \n").format(*env.simulation.state))

        if not random_run and episode_idx % 50 == 0:
            agent.save(directory_path + "weights-ep-{}".format(episode_idx))

        current = time.time()
        print("{}: episode {:3}/{:3} completed in {:4}s".format(
            os.getpid(), episode_idx, num_episodes, current - previous))
        previous = current
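# Standalone reader for the history files written by run() above, following
# the parsing recipe given in the comment inside run() (space-delimited
# columns, one episode per row, ast.literal_eval per column). The file name in
# the usage line is illustrative only.
import ast
import csv


def read_history(file_name):
    episodes = []
    with open(file_name, "r") as f:
        reader = csv.reader(f, delimiter=" ")
        for row in reader:
            episode = []
            for col in row:
                if not col:
                    continue  # skip the empty field left by the trailing space
                try:
                    episode.append(ast.literal_eval(col))
                except (ValueError, SyntaxError):
                    # e.g. a bare "nan" field, which literal_eval rejects
                    episode.append(float("nan"))
            episodes.append(episode)
    return episodes


# example: reward_history = read_history(directory_path + "reward.csv")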
# inner step loop of one training episode, instrumented with wall-clock
# timers (state, tr and episode_idx are set up by the surrounding episode
# loop; time() and sleep() come from the time module)
ct = time()
total_time_acting = 0
total_time_stepping = 0
total_time_remembering = 0
total_overhead = time()
ct_act, ct_step, ct_rem = 0, 0, 0
for i in range(max_iterations_per_episode):
    if (episode_idx + 1) % 50 == 0 or episode_idx == 0:
        env.render()
        # sleep(1 / 2)

    ct_act = time()
    action = agent.act(state)
    total_time_acting += time() - ct_act

    ct_step = time()
    next_state, reward, done, _ = env.step(action)
    # print("action {}, state {}".format(env.action_map.get(action), next_state))
    total_time_stepping += time() - ct_step

    ct_rem = time()
    agent.remember(state, action, reward, next_state, done)
    total_time_remembering += time() - ct_rem

    state = next_state
    tr += reward

agent.replay(32)
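# The timing totals gathered above are presumably reported once the step loop
# finishes; a minimal per-episode summary along those lines could look like
# the following (the exact format is an assumption, not taken from the
# original code):
print("episode {}: total reward {:.2f}, wall time {:.2f}s "
      "(act {:.2f}s, step {:.2f}s, remember {:.2f}s)".format(
          episode_idx, tr, time() - ct, total_time_acting,
          total_time_stepping, total_time_remembering))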
import time

import numpy as np

# DQNAgent, RobotArmEnvironment, save_info, number_of_episodes and
# max_iterations_per_episode are defined elsewhere in the project.


def run_experiments(reward_index):
    if reward_index < 0 or reward_index > 1:
        raise ValueError()

    num_episodes = number_of_episodes
    num_steps = max_iterations_per_episode
    batchsize = 32
    state_size = 6
    action_size = 81
    memory_size = 100000
    epsilon_start = 1
    epsilon_min = 0.1
    epsilon_decay_per_step = 10000
    lr = 0.00001
    dr = 0.99
    amount_layers = 2
    amount_nodes_layer = 40
    frequency_updates = 1000

    parameters = {
        'num_episodes': num_episodes,
        'num_steps': num_steps,
        'batchsize': batchsize,
        'state_size': state_size,
        'action_size': action_size,
        'memory_size': memory_size,
        'epsilon_start': epsilon_start,
        'epsilon_min': epsilon_min,
        'epsilon_decay_episodes_required': epsilon_decay_per_step,
        'learning_rate': lr,
        'discount_rate': dr,
        'amount_layers': amount_layers,
        'amount_nodes_layer': amount_nodes_layer,
        'frequency_update_target_model': frequency_updates,
    }

    agent = DQNAgent(state_size, action_size, num_steps, epsilon_start,
                     epsilon_min, epsilon_decay_per_step, dr, lr,
                     amount_layers, (amount_nodes_layer, amount_nodes_layer),
                     frequency_updates)

    with RobotArmEnvironment(reward_function_index=reward_index,
                             reward_function_params=(1 / 6 * np.pi, 2 * np.pi,
                                                     1, 10, 0.05, 0.1, 2,
                                                     0.001, 1)) as env:
        ah = list()  # per-episode action history
        rh = list()  # per-episode reward history
        for episode_idx in range(num_episodes):
            state = env.reset()
            tr = 0
            ct = time.time()
            ah.append(list())
            rh.append(list())
            for i in range(num_steps):
                action = agent.act(state)
                ah[episode_idx].append(env.action_map.get(int(action)))
                next_state, reward, done, _ = env.step(action)
                rh[episode_idx].append(float(reward))
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                tr += reward
                if done:
                    break

            agent.replay(batchsize)
            print("episode {}/{}, total reward {}, epsilon {}, time taken {}s"
                  .format(episode_idx + 1, num_episodes, tr,
                          agent.get_epsilon(), time.time() - ct))
            agent._update_epsilon()

            if episode_idx % 100 == 0 and episode_idx != 0:
                agent.safe()
                save_info(episode_idx, reward_index, parameters,
                          env.action_map.to_json_object(), rh, ah,
                          env.to_json_object())
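# Hypothetical driver: run_experiments() only accepts reward_index 0 or 1 (see
# the ValueError guard above), so running both configured reward functions
# back to back could look like this.
if __name__ == "__main__":
    for reward_index in (0, 1):
        run_experiments(reward_index)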