    dim_action = 40
    dim_state = 3

    am = ActionMap(dim_action)
    agent = DQNAgent(dim_state, dim_action, am)

    high = np.array([1., 1., 8.])
    low = -high

    #print("mean 100 episode reward before learning: {}".format(calculate_mean_reward(agent, env)))

    episodes = 1000
    for i in range(episodes):
        print(i)
        observation = env.reset()

        while True:
            env.render(mode="human")
            action = agent.act(observation)
            # step with the agent's chosen action so the stored transition
            # matches the action that was actually executed
            new_observation, reward, done, info = env.step(action)

            agent.remember(observation, action, reward, new_observation, done)

            if done:
                agent.replay(100)
                break

    print("mean 100 episode reward after learning: {}".format(
        calculate_mean_reward(agent, env)))
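
# calculate_mean_reward is not shown in this excerpt; a rough sketch of what
# such a helper could look like (an assumption, not the original implementation):
def calculate_mean_reward(agent, env, episodes=100):
    """Average the total reward the agent collects over `episodes` episodes."""
    total_reward = 0.0
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            action = agent.act(observation)
            observation, reward, done, _ = env.step(action)
            total_reward += reward
    return total_reward / episodes
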
Example 2
def run(env: RobotArmEnvironment,
        agent: DQNAgent,
        num_episodes: int,
        max_num_steps: int,
        batch_size: int,
        directory_path: str,
        random_run: bool = False):

    reward_history_file_name = directory_path + "reward.csv"
    action_history_file_name = directory_path + "action.csv"
    max_q_history_file_name = directory_path + "max-q.csv"
    state_history_file_name = directory_path + "state.csv"

    # Parse these files with csv and ast, e.g.:
    # with open(file_name, "r") as f:
    #     reader = csv.reader(f, delimiter=" ")
    #     for row in reader:
    #         for col in row:
    #             value = ast.literal_eval(col)  # empty and nan columns have to be checked for first
    # (a standalone reader along these lines is sketched after this function)

    previous = time.time()
    for episode_idx in range(num_episodes):
        state = env.reset()

        for step_idx in range(max_num_steps):
            with open(state_history_file_name, "a") as f:
                f.write(
                    ("(" + ("{}," * 6) + ") ").format(*env.simulation.state))

            if episode_idx % 100 == 0:
                env.render()
                time.sleep(1 / 10)

            # take an action
            if random_run:
                action = env.action_space.sample()
            else:
                max_q, action, prediction = agent.act(state)

            if not random_run:
                with open(max_q_history_file_name, "a") as f:
                    f.write("{} ".format(max_q))

            with open(action_history_file_name, "a") as f:
                f.write("({},{}) ".format(
                    env.action_map.get(int(action))[0],
                    env.action_map.get(int(action))[1]))

            # observe effect of action and remember
            new_state, reward, done, info = env.step(action)

            if not random_run:
                agent.remember(state, action, reward, new_state, done)

            with open(reward_history_file_name, "a") as f:
                f.write("{} ".format(float(reward)))

            # store new state
            state = new_state

            if done:
                break

        if not random_run:
            agent.replay(batch_size)

        # new line in all data files
        with open(action_history_file_name, "a") as f:
            f.write("\n")
        with open(reward_history_file_name, "a") as f:
            f.write("\n")
        if not random_run:
            with open(max_q_history_file_name, "a") as f:
                f.write("\n")
        with open(state_history_file_name, "a") as f:
            f.write(("(" + ("{}," * 6) + ") \n").format(*env.simulation.state))

        if not random_run and episode_idx % 50 == 0:
            agent.save(directory_path + "weights-ep-{}".format(episode_idx))

        current = time.time()
        print("{}: episode {:3}/{:3} completed in {:4}s".format(
            os.getpid(), episode_idx, num_episodes, current - previous))
        previous = current
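
# The comment block in run() above sketches how to read the log files back in;
# a minimal standalone reader along those lines (assumed usage, matching the
# space-separated format written by run()):
import ast
import csv


def read_log(file_name):
    """Parse one space-separated log file into a list of rows of Python values."""
    rows = []
    with open(file_name, "r") as f:
        reader = csv.reader(f, delimiter=" ")
        for row in reader:
            values = []
            for col in row:
                # skip the empty column left by the trailing delimiter and any
                # nan entries, which ast.literal_eval cannot parse
                if not col or "nan" in col:
                    continue
                values.append(ast.literal_eval(col))
            rows.append(values)
    return rows
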

    # Excerpt from a separate, instrumented episode loop (assumes
    # `from time import time`): it measures how much time each step spends
    # acting, stepping the environment and storing the transition.
    ct = time()

    total_time_acting = 0
    total_time_stepping = 0
    total_time_remembering = 0
    total_overhead = time()

    ct_act, ct_step, ct_rem = 0, 0, 0
    for i in range(max_iterations_per_episode):
        if (episode_idx + 1) % 50 == 0 or episode_idx == 0:
            env.render()
        # sleep(1 / 2)

        ct_act = time()
        action = agent.act(state)
        total_time_acting += time() - ct_act

        ct_step = time()
        next_state, reward, done, _ = env.step(action)
        # print("action {}, state {}".format(env.action_map.get(action), next_state))
        total_time_stepping += time() - ct_step

        ct_rem = time()
        agent.remember(state, action, reward, next_state, done)
        total_time_remembering += time() - ct_rem

        state = next_state
        tr += reward

        agent.replay(32)


def run_experiments(reward_index):
    if reward_index < 0 or reward_index > 1:
        raise ValueError("reward_index must be 0 or 1")

    num_episodes = number_of_episodes
    num_steps = max_iterations_per_episode
    batchsize = 32
    state_size = 6
    action_size = 81
    memory_size = 100000
    epsilon_start = 1
    epsilon_min = 0.1
    epsilon_decay_per_step = 10000
    lr = 0.00001
    dr = 0.99
    amount_layers = 2
    amount_nodes_layer = 40
    frequency_updates = 1000

    parameters = {}
    parameters['num_episodes'] = num_episodes
    parameters['num_steps'] = num_steps
    parameters['batchsize'] = batchsize
    parameters['state_size'] = state_size
    parameters['action_size'] = action_size
    parameters['memory_size'] = memory_size
    parameters['epsilon_start'] = epsilon_start
    parameters['epsilon_min'] = epsilon_min
    parameters['epsilon_decay_episodes_required'] = epsilon_decay_per_step
    parameters['learning_rate'] = lr
    parameters['discount_rate'] = dr
    parameters['amount_layers'] = amount_layers
    parameters['amount_nodes_layer'] = amount_nodes_layer
    parameters['frequency_update_target_model'] = frequency_updates

    agent = DQNAgent(state_size, action_size, num_steps, epsilon_start,
                     epsilon_min, epsilon_decay_per_step, dr, lr, amount_layers,
                     (amount_nodes_layer, amount_nodes_layer),
                     frequency_updates)

    with RobotArmEnvironment(reward_function_index=reward_index,
                             reward_function_params=(1 / 6 * np.pi, 2 * np.pi,
                                                     1, 10, 0.05, 0.1, 2,
                                                     0.001, 1)) as env:

        ah = list()
        rh = list()

        for episode_idx in range(number_of_episodes):
            state = env.reset()
            tr = 0
            ct = time.time()

            ah.append(list())
            rh.append(list())

            for i in range(max_iterations_per_episode):
                action = agent.act(state)
                ah[episode_idx].append(env.action_map.get(int(action)))

                next_state, reward, done, _ = env.step(action)
                rh[episode_idx].append(float(reward))

                agent.remember(state, action, reward, next_state, done)

                state = next_state
                tr += reward

                if done:
                    break

                agent.replay(32)

            print(
                "episode {}/{}, total reward {}, epsilon {}, time taken {}s".
                format(episode_idx + 1, number_of_episodes, tr,
                       agent.get_epsilon(),
                       time.time() - ct))

            agent._update_epsilon()

            if episode_idx % 100 == 0 and episode_idx != 0:
                agent.safe()
                save_info(episode_idx, reward_index, parameters,
                          env.action_map.to_json_object(), rh, ah,
                          env.to_json_object())
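
# A minimal, assumed entry point (not part of the original file): run_experiments
# accepts reward_index 0 or 1, so both reward functions can be run back to back.
if __name__ == "__main__":
    for reward_index in (0, 1):
        run_experiments(reward_index)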