Example #1
def test(agent: DQNAgent, test_eps):
    env = gym.make(ENV_NAME)
    ep_rewards = []

    for test_ep in range(test_eps):
        obs = env.reset()
        done = False

        ep_reward = 0
        ep_step = 0

        while not done:

            action = agent.act(np.array(obs), evaluate=True)
            next_obs, reward, done, _ = env.step(action)
            env.render()

            obs = next_obs

            ep_reward += reward
            ep_step += 1

        ep_rewards.append(ep_reward)
        time.sleep(0.2)

    print('\n')
    print('=== Test performance ===')
    print(f'Mean: {np.mean(ep_rewards):.1f} / '
          f'Min: {np.min(ep_rewards):.1f} / '
          f'Max: {np.max(ep_rewards):.1f}')

    env.close()
    return ep_rewards
Example #2
def dqn_run(episodes=2500,
            eps_start=1.0,
            eps_end=0.01,
            eps_decay=0.995,
            double_dqn=False,
            dueling_dqn=False,
            seed=42):
    env = start_env()
    env_info = reset_env_info(env)

    state_size = get_state_size(env_info)
    action_size = get_action_size(env)

    print('Seed used:', seed)
    agent = DQNAgent(state_size, action_size, double_dqn, dueling_dqn, seed)

    scores = []
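    # rolling window of the last 100 episode scores, used for the "solved" check below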
    scores_window = deque(maxlen=100)
    eps = eps_start

    for episode in range(1, episodes + 1):
        env_info = reset_env_info(env)
        score = 0.0
        done = False
        while not done:
            state = env_info.vector_observations[0]
            action = agent.act(state, epsilon=eps)
            env_info = env_step(env, action)
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward

        scores_window.append(score)
        scores.append(score)
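        # decay epsilon multiplicatively, never dropping below eps_end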
        eps = max(eps * eps_decay, eps_end)
        print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.format(
            episode, episodes, np.mean(scores_window), eps),
              end='     ')
        if episode % 100 == 0:
            print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.
                  format(episode, episodes, np.mean(scores_window), eps))
        if np.mean(scores_window) > 13.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break

    env.close()
    return scores
Example #3
def test_model(filename):
	env = gym.make("CartPole-v1")
	agent = DQNAgent(4, 2)
	agent.load_model(filename)

	state = env.reset()

	for _ in range(1000):
		env.render()
		state, _, done, _ = env.step(agent.act(state, explore=False))
		if done:
			break

	env.close()
Example #4
def advise():
    n1 = float(request.form['n1'])
    n2 = float(request.form['n2'])
    n3 = float(request.form['n3'])
    cash = float(request.form['cash'])
    print(n1)
    print(cash)

    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)
    agent.load("202005011635-dqn.h5")

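    # reset the env, then overwrite the state with the user's holdings (n1-n3) and cash before scaling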
    state = env.reset()
    state[0] = n1
    state[1] = n2
    state[2] = n3
    state[-1] = cash
    state = scaler.transform([state])

    action = agent.act(state)
    # action_combo = list(map(list, itertools.product([0, 1, 2], repeat=3)))
    action_vec = action_combo[action]
    # action_map = {0: "sell", 1: "hold", 2: "buy"}

    # print(action_map[action_vec[0]], action_map[action_vec[1]], action_map[action_vec[2]])

    ans = []
    for held, act in zip((n1, n2, n3), action_vec):
        # can't sell a stock we don't hold -> downgrade to hold
        tmp = 1 if act == 0 and held == 0 else act
        # can't buy with no cash -> downgrade to hold
        if cash == 0 and tmp == 2:
            tmp = 1
        ans.append(action_map[tmp])

    print(ans)
    return render_template('index.html',
                           ans=ans,
                           n1=n1,
                           n2=n2,
                           n3=n3,
                           cash=cash)
Example #5
def main():

    env = gym.make('carla-v0')
    state_size = env.image_size_net_chans
    action_size = len(env.action_space)
    agent = DQNAgent(state_size, action_size)

    done = False
    batch_size = 10

    try:

        for episode in range(EPISODES):
            state = env.reset(render=True)
            score = 0.0
            for time in range(10000):
                env.render()
                action = agent.act(state)
                next_state, reward, done = env.step(action)

                if done:
                    reward = -15
                else:
                    if abs(reward) < 0.5:
                        continue

                score += reward
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    agent.update_target_model()
                    print('episode: {}/{}, score: {:.5}, e: {}'.format(
                        episode, EPISODES, score, agent.epsilon))
                    break
                if len(agent.memory) > batch_size:
                    agent.replay(batch_size)
            if episode % 10 == 0:
                agent.save(os.path.join('..', 'models', 'carla-ddqn.h5'))

    finally:

        env.world.destroy()
Example #6
def main():
    # enable GPU memory growth
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # model
    model_name = input("Model name -> ")
    model_file = input("Model file -> ")
    my_model = "models/{}/{}.h5".format(model_name, model_file)

    epsilon = float(input("Epsilon -> "))
    episode_count = int(input("Episode count -> "))

    print("Loading", my_model, "with epsilon", epsilon)
    agent = DQNAgent(my_model, float(epsilon))

    # information
    resizeScale = (40, 30)
    frame_n = 3
    max_cte = 4.35

    # statistics
    score = []
    rewards = []
    highest_score = 0
    highest_reward = 0
    max_score = None

    # velocity
    max_velocity = 10.0
    max_acceleration = 0.75

    # steering
    max_steering = 0.75
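    # discretise steering into agent.action_space evenly spaced values in [-max_steering, max_steering]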
    steering_step = 2 * max_steering / (agent.action_space - 1)
    steering_table = [
        i * steering_step - max_steering for i in range(agent.action_space)
    ]

    # setup donkey environment
    conf = {
        # "exe_path":"remote",
        "exe_path": "D:/sdsandbox/build2/donkey_sim.exe",
        "host": "127.0.0.1",
        "port": 9094,
        "body_style": "donkey",
        "body_rgb": (128, 128, 128),
        "car_name": "rl",
        "font_size": 100
    }

    # env = gym.make("donkey-generated-roads-v0", conf=conf)
    env = gym.make("donkey-generated-track-v0", conf=conf)
    env.viewer.handler.max_cte = max_cte
    cv2.namedWindow("camera")

    start = time.time()
    first_start = start

    for e in range(episode_count):
        # at each episode, reset environment to starting position
        state = env.reset()
        states = np.empty((frame_n, resizeScale[1], resizeScale[0], 3))
        states[0] = preprocessImage(state, resizeScale)
        need_frames = frame_n - 1

        done = False
        score.append(0)
        rewards.append(0.0)
        last_velocity = [0.0]
        laps = 0
        start = time.time()

        while not done and (score[-1] < max_score if max_score else True):
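            # warm up the frame stack with low-throttle, random-steering steps until frame_n frames are collected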
            if need_frames > 0:
                next_state, reward, done, info = env.step([
                    steering_table[random.randint(0, agent.action_space - 1)],
                    0.15
                ])

                states[frame_n - need_frames] = preprocessImage(
                    next_state, resizeScale)
                need_frames -= 1

                last_velocity.append(info["speed"])
                continue

            # select action, observe environment, calculate reward
            action, Q = agent.act(np.asarray([states]))
            steering = steering_table[action]
            throttle = calculateThrottle(last_velocity[-1], max_velocity,
                                         max_acceleration)

            next_state, reward, done, info = env.step([steering, throttle])

            img = cv2.resize(next_state, (320, 240),
                             interpolation=cv2.INTER_AREA)
            cv2.imshow("camera", img)

            last_velocity.append(round(info["speed"], 4))
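            # reward shaping: -1 for leaving the track, otherwise higher reward the closer the car is to the lane centre (cte = cross-track error)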
            if abs(info["cte"]) >= max_cte:
                done = True
                reward = -1.0

            # for track
            else:
                reward = (1.0 - (abs(info["cte"]) / max_cte))

            # for roads
            # if not done:
            # reward = (1.0 - (abs(info["cte"]) / max_cte));

            if info["lap_finished"]:
                laps += 1

            score[-1] += 1
            rewards[-1] += reward

            # for roads
            # if self.score[-1] > 1500:
            # laps = max_laps

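            # shift the frame stack left and append the newest preprocessed frame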
            next_states = np.roll(states, -1, axis=0)
            next_states[-1] = preprocessImage(next_state, resizeScale)
            states = next_states

            cv2.waitKey(1)

        env.step([0.0, -0.03])

        if len(score) > 20: score = score[-20:]
        if len(rewards) > 20: rewards = rewards[-20:]

        if score[-1] >= highest_score:
            highest_score = score[-1]

        if rewards[-1] >= highest_reward:
            highest_reward = rewards[-1]

        print(
            "episode: {}/{}, score: {}, reward: {}, laps: {}, e: {:.2}".format(
                e + 1, episode_count, score[-1], round(rewards[-1], 2), laps,
                round(agent.epsilon, 2)))

        if (e + 1) % 5 == 0:
            print("Took", round((time.time() - start) / 60, 2), "minutes\n")
            start = time.time()

    print("Showcase time:", round((time.time() - first_start) / 60, 2),
          "minutes")
Example #7
  portfolio_value = []

  if args.mode == 'test':
    # remake the env with test data
    env = TradingEnv(test_data, args.initial_invest)
    # load trained weights
    agent.load(args.weights)
    # when testing, the timestamp is the same as when the weights were trained
    timestamp = re.findall(r'\d{12}', args.weights)[0]

  for e in range(args.episode):
    state = env.reset()
    state = scaler.transform([state])
    for time in range(env.n_step):
      action = agent.act(state)
      next_state, reward, done, info = env.step(action)
      next_state = scaler.transform([next_state])
      if args.mode == 'train':
        agent.remember(state, action, reward, next_state, done)
      state = next_state
      if done:
        print("episode: {}/{}, episode end value: {}".format(
          e + 1, args.episode, info['cur_val']))
        portfolio_value.append(info['cur_val']) # append episode end portfolio value
        break
      if args.mode == 'train' and len(agent.memory) > args.batch_size:
        agent.replay(args.batch_size)
    if args.mode == 'train' and (e + 1) % 10 == 0:  # checkpoint weights
      agent.save('weights/{}-dqn.h5'.format(timestamp))
Example #8
    # Reset the environment
    obs = env.reset()
    obs = obs[0]
    env_renderer.reset()

    # Run an episode (until successful or the max number of steps is reached)
    for step in range(max_steps):
        # Normalize the observations
        # norm_obs = normalize_observation(obs[0], tree_depth=tree_depth)

        # Agent performs an action
        for _idx in range(n_agents):
            if obs[_idx] is not None:
                norm_obs = normalize_observation(obs[_idx], tree_depth=tree_depth)
                action = agent.act(state=norm_obs, eps=eps)
                action_dict.update({_idx: action})

        # Environment executes action and returns
        #     1. next observations for all agents
        #     2. corresponding rewards for all agents
        #     3. status if the agents are done
        #     4. information about actions, malfunction, speed and status
        next_obs, all_rewards, done, info = env.step(action_dict)
        for _idx in range(n_agents):
            if not done[_idx]:
                next_norm_obs = normalize_observation(next_obs[_idx], tree_depth=tree_depth)
                agent.remember((norm_obs, action_dict[_idx], all_rewards[_idx], next_norm_obs, done[_idx]))

        # Render the environment -> show me what you got!
        env_renderer.render_env(show=True, show_observations=True)
Example #9
    for e in range(EPISODES):
        episode_number = e + 1

        # reset state at the beginning of each game
        state = env.reset()
        state = np.reshape(state, [1, 4])

        # time_t represents each frame of the game
        # Our goal is to keep the pole upright as long as possible, up to a score of 500
        # the more time_t, the higher the score
        for time_t in range(500):
            # turn this on if you want to render
            # env.render()

            # Decide action
            action = agent.act(state)

            # Advance the game to the next frame based on the action.
            # Reward is 1 for every frame the pole survived
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, 4])

            # Remember the previous state, action, reward, and done
            agent.remember(state, action, reward, next_state, done)

            # make next_state the new current state for the next frame.
            state = next_state

            agent.replay(32)

            # done becomes True when the game ends
Example #10
def main():
    # enable GPU memory growth
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # model & training information
    model_name = input("Model name -> ")
    load_trained = input("Load trained (y/n)? ").lower() == "y"
    epsilon = float(input("Epsilon -> "))
    episode_count = int(input("Episode count -> "))

    model_location = "models/" + model_name + "/"
    model_path = model_location + ("model_trained.h5"
                                   if load_trained else "model.h5")

    print("Loading", model_path, "with epsilon", epsilon)
    agent = DQNAgent(model_path, epsilon)

    try:
        # json.load expects a file object, not a path
        with open(model_location + "data.json") as f:
            agent.memory = json.load(f)
    except (OSError, json.JSONDecodeError):
        agent.memory = []

    # training information
    resizeScale = (40, 30)
    batch_size = 12
    frame_n = 3
    max_cte = 4.35
    # max_cte = 3.5;

    # statistics
    score = []
    rewards = []
    highest_score = 0
    highest_reward = 0
    max_score = None

    # velocity
    max_velocity = 10.0
    max_acceleration = 0.75

    # steering
    max_steering = 0.75
    steering_step = 2 * max_steering / (agent.action_space - 1)
    steering_table = [
        i * steering_step - max_steering for i in range(agent.action_space)
    ]

    file = open("log.csv", "w+", newline="")
    log = writer(file)
    log.writerow([
        'Episode', 'Timestep', 'Avg Steer', 'Min Reward', 'Avg Reward',
        'Max Reward', 'Episode Length', 'Reward Sum', 'Max Q steer',
        'Max Q throttle', 'Epsilon', 'Episode Time', 'Avg Speed', 'Max Speed',
        'Min CTE', 'Avg CTE', 'Max CTE', 'Distance', "Average Throttle",
        "Max Throttle", "Min Throttle", "Average Absolute CTE",
        "Min Absolute CTE", "Max Absolute CTE"
    ])

    # setup donkey environment
    conf = {
        # "exe_path":"remote",
        "exe_path": "D:/sdsandbox/build2/donkey_sim.exe",
        "host": "127.0.0.1",
        "port": 9091,
        "body_style": "donkey",
        "body_rgb": (128, 128, 128),
        "car_name": "rl",
        "font_size": 100
    }

    # env = gym.make("donkey-generated-roads-v0", conf=conf)
    env = gym.make("donkey-generated-track-v0", conf=conf)
    env.viewer.handler.max_cte = max_cte
    cv2.namedWindow("camera")

    first_train = True
    first_start = time.time()
    timestep = 0
    success_episodes = 0
    max_laps = 5

    for e in range(episode_count):
        # at each episode, reset the environment
        state = env.reset()
        states = np.empty((frame_n, resizeScale[1], resizeScale[0], 3))
        states[0] = preprocessImage(state, resizeScale)
        need_frames = frame_n - 1

        done = False
        score.append(0)
        rewards.append(0.0)
        last_velocity = [0.0]
        laps = 0
        start = time.time()

        # logging
        steers = []
        throttles = []
        rewards_ = []
        velocities = []
        ctes = []
        ctes_absolute = []
        max_q_steer = 0.0
        distance = 0.0
        distance_time = start

        while not done and (score[-1] < max_score if max_score else True):
            if need_frames > 0:
                next_state, reward, done, info = env.step([
                    steering_table[random.randint(0, agent.action_space - 1)],
                    0.15
                ])

                states[frame_n - need_frames] = preprocessImage(
                    next_state, resizeScale)
                need_frames -= 1

                last_velocity.append(info["speed"])
                continue

            # select action, observe environment, calculate reward
            action, Q = agent.act(np.asarray([states]))
            steering = steering_table[action]
            throttle = calculateThrottle(last_velocity[-1], max_velocity,
                                         max_acceleration)

            next_state, reward, done, info = env.step([steering, throttle])
            last_velocity.append(round(info["speed"], 4))

            img = cv2.resize(next_state, (320, 240),
                             interpolation=cv2.INTER_AREA)
            cv2.imshow("camera", img)

            reward = 0.0 if not done else -1.0
            if abs(info["cte"]) >= max_cte:
                done = True
                reward = -1.0

            if not done:
                reward = (1.0 - (abs(info["cte"]) / max_cte))

            # for track
            if info["lap_finished"]:
                laps += 1
                if laps == max_laps:
                    done = True

            timestep += 1
            score[-1] += 1
            rewards[-1] += reward

            next_states = np.roll(states, -1, axis=0)
            next_states[-1] = preprocessImage(next_state, resizeScale)

            # save experience and update current state
            agent.remember([states], action, reward, [next_states], done)
            states = next_states

            if not first_train:
                agent.replay(batch_size)

            # logging
            steers.append(steering)
            throttles.append(throttle)
            rewards_.append(reward)
            velocities.append(last_velocity[-1])
            ctes.append(info["cte"])
            ctes_absolute.append(abs(info["cte"]))
            distance += last_velocity[-1] * (time.time() - distance_time)
            distance_time = time.time()

            if Q is not None and (max_q_steer is None or Q > max_q_steer):
                max_q_steer = Q

            cv2.waitKey(1)

        # for roads
        # if distance > 1900:
        # laps = max_laps

        # logging
        if score[-1] > 0:
            log.writerow([
                e, timestep,
                round(mean(steers), 2),
                round(min(rewards_), 2),
                round(mean(rewards_), 2),
                round(max(rewards_), 2), score[-1],
                round(rewards[-1], 2),
                round(max_q_steer, 2), 0, agent.epsilon,
                round(time.time() - start, 2),
                round(mean(velocities), 2),
                round(max(velocities), 2),
                round(min(ctes), 2),
                round(mean(ctes), 2),
                round(max(ctes), 2),
                round(distance, 2),
                round(mean(throttles), 2),
                round(max(throttles), 2),
                round(min(throttles), 2),
                round(mean(ctes_absolute), 2),
                round(min(ctes_absolute), 2),
                round(max(ctes_absolute), 2)
            ])
        else:  # sometimes, something goes really wrong... don't count this episode
            e -= 1  # note: decrementing the loop variable has no effect on a Python for loop

        file.flush()

        # fix for persisting throttle bug
        env.step([0.0, -0.03])

        if len(agent.memory) > batch_size * 4 and first_train:
            agent.replay(batch_size)
            agent.act(np.asarray([states]))
            first_train = False

        if len(score) > 20: score = score[-20:]
        if len(rewards) > 20: rewards = rewards[-20:]

        if score[-1] >= highest_score:
            highest_score = score[-1]

        if rewards[-1] >= highest_reward:
            highest_reward = rewards[-1]
            agent.save()

        print(
            "episode: {}/{}, steps: {}, reward: {}, highest reward: {}, average: {}, laps: {}, e: {:.2}, memory: {}, replays: {}"
            .format(e + 1, episode_count, score[-1], round(rewards[-1], 2),
                    round(highest_reward, 2), round(mean(rewards), 2), laps,
                    agent.epsilon, len(agent.memory), agent.replays))

        if (e + 1) % 5 == 0:
            print("Took", round((time.time() - start) / 60, 2), "minutes\n")
            start = time.time()
            agent.merge_models()

        if laps == max_laps:
            success_episodes += 1
        else:
            success_episodes = 0

        if success_episodes == 5:
            print("Training successfull! Time: {} minutes.".format(
                round((time.time() - first_start) / 60.0, 2)))
            agent.save("end.h5")
            file.close()
            break

    agent.save()
    print("Total training time:", round((time.time() - first_start) / 60, 2),
          "minutes")
Example #11
env = DrivingEnv(client)
agent = DQNAgent()

loss = -1
epsilon = 1
epsilon_decay = 0.995
epsilon_min = 0.01

for episode in range(10000000):
    view = env.reset()
    for iteration in range(1000000):
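        # epsilon-greedy: random action with probability epsilon, otherwise the agent's learned policy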
        if random.random() < epsilon:
            action = random.choice(range(3))
        else:
            action = agent.act(view)
        control = carla.VehicleControl(throttle=1, steer=[-1, 0, 1][action])
        next_view, reward, done = env.step(control)
        if iteration > 30:
            loss = agent.memorize(view, action, next_view, reward, done)
        view = next_view
        cv2.imshow('rgb', view[:, :, 0:3])
        cv2.imshow('depth', view[:, :, 3])
        cv2.imshow('seg', view[:, :, 4:7])
        cv2.waitKey(1)

        if done:
            break

    epsilon = epsilon * epsilon_decay
    epsilon = max(epsilon, epsilon_min)
Example #12
def main():
    trial_len = 1030

    env = Environment(100000, 1, trial_len, stock1, stock2)
    trials = 100

    action_info = {
        's1_buys_per_trial': [],
        's1_sells_per_trial': [],
        's2_buys_per_trial': [],
        's2_sells_per_trial': [],
        'holds_per_trial': [],
        'illegal_action_trial': [],
        'profits_per_trial': [],
        'ranges_per_trial': [],
        'good_profits_and_range': []
    }

    dqn_agent = DQNAgent(env, stock1.name, stock2.name)
    menu_option = input(
        "Press 1 to load a model from a filepath. Press any other key to start a new model "
    )
    if menu_option == "1":
        dqn_agent.load_model()
    steps = []
    for trial in range(trials):
        print('Trial ', trial)
        cur_state = env.state
        step_count = 0
        start_funds = env.get_funds()
        action = ''

        stock1_buys = 0
        stock1_sells = 0
        stock2_buys = 0
        stock2_sells = 0
        holds = 0
        illegal_action = False
        returns = []

        for step in range(trial_len):
            action_num = dqn_agent.act(cur_state)
            action, stock = None, None

            # Get action from Deep Q Net output
            if action_num == 0:
                action, stock = 'BUY', stock1.name
                stock1_buys += 1
            elif action_num == 1:
                action, stock = 'SELL', stock1.name
                stock1_sells += 1
            elif action_num == 2:
                action, stock = 'BUY', stock2.name
                stock2_buys += 1
            elif action_num == 3:
                action, stock = 'SELL', stock2.name
                stock2_sells += 1
            elif action_num == 4:
                action, stock = 'HOLD', ''
                holds += 1
            else:
                action, stock = None, None

            prev_funds = env.get_funds()
            print('Step {}:'.format(step))
            print('  Action: ', action)
            print('  Stock:  ', stock)
            new_state, reward, illegal_action = env.step(action, stock, 1)
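            # heavily penalise illegal actions so the agent learns to avoid them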
            reward = reward if not illegal_action else -10000
            new_funds = env.get_funds()
            returns.append(new_funds - prev_funds)
            print('  Reward: ', reward)
            dqn_agent.remember(cur_state, action_num, reward, new_state,
                               illegal_action)

            dqn_agent.replay()
            dqn_agent.target_train()
            cur_state = new_state
            step_count += 1
            if illegal_action:
                print('Illegal action taken, starting new trial')
                break

        profit = start_funds - env.get_funds()  # note: computed as starting funds minus ending funds
        df_range = (env.init_day_index, env.init_day_index + trial_len)
        print('Profit: ', start_funds - env.get_funds())

        if profit >= 5000.00:
            action_info['good_profits_and_range'].append((df_range, returns))
            print(action_info['good_profits_and_range'])

        action_info['profits_per_trial'].append(profit)

        action_info['s1_buys_per_trial'].append(stock1_buys)
        action_info['s1_sells_per_trial'].append(stock1_sells)
        action_info['s2_buys_per_trial'].append(stock2_buys)
        action_info['s2_sells_per_trial'].append(stock2_sells)
        action_info['holds_per_trial'].append(holds)
        action_info['illegal_action_trial'].append(illegal_action)
        action_info['ranges_per_trial'].append(
            (env.init_day_index, env.init_day_index + trial_len))

        n = random.randint(0, len(stock1) - trial_len)
        env = Environment(100000, 1, trial_len, stock1, stock2)

    print(
        "Average Profit: ",
        sum(action_info['profits_per_trial']) /
        len(action_info['profits_per_trial']))
    data_file_name = input(
        'Please type the name of the file you would like to save the action info to: '
    )
    menu_option2 = input(
        "Press 0 to quit, press 1 to save the model to a location ")
    if menu_option2 == "1":
        fp = input("Enter the filepath to save this model to ")
        dqn_agent.custom_save_model(fp)

    action_info_df = pd.DataFrame(action_info)
    action_info_df.to_csv(data_file_name)
Example #13
player._client.moveByVelocityAsync(0, 0, 0, 5).join()
time.sleep(2)  # give AirSim time to set up
num_target_achieved = 0
num_actions_in_episode = 0
episode_next_ratio = 0.3

source, target = player.initAnEpisode()
# see [https://github.com/microsoft/AirSim/blob/master/docs/image_apis.md]
responses = player._client.simGetImages(
    [airsim.ImageRequest(3, airsim.ImageType.DepthPerspective, True, False)])
current_state = transform_input(responses)
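# add a batch dimension so the state matches the network's expected input shape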
current_state = np.expand_dims(current_state, axis=0)
print("Starting Training...")

while True:
    action = agent.act(current_state)
    print(f"action : {action}")
    num_actions_in_episode += 1
    quad_offset = player.interpretAction(action, scaling_factor=0.25)
    quad_vel = player._client.getMultirotorState(
    ).kinematics_estimated.linear_velocity
    # player._client.moveByVelocityAsync(quad_vel.x_val+quad_offset[0], quad_vel.y_val+quad_offset[1],
    #     quad_vel.z_val+quad_offset[2], 5).join()
    # for the initial phase, keep the velocity in z at 0
    player._client.moveByVelocityAsync(quad_vel.x_val + quad_offset[0],
                                       quad_vel.y_val + quad_offset[1], 0,
                                       5).join()

    time.sleep(0.5)

    quad_state = player._client.getMultirotorState(
Example #14
def train(
        training_episodes: int,
        env_name: str,
        agent_kwargs: dict,
        log_every: int = 500,
        render: bool = True
):

    env = gym.make(env_name)

    agent_kwargs['observation_space_dim'] = env.observation_space.shape[0]
    agent_kwargs['n_actions'] = env.action_space.n

    agent = DQNAgent(**agent_kwargs)

    print('Agents initialised. Training...')

    episode_rewards = []
    start_time = time.time()

    for episode in range(1, training_episodes + 1):

        obs = env.reset()
        done = False
        agent_losses = 0
        ep_reward = 0
        ep_step = 0

        while not done:
            action = agent.act(np.array(obs))

            next_obs, reward, done, info = env.step(action)
            if render and not episode % log_every:
                env.render()

            ep_reward += reward

            loss = agent.step(
                state=np.array(obs),
                action=action,
                reward=reward,
                next_state=np.array(next_obs),
                done=done
            )

            agent_losses += loss if loss else 0

            obs = next_obs
            ep_step += 1

        episode_rewards.append(ep_reward)

        TB_WRITER.add_scalar('Loss', agent_losses, episode)
        TB_WRITER.add_scalar('Episode reward', ep_reward, episode)
        TB_WRITER.add_scalar('Epsilon', agent.epsilon, episode)

        if not episode % log_every:
            current_time = time.time()

            if render:
                time.sleep(0.2)  # pause to see final state

            print(f'Ep: {episode} / '
                  f'(Last {log_every:,.0f}) Mean: {np.mean(episode_rewards[-log_every:]):.1f} / '
                  f'Min: {np.min(episode_rewards[-log_every:]):.1f} / '
                  f'Max: {np.max(episode_rewards[-log_every:]):.1f} / '
                  f'EPS: {episode / (current_time - start_time):.1f} / '
                  f'Agent epsilon: {agent.epsilon:.2f}'
                  )

    print('Done training!\n')
    env.close()

    return agent, episode_rewards
Example #15
def DqnProgram(args, setResult, training_result):
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--episode',
                        type=int,
                        default=2000,
                        help='number of episode to run')
    parser.add_argument('-b',
                        '--batch_size',
                        type=int,
                        default=32,
                        help='batch size for experience replay')
    parser.add_argument('-i',
                        '--initial_invest',
                        type=int,
                        default=20000,
                        help='initial investment amount')
    parser.add_argument('-m',
                        '--mode',
                        type=str,
                        required=True,
                        help='either "train" or "test"')
    parser.add_argument('-w',
                        '--weights',
                        type=str,
                        help='a trained model weights')
    args = parser.parse_args(args)

    maybe_make_dir('weights')
    maybe_make_dir('portfolio_val')

    import time
    timestamp = time.strftime('%Y%m%d%H%M')
    data = get_data(mode=args.mode)  # TODO: hook this up to the stock symbol selected in the UI.
    data = np.array([c['종가'] for c in data])  # '종가' = closing price

    env = TradingEnv(data, args.initial_invest)
    state_size = env.observation_space.shape
    action_size = env.action_space.shape
    agent = DQNAgent(state_size, action_size)
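    # scaler normalises raw environment states before they are fed to the agent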
    scaler = get_scaler(env)

    portfolio_value = []

    if args.weights is not None:
        agent.load(args.weights)
        timestamp = re.findall(r'\d{12}', args.weights)[0]

    for e in range(args.episode):
        state = env.reset()
        state = scaler.transform([state])
        for time in range(env.n_step):
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = scaler.transform([next_state])
            if args.mode == 'train':
                agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                msg = "episode: {}/{}, episode end value: {}".format(
                    e + 1, args.episode, info['cur_val'])
                print(msg)
                setResult(msg=msg)
                training_result.append(info['cur_val'])
                portfolio_value.append(
                    info['cur_val'])  # append episode end portfolio value
                break
            if args.mode == 'train' and len(agent.memory) > args.batch_size:
                agent.replay(args.batch_size)
        if args.mode == 'train' and (e + 1) % 10 == 0:  # checkpoint weights
            agent.save('weights/{}-dqn.h5'.format(timestamp))

    # save portfolio value history to disk
    with open('portfolio_val/{}-{}.p'.format(timestamp, args.mode),
              'wb') as fp:
        pickle.dump(portfolio_value, fp)
Example #16
# Let's explore the environment with random actions
#run_gym(env)

from agent import DQNAgent

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Instantiate agent
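# use_noise=True presumably enables NoisyNet exploration (learned parameter noise) in place of plain epsilon-greedy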
agent = DQNAgent(
    state_size=state_size,
    action_size=action_size,
    #                 use_double=True,
    #                 use_dueling=True,
    #                 use_priority=True,
    use_noise=True,
    seed=42)

agent.summary()

# Let's watch an untrained agent
#run_gym(env, get_action=lambda state: agent.act(state))

scores = train_agent(agent, env)

plot_scores(scores, 'NoisyNets Deep Q-Network', polyfit_deg=6)

#agent.load_weights('prioritized_local_weights.pth')

run_gym(env, get_action=lambda state: agent.act(state), max_t=1000)