Example #1
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

# DQN agent with 3 discrete actions and a 2-dimensional observation
# (position, velocity), i.e. a MountainCar-style control task
RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, gamma=0.9)

total_steps = 0
for i_episode in range(10):

    observation = env.reset()
    ep_r = 0
    while True:
        env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        position, velocity = observation_

        # the higher the better
        # reward = abs(position - (-0.3))     # r in [0, 1]

        RL.save_experience(observation, action, reward, observation_)

        if total_steps > 1000:
            RL.learn()

        ep_r += reward
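        # NOTE: the listing is cropped here. A typical closing for this loop
        # (an assumption, not part of the original excerpt) ends the episode
        # on `done`, carries the new observation forward, and counts the step:
        if done:
            print('episode:', i_episode, 'episode reward:', round(ep_r, 2))
            break

        observation = observation_
        total_steps += 1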
Example #2
def runner(node_num):
    # Checkpoint paths for loading/saving weights (passing them to the DQN
    # is left commented out below)
    load_path = "weights/weights.ckpt"
    save_path = "weights/weights.ckpt"

    # set seed
    seed = 42
    np.random.seed(seed)
    random.seed(seed)

    # Generate graph for training...
    resources = 1
    # G, reward_save, num_nodes = generate_graph(nodes=node_num, type='gnp_adversarial')
    # G, reward_save, num_nodes = generate_graph(load_dir='../gml/ibm.gml', type='gml')
    G, reward_save, num_nodes = generate_graph(nodes=node_num,
                                               type='random_graph',
                                               seed=42)

    # Pick an arbitrary node to be the root
    root = 0
    # Try plotting. When running over ssh without a display this fails,
    # since plotting a networkx graph needs plt.draw() calls that require
    # a display backend.
    try:
        plot_graph(G, root, 'rl_graph.png')
    except Exception:
        print('No display')

    # We may want to include the graph laplacian in the observation space
    # Graph laplacian is D - A
    # laplacian_matrix = nx.laplacian_matrix(G).toarray()
    # flat_laplacian = laplacian_matrix.flatten()

    # Build the learning environment
    env = environment(G, [root], resources)
    print('num_edges:', G.number_of_edges())
    print("Ratio Heuristic", ratio_heuristic(G, [root], resources), '\n')

    # Size of the network output: one Q-value per action permutation
    n_y = len(env.actions_permutations)

    # Initialize DQN
    DQN = DeepQNetwork(
        n_y=n_y,
        n_x=num_nodes,
        resources=resources,
        env=env,
        learning_rate=0.01,
        replace_target_iter=20,
        memory_size=20000,
        batch_size=256,
        reward_decay=0.6,
        epsilon_min=0.1,
        epsilon_greedy_decrement=5e-5,
        # load_path=load_path,
        # save_path=save_path,
        # laplacian=flat_laplacian,
        inner_act_func='leaky_relu',
        output_act_func='leaky_relu')

    episodes = 600
    rewards = []
    total_steps_counter = 0
    episodes_since_max = 0

    optimal_action_sequences = []
    overall_start = time.time()
    # DQN.epsilon = 0.5

    for episode in range(episodes):

        observation, done = env.reset()
        episode_reward = 0
        action_sequence = []
        start = time.time()
        train_time = 0

        while not done:
            # 1. Choose an action based on observation
            action = DQN.choose_action(observation)

            # choose_action returns -1 to signal an exploratory step; in that
            # case pick between a truly random action and a ratio-heuristic action
            if action == -1:
                # action = env.random_action()
                r = random.random()
                if r < 0.6:
                    action = env.random_action()
                else:
                    action = env.ratio_action()

            # save the taken action
            action_sequence.append(action)

            # print('Chosen action', action)
            # 2. Take the chosen action in the environment
            observation_, reward, done = env.step(action, neg=False)
            # print(observation_, reward, done)
            # 3. Store transition
            DQN.store_transition(observation, action, reward, observation_)

            episode_reward += reward

            if total_steps_counter > 2000:
                # 4. Train
                s = time.time()
                DQN.learn()
                e = time.time()
                train_time += (e - s)

            if done:
                rewards.append(episode_reward)
                max_reward_so_far = np.amax(rewards)

                # if maximum reward so far, save the action sequence
                if episode_reward == max_reward_so_far:
                    optimal_action_sequences.append(
                        (action_sequence, episode_reward))
                    episodes_since_max = 0
                    # DQN.epsilon = 1

                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", round(episode_reward, 2))
                print("Epsilon: ", round(DQN.epsilon, 2))
                print("Max reward so far: ", max_reward_so_far)

                end = time.time()
                print('Episode time:', end - start)
                start = time.time()
                break

            # Save observation
            observation = observation_

            # Increase total steps
            total_steps_counter += 1

            # if episode == 700:
            #     DQN.epsilon_min = .1
            #     DQN.epsilon = 0.5

        episodes_since_max += 1
        print('train time across episode', train_time)

    overall_end = time.time()

    # Evaluate the learned greedy policy: force epsilon to 0 so no random
    # actions are taken
    DQN.epsilon = 0
    DQN.epsilon_min = 0
    observation, done = env.reset()
    final_reward = 0
    action_sequence = []
    while not done:
        action = DQN.choose_action(observation)
        action_sequence.append(action)
        observation_, reward, done = env.step(action, neg=False)

        final_reward += reward
        if done:
            rewards.append(final_reward)
            max_reward_so_far = np.amax(rewards)

            # if maximum reward so far, save the action sequence
            if final_reward == max_reward_so_far:
                optimal_action_sequences.append(
                    (action_sequence, final_reward))
                episodes_since_max = 0
            break

        # Save observation
        observation = observation_

    print('final epsilon=0 reward', final_reward, '\n')

    # TESTING
    # Replay our 'best' action sequence (env.step prints each action's vector
    # representation when debug=True) and check its reward for correctness
    opt, reward = optimal_action_sequences[-1]

    print()
    # print('RL action sequence:')
    env.reset()
    true_r = 0
    for action in opt:
        # print('action index', action)
        # debug will print the action at each step as a vector
        _, r, _ = env.step(action, debug=True)
        true_r += r
    print('Replayed reward of best action sequence:', true_r)

    results = []
    # if we have a reasonable number of nodes (< 24), we can compute optimal using DP
    if num_nodes < 24:
        dp_time = time.time()
        results.append(DP_optimal(G, [root], resources))
        print('DP Opt: ', results[0])
        dp_time_end = time.time()
        results.append(dp_time_end - dp_time)
        print('DP time: ', results[1])
    else:
        results.append('n/a')
        results.append('n/a')

    print('\n Random Heuristic', random_heuristic(G, [root], resources), '\n')
    results.append(random_heuristic(G, [root], resources))

    # Only works on trees
    # print('\n Tree Heuristic:', simulate_tree_recovery(G, resources, root, clean=False), '\n')

    ratio_time_start = time.time()
    print('\n Ratio Heuristic', ratio_heuristic(G, [root], resources))
    ratio_time_end = time.time()
    print('Ratio time:', ratio_time_end - ratio_time_start)
    results.append(ratio_heuristic(G, [root], resources))
    results.append(ratio_time_end - ratio_time_start)

    print('\n reward during training:', reward)
    results.append(reward)
    print('RL method time (s): ', overall_end - overall_start, '\n')
    results.append(overall_end - overall_start)

    plot_bar_x(rewards, 'episode', 'reward_graph.png')
    with open(reward_save, 'w') as f:
        for item in rewards:
            f.write('%s\n' % item)

    return results
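
# A minimal usage sketch (an assumption, not part of the original example):
# train on a small random graph, small enough that the DP_optimal branch
# inside runner() also runs for comparison (it requires num_nodes < 24).
if __name__ == '__main__':
    print(runner(node_num=14))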
Example #3
RENDER_ENV = False
EPISODES = 500
rewards = []
RENDER_REWARD_MIN = 0
total_steps_counter = 0

for episode in range(400):

    observation = env.reset()
    episode_reward = 0

    while True:
        if RENDER_ENV: env.render()

        # 1. Choose an action based on observation
        action = DQN.choose_action(observation)

        # 2. Take the chosen action in the environment
        observation_, reward, done, info = env.step(action)

        # 3. Store transition
        DQN.store_transition(observation, action, reward, observation_)

        episode_reward += reward

        if total_steps_counter > 1000:
            # 4. Train
            DQN.learn()

        if done:
            rewards.append(episode_reward)
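            # NOTE: the listing is cropped here. An assumed, typical closing
            # for this loop prints progress, enables rendering once the best
            # reward exceeds RENDER_REWARD_MIN, and breaks out of the episode:
            max_reward_so_far = max(rewards)
            print("Episode:", episode,
                  "Reward:", round(episode_reward, 2),
                  "Max reward so far:", max_reward_so_far)
            if max_reward_so_far > RENDER_REWARD_MIN:
                RENDER_ENV = True
            break

        # Carry the new observation into the next step and count it
        observation = observation_
        total_steps_counter += 1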