Example 1
def main(desired_iterations, save_path):
    # Define a log file to use with tensorboard
    # Note that we don't currently make use of tensorboard at all
    LOG_DIR = tempfile.mkdtemp()
    print "Tensorboard Log: " + LOG_DIR + '\n'

    # The directory to save the animations to
    SAVE_DIR = save_path

    # Define the simulation
    sim = Planning(get_noodle_environment())

    # Tensorflow!
    tf.reset_default_graph()
    session = tf.InteractiveSession()
    journalist = tf.train.SummaryWriter(LOG_DIR)
    brain = MLP([sim.observation_size],
                [200, 200, sim.num_actions],
                [tf.tanh, tf.tanh, tf.identity])
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(sim.observation_size,
                                       sim.num_actions,
                                       brain,
                                       optimizer,
                                       session,
                                       random_action_probability=0.2,
                                       discount_rate=0.9,
                                       exploration_period=1000,
                                       max_experience=10000,
                                       store_every_nth=1,
                                       train_every_nth=1,
                                       summary_writer=journalist)

    # Initialize the session
    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # journalist.add_graph(session.graph)

    # Run the simulation and let the robot learn
    num_simulations = 0

    iterations_needed = []
    total_rewards = []

    try:
        for game_idx in range(desired_iterations + 1):
            current_random_prob = current_controller.random_action_probability
            update_random_prob = game_idx != 0 and game_idx % 200 == 0
            if update_random_prob and 0.01 < current_random_prob <= 0.1:
                current_controller.random_action_probability = current_random_prob - 0.01
            elif update_random_prob and 0.1 < current_random_prob:
                current_controller.random_action_probability = current_random_prob - 0.1
            game = Planning(get_noodle_environment())
            game_iterations = 0

            observation = game.observe()
            while not game.is_over():
                action = current_controller.action(observation)
                reward = game.collect_reward(action)
                new_observation = game.observe()
                current_controller.store(observation, action, reward,
                                         new_observation)
                current_controller.training_step()
                observation = new_observation
                game_iterations += 1
            total_rewards.append(sum(game.collected_rewards))
            iterations_needed.append(game_iterations)
            rewards = []
            if game_idx % 50 == 0:
                print "\rGame %d:\nIterations before end: %d." % (
                    game_idx, game_iterations)
                if game.collected_rewards[-1] == 10:
                    print "Hit target!"
                print "Total Rewards: %s\n" % (sum(game.collected_rewards))
                if SAVE_DIR is not None:
                    game.save_path(SAVE_DIR, game_idx)

    except KeyboardInterrupt:
        print "Interrupted"

    # Plot the iterations and reward
    plt.figure(figsize=(12, 8))
    plt.plot(total_rewards, label='Reward')
    # plt.plot(iterations_needed, label='Iterations')
    plt.legend()
    plt.show()
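
For context, a minimal entry point for calling main above might look like the following; the flag names and defaults are assumptions, not part of the original source.

# Hypothetical entry point for Example 1 (flag names are assumptions).
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--iterations', type=int, default=1000,
                        help='number of games to run')
    parser.add_argument('--save-path', default=None,
                        help='directory to save animations to (optional)')
    args = parser.parse_args()
    main(args.iterations, args.save_path)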
Example 2
performances = []

try:
    for game_idx in range(2000):
        game = DiscreteHill()
        game_iterations = 0

        observation = game.observe()

        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(observation)
            reward = game.collect_reward(action)
            game.perform_action(action)
            new_observation = game.observe()
            current_controller.store(observation, action, reward, new_observation)
            current_controller.training_step()
            observation = new_observation
            game_iterations += 1
        performance = float(game_iterations - (game.shortest_path)) / game.shortest_path
        performances.append(performance)
        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
except KeyboardInterrupt:
    print "Interrupted"


# In[11]:

N = 500
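
The performance value computed in Example 2 is the number of excess steps relative to the shortest possible path, expressed as a fraction of that shortest path: 0.0 is optimal, 1.0 means twice as many steps as needed. A quick sanity check with made-up numbers:

# Sanity check of the performance metric from Example 2 (values are illustrative).
shortest_path = 10
for game_iterations in (10, 15, 20):
    performance = float(game_iterations - shortest_path) / shortest_path
    print("%d iterations -> performance %.1f" % (game_iterations, performance))
# 10 -> 0.0, 15 -> 0.5, 20 -> 1.0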
Example 3
        game_iterations = 0

        observation = game.observe()
        
        prev_frames = [(observation, -1)] * (n_prev_frames - 1)
        memory = np.concatenate(
            [np.concatenate([observation, np.array([-1])])] *
            (n_prev_frames - 1) + [observation])
        
        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(memory)
            if n_prev_frames > 1:
                prev_frames = prev_frames[1:] + [(observation, action)]
            reward = game.collect_reward(action)
            game.perform_action(action)
            observation = game.observe()
            new_memory = np.concatenate(
                [np.concatenate([a, np.array([b])])
                 for (a, b) in prev_frames] + [observation])
            current_controller.store(memory, action, reward, new_memory)
            current_controller.training_step()
            memory = new_memory
            game_iterations += 1
            cost = abs(game.target[0]) + abs(game.target[1])
        performances.append((game_iterations - cost) / float(cost))
        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
except KeyboardInterrupt:
    print "Interrupted"


# In[327]:

N = 500
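
The memory vector built in Example 3 stacks the last n_prev_frames - 1 (observation, action) pairs in front of the current observation, so its length is (n_prev_frames - 1) * (observation_size + 1) + observation_size. A minimal, self-contained illustration with made-up sizes:

import numpy as np

# Illustration of the stacked-frame layout from Example 3 (sizes are made up).
observation_size = 4
n_prev_frames = 3

observation = np.zeros(observation_size)
prev_frames = [(observation, -1)] * (n_prev_frames - 1)
memory = np.concatenate(
    [np.concatenate([obs, np.array([act])]) for (obs, act) in prev_frames]
    + [observation])

# (n_prev_frames - 1) * (observation_size + 1) + observation_size == 14
assert memory.shape == ((n_prev_frames - 1) * (observation_size + 1)
                        + observation_size,)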
Example 4
        observation = game.observe()
        x0 = copy.deepcopy(observation)
        rewards = []
        cost0 = game.cost()
        path = [copy.deepcopy(observation)]
        while game_iterations < 100 and not game.is_over():
            action = current_controller.action(observation)
            game.perform_action(action)
            game.step(dt)
            cost1 = game.cost()

            reward = cost0 - cost1 - 2
            # reward = -reward
            rewards.append(reward)
            new_observation = game.observe()
            current_controller.store(observation, action, reward,
                                     new_observation)
            current_controller.training_step()

            observation = new_observation
            cost0 = cost1

            game_iterations += 1

            path.append(copy.deepcopy(observation))

        sio.savemat(
            '/home/fantaosha/Documents/tensorflow-deepq/results/quadrotor_path/quadrotor_'
            + str(game_idx) + '.mat', {'path': np.array(path)})
        performance = np.sum(rewards)
        performances.append(performance)
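
Example 4 saves each episode's state trajectory to a .mat file with scipy.io; a saved trajectory can be reloaded for inspection or plotting like this (the directory is the one hard-coded above, and game_idx is whichever episode you want):

import numpy as np
import scipy.io as sio

# Reload one of the trajectories written by Example 4.
game_idx = 0
data = sio.loadmat(
    '/home/fantaosha/Documents/tensorflow-deepq/results/quadrotor_path/quadrotor_'
    + str(game_idx) + '.mat')
path = np.array(data['path'])  # one row per simulation step
print(path.shape)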
Example 5
        prev_frames = [(observation, -1)] * (n_prev_frames - 1)
        memory = np.concatenate(
            [np.concatenate([observation, np.array([-1])])] *
            (n_prev_frames - 1) + [observation])

        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(memory)
            if n_prev_frames > 1:
                prev_frames = prev_frames[1:] + [(observation, action)]
            reward = game.collect_reward(action)
            game.perform_action(action)
            observation = game.observe()
            new_memory = np.concatenate(
                [np.concatenate([a, np.array([b])])
                 for (a, b) in prev_frames] + [observation])
            current_controller.store(memory, action, reward, new_memory)
            current_controller.training_step()
            memory = new_memory
            game_iterations += 1
            cost = abs(game.target[0]) + abs(game.target[1])
        performances.append((game_iterations - cost) / float(cost))
        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (
                game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
except KeyboardInterrupt:
    print "Interrupted"

# In[327]:

N = 500
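
The trailing N = 500 in several of these snippets is presumably the window used to smooth the per-game performances before plotting. A sketch of that smoothing, assuming the performances list built in the examples above; the actual downstream plotting code is not shown in the original.

import matplotlib.pyplot as plt

# Moving-average smoothing of the per-game performances (window size N).
# How N is actually used downstream is an assumption here.
N = 500
smooth_performances = [sum(performances[i:i + N]) / float(N)
                       for i in range(len(performances) - N)]
plt.figure(figsize=(12, 8))
plt.plot(smooth_performances, label='Smoothed performance')
plt.legend()
plt.show()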