Example #1
def policy_gradient_train(nodes):
    env = gym.make('Mario-Kart-Royal-Raceway-v0')
    with tf.Session() as sess:
        saver = tf.train.Saver()
        if conf.resume_training:
            saver.restore(sess, conf.save_dir + conf.save_name)
        else:
            sess.run(tf.global_variables_initializer())
        state = env.reset()
        states, actions, rewards = [], [], []
        for episode in range(conf.max_episodes):
            reward = 0
            keep_training = True
            previous_state = np.zeros((conf.img_h, conf.img_w, conf.img_d))
            while keep_training:
                state = utils.resize_img(state)
                # Feed in the difference in states
                state_inp = state - previous_state
                previous_state = state
                out = sess.run(nodes["out"], feed_dict={nodes["state_inp"]: state_inp})
                states.append(state_inp)
                actions.append(out)
                state, r, end_episode, _ = env.step(out)
                reward += r
                rewards.append(r)  # store the per-step reward rather than the running total
                if end_episode:
                    keep_training = False
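
The excerpt ends as soon as the episode terminates, before the collected states, actions and rewards are used. Below is a minimal sketch of the REINFORCE-style update that could follow, assuming per-step rewards in rewards and hypothetical graph nodes (nodes["optim"], nodes["action_inp"], nodes["reward_inp"]) plus a discount factor conf.gamma, none of which appear in the snippet:

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """Reward-to-go for each time step, normalized for lower-variance gradients."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    returns -= returns.mean()
    returns /= (returns.std() + 1e-8)
    return returns

# Hypothetical end-of-episode update; these node names are assumptions, not the repo's API.
# returns = discounted_returns(rewards, gamma=conf.gamma)
# sess.run(nodes["optim"], feed_dict={nodes["state_inp"]: np.array(states),
#                                     nodes["action_inp"]: np.array(actions),
#                                     nodes["reward_inp"]: returns})
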
Example #2
def run_episode():  # hypothetical wrapper name; the original snippet omits its enclosing function
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')
    state = env.reset()
    # state = resize_img(state)
    # state = utils.resize_img(state)
    env.render()
    print('env ready!')
    with tf.Session() as s:
        actor = Actor(s)
        print('actor ready!')
        print('beginning episode loop')
        total_reward = 0
        end_episode = False
        first = True
        while not end_episode:
            if first:
                state = resize_img(state)
                state = np.dstack((state, state, state, state))
                first = False
            action = actor.get_action(state)
            obs, reward, end_episode, info = env.step(action)
            obs = resize_img(obs)
            state = np.dstack((obs, obs, obs, obs))
            # state[:, :, :3] = obs
            env.render()
            total_reward += reward
        print('end episode... total reward: ' + str(total_reward))
        state = env.reset()
        print('env ready!')
        input('press <ENTER> to quit')
        env.close()
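
This example rebuilds the whole stack from the single newest frame (np.dstack((obs, obs, obs, obs))), while the commented-out line suggests a rolling window of the last four frames, which is what Examples 3 and 4 do along the channel axis. A minimal sketch of that rolling update, assuming HxWx3 frames and an HxWx12 stack:

import numpy as np

def push_frame(stack, frame):
    """Put the newest HxWx3 frame in the first three channels and shift the
    three older frames back, dropping the oldest one."""
    new_stack = np.zeros_like(stack)
    new_stack[:, :, :3] = frame
    new_stack[:, :, 3:] = stack[:, :, :9]
    return new_stack

# Inside the loop, state = push_frame(state, obs) would keep the older frames
# instead of discarding them every step.
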
Example #3
def deep_q_train(nodes):
    """
    Main training loop to train the graph
    :param nodes: Nodes for the graph so that the Tensorflow network can run
    :type nodes: dict{str: tf.Tensors}
    :return:
    """
    print("\nReinforcement Learning\n")
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')
    with tf.Session() as sess:
        saver = tf.train.Saver()
        # Initialize all variables such as Q inside network
        if conf.resume_training:
            if conf.first_reinforcement:
                saver.restore(sess, conf.save_dir + conf.save_name_supervised)
            else:
                saver.restore(sess,
                              conf.save_dir + conf.save_name_reinforcement)
            if os.path.isfile('./pickles/epsilon.p'):
                epsilon = pkl.load(open('./pickles/epsilon.p', 'rb'))
            else:
                epsilon = conf.initial_epsilon
            if os.path.isfile('./pickles/memory.p'):
                memory = pkl.load(open('./pickles/memory.p', 'rb'))
            else:
                memory = deque(maxlen=conf.replay_memory)
        else:
            sess.run(tf.global_variables_initializer())
            epsilon = conf.initial_epsilon
            memory = deque(maxlen=conf.replay_memory)
        train_writer = tf.summary.FileWriter(os.path.join(conf.sum_dir, 'train'),
                                             sess.graph)
        # Initialize memory to some capacity save_name_supervised
        for episode in range(1, conf.max_episodes):
            state = env.reset()
            state = utils.resize_img(state)
            state = np.dstack((state, state, state, state))
            if len(state.shape) < 4:
                state = np.expand_dims(state, axis=0)
            time_step = 0
            end_episode = False
            while not end_episode:
                # Grab actions from first state
                action = np.zeros([conf.OUTPUT_SIZE])
                # state = np.expand_dims(state, axis=0)
                out_t = sess.run(nodes["out"],
                                 feed_dict={nodes["state_inp"]: state})
                out_t = out_t[0]
                # Perform random explore action or else grab maximum output
                if random.random() <= epsilon:
                    cprint("[INFO]: Random Action", 'white')
                    action[0] = out_t[0]
                    action[1] = out_t[1]
                    action[2] = out_t[2]
                    action[3] = np.random.uniform()
                    action[4] = out_t[4]
                else:
                    action = out_t
                # Randomness factor
                if epsilon > conf.final_epsilon:
                    epsilon *= conf.epsilon_decay
                if time_step < 500:
                    action[2] = 1
                # Observe next reward from action
                action_input = [
                    int(action[0] * 80),
                    int(action[1] * 80),
                    int(round(action[2])),
                    int(round(action[3])),
                    int(round(action[4])),
                ]
                print(action_input)
                obs, reward, end_episode, _ = env.step(action_input)
                # Finish rest of the pipeline for this time step, but proceed to the next episode after
                obs = utils.resize_img(obs)
                env.render()
                new_state = np.zeros(state.shape)
                new_state[:, :, :, :3] = obs
                new_state[:, :, :, 3:] = state[:, :, :, :9]

                # Add to memory
                memory.append((state, action, reward, new_state))
                if time_step > conf.start_memory_sample:
                    cprint("MEMORY REPLAY", 'red')
                    batch = random.sample(memory, conf.batch_size)
                    mem_state = [mem[0] for mem in batch]
                    mem_action = [mem[1] for mem in batch]
                    mem_reward = [mem[2] for mem in batch]
                    mem_next_state = [mem[3] for mem in batch]

                    # Ensure that the inputs are 4 dimensional, since they are stored as 1xHxWxD from screen grabs
                    mem_inp = np.squeeze(mem_state, axis=1)
                    mem_next_state = np.squeeze(mem_next_state, axis=1)
                    mem_out = sess.run(
                        nodes["out"],
                        feed_dict={nodes["state_inp"]: mem_next_state})
                    # Allow broadcasting of vectors and arrays
                    yj = np.reshape(
                        mem_reward,
                        (conf.batch_size, 1)) + (conf.learning_rate * mem_out)
                    # for i in range(0, len(batch)):
                    #     yj.append(mem_reward[i] + conf.learning_rate*np.max(mem_out[i]))

                    # Perform gradient descent on the loss function with respect to the yj and predicted output
                    _ = sess.run(nodes["optim_r"],
                                 feed_dict={
                                     nodes["yj"]: yj,
                                     nodes["action_inp"]: mem_action,
                                     nodes["state_inp"]: mem_inp
                                 })
                state = new_state
                if len(state.shape) < 4:  # Ensure that it is 4 dimensional
                    state = np.expand_dims(state, axis=0)
                time_step += 1
                if time_step % conf.save_freq == 0:
                    saver.save(sess,
                               conf.save_dir + conf.save_name_reinforcement)
                if time_step % 100 == 0:
                    print("Episode: %d, Time Step: %d, Reward: %d" %
                          (episode, time_step, reward))
        train_writer.close()
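
The target yj above adds the discounted Q-vector of the next state to the reward, whereas the commented-out lines (and the standard deep Q-learning update) use only the maximum Q-value of the next state. A minimal sketch of that target, assuming conf.learning_rate plays the role of the discount factor exactly as it does in the code above:

import numpy as np

def q_targets(rewards, next_q, discount):
    """Standard DQN target: r + discount * max_a Q(s', a), one scalar per sample."""
    rewards = np.asarray(rewards, dtype=np.float32)
    return rewards + discount * np.max(next_q, axis=1)

# yj = q_targets(mem_reward, mem_out, conf.learning_rate).reshape(conf.batch_size, 1)
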
Example #4
def deep_q_train(model):
    """
    Main training loop to train the graph
    :param model: Model that maps a stacked input state to Q-value outputs
    :type model: callable
    :return:
    """
    print("\nReinforcement Learning\n")
    env = gym.make('Mario-Kart-Royal-Raceway-v0')

    # Initialize memory to some capacity
    memory = deque(maxlen=conf.replay_memory)
    epsilon = conf.initial_epsilon

    for episode in range(1, conf.max_episodes):
        end_episode = False
        # Will replace with samples from the initial game screen
        # Want to send in 4 screens at a time to process, so stack along depth of image
        input_tensor = env.reset()
        input_tensor = utils.resize_img(input_tensor)
        inp = np.dstack((input_tensor, input_tensor, input_tensor, input_tensor))
        time_step = 0
        while not end_episode:
            # Grab actions from first state
            action_input = np.zeros([conf.OUTPUT_SIZE])
            state = np.expand_dims(inp, axis=0)
            output = model(state)
            # Perform random explore action or else grab maximum output
            if random.random() <= epsilon:
                act_indx = random.randrange(conf.OUTPUT_SIZE)
            else:
                act_indx = int(output.data.cpu().numpy().argmax())
            action_input[act_indx] = 1

            # Randomness factor
            if epsilon > conf.final_epsilon:
                epsilon *= conf.epsilon_decay

            # Observe next reward from action
            observation, reward, end_episode, info = env.step(action_input)
            # Finish rest of the pipeline for this time step, but proceed to the next episode after
            obs = utils.resize_img(observation)
            env.render()
            obs = np.expand_dims(obs, axis=0)
            new_state = np.zeros(state.shape)
            new_state[:, :, :, :3] = obs
            new_state[:, :, :, 3:] = state[:, :, :, :9]
            # Add to memory
            memory.append((state, action_input, reward, new_state))

            
            if time_step > conf.start_memory_sample:
                batch = random.sample(memory, conf.batch_size)
                mem_state = [mem[0] for mem in batch]
                mem_action = [mem[1] for mem in batch]
                mem_reward = [mem[2] for mem in batch]
                mem_next_state = [mem[3] for mem in batch]

                yj = []
                mem_out = sess.run(nodes["out"], feed_dict={nodes["state_inp"]: mem_next_state})
                for i in range(0, len(batch)):
                    yj.append(mem_reward[i] + conf.learning_rate*np.max(mem_out[i]))

                # Perform gradient descent on the loss function with respect to the yj and predicted output
                _ = sess.run(nodes["optim_r"], feed_dict={nodes["yj"]: yj,
                                                          nodes["action_inp"]: mem_action,
                                                          nodes["state_inp"]: mem_state})
            state = new_state
            time_step += 1
            if time_step % conf.save_freq == 0:
                saver.save(sess, conf.save_dir + conf.save_name, global_step=time_step)
            if time_step % 100 == 0:
                print("Episode: %d, Time Step: %d, Reward: %d" % (episode, time_step, reward))
Example #5
def resize_and_crop(img, min_size):
    img = resize_img(img, min_size=min_size)
    img = tf.image.random_crop(img, size=(image_size, image_size, 3))
    img = tf.cast(img, tf.bfloat16)
    return img
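
A short usage sketch: a helper like this is normally mapped over a tf.data pipeline of decoded images. The dataset below is a toy placeholder, and image_size is assumed to be defined next to resize_and_crop, as in the snippet above:

import tensorflow as tf

# Toy batch of decoded images; in practice these would come from image files.
images = tf.random.uniform((8, 300, 400, 3), maxval=255.0, dtype=tf.float32)
dataset = tf.data.Dataset.from_tensor_slices(images)
dataset = dataset.map(lambda img: resize_and_crop(img, min_size=image_size),
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(4).prefetch(tf.data.experimental.AUTOTUNE)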