Example No. 1
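# Policy-gradient (REINFORCE-style) agent for the pizza-slicing game: it builds
# a rollout subgraph, an experience-replay dataset and a training subgraph, then
# alternates rollout and training phases. The snippet assumes module-level
# imports (tensorflow as tf, numpy as np, os, random) and globals such as
# OBSERVATION_DIM, ACTIONS, MEMORY, MEMORY_CAPACITY, ROLLOUT_SIZE, R, C,
# build_graph, gen, preprocess, discount_rewards and Game.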
def main(args):
    args_dict = vars(args)
    print('args: {}'.format(args_dict))

    with tf.Graph().as_default() as g:
        # rollout subgraph
        with tf.name_scope('rollout'):
            observations = tf.placeholder(shape=(None, OBSERVATION_DIM),
                                          dtype=tf.float32)

            logits = build_graph(observations)

            logits_for_sampling = tf.reshape(logits, shape=(1, len(ACTIONS)))

            # Sample the action to be played during rollout.
            sample_action = tf.squeeze(
                tf.multinomial(logits=logits_for_sampling, num_samples=1))

        optimizer = tf.train.RMSPropOptimizer(learning_rate=args.learning_rate,
                                              decay=args.decay)

        # dataset subgraph for experience replay
        with tf.name_scope('dataset'):
            # the dataset reads from MEMORY
            ds = tf.data.Dataset.from_generator(gen,
                                                output_types=(tf.float32,
                                                              tf.int32,
                                                              tf.float32))
            ds = ds.shuffle(MEMORY_CAPACITY).repeat().batch(args.batch_size)
            iterator = ds.make_one_shot_iterator()

        # training subgraph
        with tf.name_scope('train'):
            # The train_op pulls its batch from the dataset iterator itself,
            # so no feed_dict is needed when running it.
            next_batch = iterator.get_next()
            train_observations, labels, processed_rewards = next_batch

            # This reuses the same weights as in the rollout phase.
            train_observations.set_shape((args.batch_size, OBSERVATION_DIM))
            train_logits = build_graph(train_observations)

            cross_entropies = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_logits, labels=labels)

            # An optional extra loss when the paddle is moved (disabled below)
            # was meant to encourage more natural moves.
            probs = tf.nn.softmax(logits=train_logits)
            #move_cost = args.laziness * tf.reduce_sum(probs * [0, 1.0, 1.0], axis=1)

            loss = tf.reduce_sum(processed_rewards * cross_entropies)  # + move_cost

            global_step = tf.train.get_or_create_global_step()

            train_op = optimizer.minimize(loss, global_step=global_step)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver(max_to_keep=args.max_to_keep)

        with tf.name_scope('summaries'):
            rollout_reward = tf.placeholder(shape=(), dtype=tf.float32)

            # The weights into the hidden layer can be visualized as R x C images.
            hidden_weights = tf.trainable_variables()[0]
            for h in range(args.hidden_dim):
                slice_ = tf.slice(hidden_weights, [0, h], [R * C, 1])
                image = tf.reshape(slice_, [1, R, C, 1])
                tf.summary.image('hidden_{:04d}'.format(h), image)

            for var in tf.trainable_variables():
                tf.summary.histogram(var.op.name, var)
                tf.summary.scalar('{}_max'.format(var.op.name),
                                  tf.reduce_max(var))
                tf.summary.scalar('{}_min'.format(var.op.name),
                                  tf.reduce_min(var))

            tf.summary.scalar('rollout_reward', rollout_reward)
            tf.summary.scalar('loss', loss)

            merged = tf.summary.merge_all()

        print('Number of trainable variables: {}'.format(
            len(tf.trainable_variables())))

    # Original Pong environment setup (replaced here by the Game class):
    #inner_env = gym.make('Pong-v0')
    # tf.agents helper to more easily track consecutive pairs of frames
    #env = FrameHistory(inner_env, past_indices=[0, 1], flatten=False)
    # tf.agents helper to automatically reset the environment
    #env = AutoReset(env)

    with tf.Session(graph=g) as sess:
        if args.restore:
            restore_path = tf.train.latest_checkpoint(args.output_dir)
            print('Restoring from {}'.format(restore_path))
            saver.restore(sess, restore_path)
        else:
            sess.run(init)

        summary_path = os.path.join(args.output_dir, 'summary')
        summary_writer = tf.summary.FileWriter(summary_path, sess.graph)

        # Start the running reward at the lowest possible episode score
        # (-21.0 in the original Pong version, -1.0 for this game).
        _rollout_reward = -1.0

        for i in range(args.n_epoch):
            print('>>>>>>> epoch {}'.format(i + 1))

            print('>>> Rollout phase')
            epoch_memory = []
            episode_memory = []

            game = Game({'max_steps': 10000})  # initialize game from game.py
            h = random.randint(1, R * C + 1)
            l = random.randint(1, h // 2 + 1)
            # Random pizza setup before the action/step loop below.
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            # game.init() returns a tuple; the first element is the state dict.
            _observation = preprocess(game.init(pizza_config)[0])
            while True:
                # sample one action with the given probability distribution
                _label = sess.run(sample_action,
                                  feed_dict={observations: [_observation]})

                _action = ACTIONS[_label]

                _state, _reward, _done, _ = game.step(_action)

                if args.render:
                    game.render()

                # record experience
                episode_memory.append((_observation, _label, _reward))

                # Frame-delta logic from the Pong version (no longer used):
                #pair_state = _pair_state

                #current_state, previous_state = pair_state
                #current_x = prepro(current_state)
                #previous_x = prepro(previous_state)

                _observation = preprocess(_state)

                if _done:
                    obs, lbl, rwd = zip(*episode_memory)

                    # discount and normalize the rewards
                    prwd = discount_rewards(rwd, args.gamma)
                    prwd -= np.mean(prwd)
                    # a small epsilon avoids division by zero when the rewards have zero variance
                    prwd /= (np.std(prwd) + 1e-8)

                    # store the processed experience to memory
                    epoch_memory.extend(zip(obs, lbl, prwd))

                    # calculate the running rollout reward
                    _rollout_reward = 0.9 * _rollout_reward + 0.1 * sum(rwd)

                    episode_memory = []

                    #if args.render:
                    #	_ = input('episode done, press Enter to replay')
                    #	epoch_memory = []
                    #	continue

                    if len(epoch_memory) >= ROLLOUT_SIZE:
                        break

                    game = Game({'max_steps': 10000})  # initialize game from game.py
                    h = random.randint(1, R * C + 1)
                    l = random.randint(1, h // 2 + 1)
                    pizza_lines = [
                        ''.join([random.choice("MT") for _ in range(C)])
                        for _ in range(R)
                    ]
                    pizza_config = {
                        'pizza_lines': pizza_lines,
                        'r': R,
                        'c': C,
                        'l': l,
                        'h': h
                    }
                    # game.init() returns a tuple; the first element is the state dict.
                    _observation = preprocess(game.init(pizza_config)[0])

            # add to the global memory
            MEMORY.extend(epoch_memory)

            print('>>> Train phase')
            print('rollout reward: {}'.format(_rollout_reward))

            # Here we run a single training step per epoch.
            _, _global_step = sess.run([train_op, global_step])

            if _global_step % args.save_checkpoint_steps == 0:

                print('Writing summary')

                feed_dict = {rollout_reward: _rollout_reward}
                summary = sess.run(merged, feed_dict=feed_dict)

                summary_writer.add_summary(summary, _global_step)

                save_path = os.path.join(args.output_dir, 'model.ckpt')
                save_path = saver.save(sess,
                                       save_path,
                                       global_step=_global_step)
                print('Model checkpoint saved: {}'.format(save_path))
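Example No. 1 passes the raw episode rewards through discount_rewards(rwd, args.gamma) before normalizing them, but that helper is not shown here. A minimal NumPy sketch of a typical discounted-cumulative-sum implementation (an assumption, not the author's code) could look like this:

import numpy as np

def discount_rewards(rewards, gamma):
    # Discounted cumulative sum: each step receives its own reward plus the
    # gamma-weighted future rewards, e.g. [0, 0, 1] with gamma=0.99 becomes
    # [0.9801, 0.99, 1.0].
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted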
Example No. 2
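# DQN agent for the same game: a convolutional Q-network, an experience-replay
# Memory and an epsilon-greedy policy with exponentially decaying epsilon.
# The snippet assumes module-level imports (tensorflow as tf, numpy as np, os,
# random, collections.deque) and globals such as R, C, ACTIONS, stack_size,
# state_size, action_size, learning_rate, memory_size, pretrain_length,
# max_steps, batch_size, gamma, explore_start, explore_stop, decay_rate,
# total_episodes, episode_render, training and Game.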
def main(args):
    def preprocess(state_dict):
        # Build a stack of R x C feature planes from the game state dict.
        stacked_frames = [
            np.zeros((R, C), dtype=np.float32) for _ in range(stack_size)
        ]

        # Plane 0: the ingredients map.
        stacked_frames[0] = np.array(state_dict['ingredients_map'])
        # Optional mushroom plane (would require stack_size = 5):
        #stacked_frames[1] = (stacked_frames[0] - 1) * -1

        # Plane 1: one-hot cursor position.
        cursor_R, cursor_C = state_dict['cursor_position']
        stacked_frames[1] = np.zeros([R, C])
        stacked_frames[1][cursor_R, cursor_C] = 1

        # Plane 2: the cells of the slice currently under the cursor.
        slice_map = np.array(state_dict['slices_map'])
        current_slice_id = slice_map[cursor_R, cursor_C]
        current_slice = np.where(slice_map == current_slice_id)
        stacked_frames[2] = np.zeros([R, C])
        stacked_frames[2][current_slice] = 1

        # Plane 3: the cells of every other slice.
        other_slice = np.where((slice_map != current_slice_id)
                               & (slice_map != -1))
        stacked_frames[3] = np.zeros([R, C])
        stacked_frames[3][other_slice] = 1
        """
        state = np.concatenate((
        np.array(state_dict['ingredients_map']).ravel(),
        np.array(state_dict['slices_map']).ravel(),
        np.array(state_dict['cursor_position']).ravel(),
        [state_dict['min_each_ingredient_per_slice'],
        state_dict['max_ingredients_per_slice']],
        ))
        """
        stacked_state = np.stack(stacked_frames, axis=2)
        #return state.astype(np.float).ravel()
        return stacked_state

    class DQNetwork:
        def __init__(self,
                     state_size,
                     action_size,
                     learning_rate,
                     name='DQNetwork'):
            self.state_size = state_size
            self.action_size = action_size
            self.learning_rate = learning_rate

            with tf.variable_scope(name):
                # Create the placeholders.
                # *state_size unpacks the state_size tuple, so [None, *state_size]
                # expands to e.g. [None, R, C, stack_size]. The batch dimension is
                # kept as None because predict_action feeds one state at a time.
                self.inputs_ = tf.placeholder(tf.float32, [None, *state_size],
                                              name="inputs")
                self.actions_ = tf.placeholder(tf.float32,
                                               [None, self.action_size],
                                               name="actions_")

                # Remember that target_Q is R(s, a) + gamma * max_a' Q_hat(s', a')
                self.target_Q = tf.placeholder(tf.float32, [None],
                                               name="target")
                """
                First convnet:
                CNN
                ELU
                """
                # Input is RxCx4 (from 110x84x4)
                self.conv1 = tf.layers.conv2d(
                    inputs=self.inputs_,
                    filters=32,
                    kernel_size=[4, 4],  # from [8,8]
                    strides=[1, 1],
                    padding="VALID",
                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                    name="conv1")

                self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
                """
                Second convnet:
                CNN
                ELU
                """
                """
                self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                    filters = 64,
                                    kernel_size = [3,3],  # from [4,4]
                                    strides = [2,2],
                                    padding = "VALID",
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = "conv2")

                self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")
                """
                """
                Third convnet:
                CNN
                ELU
                """
                """
                self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                    filters = 64,
                                    kernel_size = [3,3],
                                    strides = [2,2],
                                    padding = "VALID",
                                    kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                    name = "conv3")

                self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")

                self.flatten = tf.contrib.layers.flatten(self.conv3_out)
                """
                self.flatten = tf.contrib.layers.flatten(self.conv1_out)
                # TODO: one-hot encode L and H and append them after the flatten
                # layer (they are fixed values for now).
                # Alternative without the CNN: use the flattened raw input directly:
                #self.flatten = self.inputs_

                # TODO: append the extra scalar features (cursor position, L, H) here.
                self.fc = tf.layers.dense(
                    inputs=self.flatten,
                    units=512,
                    activation=tf.nn.elu,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="fc1")
                self.fc2 = tf.layers.dense(
                    inputs=self.fc,
                    units=512,
                    activation=tf.nn.elu,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    name="fc2")

                self.output = tf.layers.dense(
                    inputs=self.fc2,
                    kernel_initializer=tf.contrib.layers.xavier_initializer(),
                    units=self.action_size,
                    activation=None)

                # Q is our predicted Q value.
                self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_))

                # The loss is the mean squared difference between the predicted Q
                # value and the target Q value: mean((Q_target - Q)^2).
                self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

                self.optimizer = tf.train.AdamOptimizer(
                    self.learning_rate).minimize(self.loss)

    class Memory():
        def __init__(self, max_size):
            # Fixed-size experience replay buffer; this plays the role of
            # MEMORY in the policy-gradient example.
            self.buffer = deque(maxlen=max_size)

        def add(self, experience):
            self.buffer.append(experience)

        def sample(self, batch_size):
            # Sample a mini-batch of experiences without replacement,
            # similar to the dataset generator in Example No. 1.
            buffer_size = len(self.buffer)
            index = np.random.choice(np.arange(buffer_size),
                                     size=batch_size,
                                     replace=False)

            return [self.buffer[i] for i in index]

    def predict_action(explore_start, explore_stop, decay_rate, decay_step,
                       state, actions):
        ## EPSILON GREEDY STRATEGY
        # Choose action a from state s using an epsilon-greedy policy.
        # First draw a random number for the explore/exploit trade-off.
        exp_exp_tradeoff = np.random.rand()

        # Epsilon decays exponentially from explore_start towards explore_stop
        # as decay_step grows.
        explore_probability = explore_stop + (
            explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if (explore_probability > exp_exp_tradeoff):
            # Make a random action (exploration)
            choice = random.randint(1, len(possible_actions)) - 1
            action = possible_actions[choice]

        else:
            # Get action from Q-network (exploitation)
            # Estimate the Qs values state
            Qs = sess.run(DQNetwork.output,
                          feed_dict={
                              DQNetwork.inputs_: state.reshape(
                                  (1, *state.shape))
                          })

            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = possible_actions[choice]

        return action, explore_probability

    # Init: one-hot encoding of the possible actions.
    possible_actions = np.array(np.identity(action_size, dtype=int).tolist())
    print("The action size is:", action_size)
    print(possible_actions)

    # Reset the graph
    tf.reset_default_graph()

    # Instantiate the DQNetwork
    DQNetwork = DQNetwork(state_size, action_size, learning_rate)

    # Instantiate memory
    memory = Memory(max_size=memory_size)
    for i in range(pretrain_length):
        # If it's the first step
        if i == 0:

            game = Game({'max_steps': max_steps})  # initialize game from game.py
            h = 6  #random.randint(1, R * C + 1)
            l = 1  #random.randint(1, h // 2 + 1)
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            # Fixed pizza layout (overrides the random one above).
            pizza_lines = [
                "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                "TTTTTTM"
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            # game.init() returns a tuple; the first element is the state dict.
            _state = preprocess(game.init(pizza_config)[0])

        # Get the next_state, the rewards, done by taking a random action
        choice = random.randint(1, len(possible_actions)) - 1
        action = possible_actions[choice]  # as a one-hot vector
        # Translate the one-hot action into the game's named action (e.g. 'right').
        _action = ACTIONS[np.argmax(action)]
        # (next_state is _state inside the Game agent)
        next_state, _reward, _done, _ = game.step(_action)
        _next_state = preprocess(next_state)

        if episode_render and i % 20 == 0:  # render occasionally during pretraining
            game.render()

        # If the episode is finished (we maxed out the number of frames)
        if _done:
            # The episode is finished: use an all-zero terminal next state.
            _next_state = np.zeros(_state.shape)

            # Add experience to memory (push action one-hot encoded instead of _action label e.g.'right')
            memory.add((_state, action, _reward, _next_state, _done))

            # Start a new episode
            game = Game({'max_steps': max_steps})  # initialize game from game.py
            h = 6  #random.randint(1, R * C + 1)
            l = 1  #random.randint(1, h // 2 + 1)
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            # Fixed pizza layout (overrides the random one above).
            pizza_lines = [
                "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                "TTTTTTM"
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            # game.init() returns a tuple; the first element is the state dict.
            _state = preprocess(game.init(pizza_config)[0])

        else:
            # Add experience to memory (push action one-hot encoded instead of _action label e.g.'right')
            memory.add((_state, action, _reward, _next_state, _done))

            # Our new state is now the next_state
            _state = _next_state

    # Set up the TensorBoard writer.
    #summary_path = os.path.join('gs://pizza-game/', './summary')
    summary_path = os.path.join(args.output_dir, 'summary')
    writer = tf.summary.FileWriter(summary_path)
    #writer = tf.summary.FileWriter("./tensorboard/dqn/1")

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
        tf.summary.scalar('{}_max'.format(var.op.name), tf.reduce_max(var))
        tf.summary.scalar('{}_min'.format(var.op.name), tf.reduce_min(var))

    #tf.summary.scalar('rollout_reward', rollout_reward)
    tf.summary.scalar('loss', DQNetwork.loss)
    _total_reward = tf.placeholder(tf.float32, (), name="tot_reward")
    tf.summary.scalar('reward', _total_reward)

    write_op = tf.summary.merge_all()

    # Saver will help us to save our model
    saver = tf.train.Saver()

    if training:
        with tf.Session() as sess:
            # Initialize the variables
            sess.run(tf.global_variables_initializer())

            # Initialize the decay step (used to decay epsilon over time)
            decay_step = 0
            rewards_list = []
            average_reward = []
            average_reward_scalar = 0
            for episode in range(total_episodes):
                # Set step to 0
                step = 0

                # Initialize the rewards of the episode
                episode_rewards = []
                episode_actions = []
                # Make a new episode and observe the first state
                # Start a new episode
                game = Game({'max_steps': max_steps})  # initialize game from game.py
                h = 6  #random.randint(1, R * C + 1)
                l = 1  #random.randint(1, h // 2 + 1)
                pizza_lines = [
                    ''.join([random.choice("MT") for _ in range(C)])
                    for _ in range(R)
                ]
                # Fixed pizza layout (overrides the random one above).
                pizza_lines = [
                    "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                    "TTTTTTM"
                ]
                pizza_config = {
                    'pizza_lines': pizza_lines,
                    'r': R,
                    'c': C,
                    'l': l,
                    'h': h
                }
                _state = preprocess(game.init(pizza_config)[0])

                while step < max_steps:
                    step += 1

                    #Increase decay_step
                    decay_step += 1

                    # Predict the action to take and take it
                    action, explore_probability = predict_action(
                        explore_start, explore_stop, decay_rate, decay_step,
                        _state, possible_actions)
                    # action is one-hot; translate it into the game's named action.
                    _action = ACTIONS[np.argmax(action)]

                    #Perform the action and get the next_state, reward, and done information
                    # (next_state is _state inside the Game agent)
                    next_state, _reward, _done, _ = game.step(_action)
                    _next_state = preprocess(next_state)

                    # Add the reward to total reward
                    episode_rewards.append(_reward)
                    episode_actions.append(_action)

                    # If the game is finished
                    if _done:
                        # The episode ends, so there is no next state: use all zeros.
                        _next_state = np.zeros(_state.shape)

                        # Set step = max_steps to end the episode
                        step = max_steps

                        # Get the total reward of the episode
                        total_reward = np.sum(episode_rewards)
                        average_reward.append(total_reward)
                        if (episode % 100 == 0
                                and episode < 500) or (episode % 1000 == 0):
                            print(
                                'Episode: {}'.format(episode),
                                'Total reward: {}'.format(total_reward),
                                'Explore P: {:.4f}'.format(
                                    explore_probability),
                                'Training Loss {:.4f}'.format(loss))
                            print(episode_actions)
                            print(episode_rewards)
                            # Average the reward accumulated since the last printed episode.
                            if (episode < 500):
                                if (episode == 0):
                                    average_reward_scalar = np.sum(
                                        average_reward)
                                else:
                                    average_reward_scalar = np.sum(
                                        average_reward) / 100
                            else:
                                if (episode == 1000):
                                    average_reward_scalar = np.sum(
                                        average_reward) / 600
                                else:
                                    average_reward_scalar = np.sum(
                                        average_reward) / 1000
                        rewards_list.append((episode, total_reward))

                        # Store transition <st,at,rt+1,st+1> in memory D
                        memory.add(
                            (_state, action, _reward, _next_state, _done))

                        if episode_render and (
                            (episode % 100 == 0 and episode < 500) or
                            (episode % 1000 == 0)):
                            game.render()
                    else:
                        # Add experience to memory
                        memory.add(
                            (_state, action, _reward, _next_state, _done))

                        # st+1 is now our current state
                        _state = _next_state

                    ### LEARNING PART
                    # Obtain random mini-batch from memory
                    batch = memory.sample(batch_size)
                    # Stack the sampled transitions into batch arrays.
                    states_mb = np.array([each[0] for each in batch],
                                         ndmin=3)  # consider modifying ndmin
                    #states_mb = np.squeeze(states_mb, axis=0)
                    actions_mb = np.array([each[1] for each in batch])

                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch],
                                              ndmin=3)
                    #next_states_mb = np.squeeze(next_states_mb, axis=0)

                    dones_mb = np.array([each[4] for each in batch])
                    target_Qs_batch = []
                    target_Qs_batch = []

                    # Get the Q values for the next states in the batch.
                    Qs_next_state = sess.run(
                        DQNetwork.output,
                        feed_dict={DQNetwork.inputs_: next_states_mb})

                    # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                    for i in range(0, len(batch)):
                        terminal = dones_mb[i]

                        # If we are in a terminal state, only equals reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])

                        else:
                            target = rewards_mb[i] + gamma * np.max(
                                Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array(target_Qs_batch)
                    loss, _ = sess.run(
                        [DQNetwork.loss, DQNetwork.optimizer],
                        feed_dict={
                            DQNetwork.inputs_: states_mb,
                            DQNetwork.target_Q: targets_mb,
                            DQNetwork.actions_: actions_mb
                        })

                # Save the model and write summaries every 100 episodes early on,
                # then every 1000 episodes.
                if (episode % 100 == 0 and episode < 500) or (episode % 1000 == 0):
                    save_os_path = os.path.join(args.output_dir,
                                                'models/model.ckpt')
                    save_path = saver.save(sess, save_os_path)
                    print("Model Saved")
                    # Write TF Summaries
                    summary = sess.run(write_op,
                                       feed_dict={
                                           DQNetwork.inputs_: states_mb,
                                           DQNetwork.target_Q: targets_mb,
                                           DQNetwork.actions_: actions_mb,
                                           _total_reward: average_reward[0]
                                       })
                    average_reward = []
                    #_total_reward = tf.placeholder(tf.float32, (), name="tot_reward")
                    writer.add_summary(summary, episode)
                    writer.flush()

    with tf.Session() as sess:
        total_test_rewards = []

        # Load the model
        restore_os_path = os.path.join(args.restore_dir, 'models/model.ckpt')
        saver.restore(sess, restore_os_path)

        for episode in range(1):
            total_rewards = 0

            game = Game({'max_steps': max_steps})  # initialize game from game.py
            h = 6  #random.randint(1, R * C + 1)
            l = 1  #random.randint(1, h // 2 + 1)
            pizza_lines = [
                ''.join([random.choice("MT") for _ in range(C)])
                for _ in range(R)
            ]
            # Fixed pizza layout (overrides the random one above).
            pizza_lines = [
                "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM",
                "TTTTTTM"
            ]
            pizza_config = {
                'pizza_lines': pizza_lines,
                'r': R,
                'c': C,
                'l': l,
                'h': h
            }
            # game.init() returns a tuple; the first element is the state dict.
            _state = preprocess(game.init(pizza_config)[0])

            print("****************************************************")
            print("EPISODE ", episode)

            while True:
                _state = _state.reshape((1, *state_size))
                # Get action from Q-network
                # Estimate the Qs values state
                Qs = sess.run(DQNetwork.output,
                              feed_dict={DQNetwork.inputs_: _state})

                # Take the biggest Q value (= the best action)
                choice = np.argmax(Qs)
                action = possible_actions[choice]  # as a one-hot vector
                # Translate the one-hot action into the game's named action.
                _action = ACTIONS[np.argmax(action)]
                print(_action)

                #Perform the action and get the next_state, reward, and done information
                # (next_state is _state inside the Game agent)
                next_state, _reward, _done, _ = game.step(_action)
                _next_state = preprocess(next_state)
                game.render()

                total_rewards += _reward

                if _done:
                    print("Score", total_rewards)
                    total_test_rewards.append(total_rewards)
                    break

                _state = _next_state
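The learning part above builds the Bellman targets element by element: the raw reward r for terminal transitions, and r + gamma * max_a' Q(s', a') otherwise, before regressing the predicted Q towards them. A tiny standalone NumPy illustration of that target construction (made-up numbers; the gamma value and the 5-action shape are assumptions) gives the same result as the per-element loop:

import numpy as np

gamma = 0.95
rewards_mb = np.array([1.0, 0.0])
dones_mb = np.array([True, False])
Qs_next_state = np.array([[0.1, 0.4, 0.2, 0.0, 0.3],
                          [0.5, 0.9, 0.1, 0.2, 0.0]])

# Terminal transitions keep the raw reward; the others add the discounted
# best next-state Q value.
targets_mb = np.where(dones_mb,
                      rewards_mb,
                      rewards_mb + gamma * Qs_next_state.max(axis=1))
print(targets_mb)  # -> [1.    0.855]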
Example No. 3
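# Fragment: the Memory.sample method and the replay-buffer pre-fill setup from
# Example No. 2, shown at module level; it relies on the same globals
# (memory_size, pretrain_length, Game, max_steps, R, C).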
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size),
                                 size=batch_size,
                                 replace=False)

        return [self.buffer[i] for i in index]  # similar to the dataset generator in Example No. 1


# Instantiate memory
memory = Memory(max_size=memory_size)
for i in range(pretrain_length):
    # If it's the first step
    if i == 0:

        game = Game({'max_steps': max_steps})  # initialize game from game.py
        h = 6  #random.randint(1, R * C + 1)
        l = 1  #random.randint(1, h // 2 + 1)
        pizza_lines = [
            ''.join([random.choice("MT") for _ in range(C)]) for _ in range(R)
        ]
        # Fixed pizza layout (overrides the random one above).
        pizza_lines = [
            "TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM", "TTTTTTM"
        ]
        pizza_config = {
            'pizza_lines': pizza_lines,
            'r': R,
            'c': C,
            'l': l,
            'h': h
        }
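For reference, the replay buffer shown in this fragment can be exercised on its own. Below is a minimal standalone sketch (mirroring the Memory class above, with dummy transitions rather than real game states):

import numpy as np
from collections import deque


class Memory:
    # Minimal mirror of the Memory class used in the examples above.
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        index = np.random.choice(np.arange(len(self.buffer)),
                                 size=batch_size,
                                 replace=False)
        return [self.buffer[i] for i in index]


memory = Memory(max_size=100)
for step in range(10):
    # (state, action, reward, next_state, done) with dummy values
    memory.add((np.zeros((2, 2)), step % 5, 0.0, np.zeros((2, 2)), False))

batch = memory.sample(batch_size=4)
print(len(batch))  # -> 4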